xref: /openbmc/qemu/hw/i386/intel_iommu.c (revision 9c4888c9)
1 /*
2  * QEMU emulation of an Intel IOMMU (VT-d)
3  *   (DMA Remapping device)
4  *
5  * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
6  * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12 
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17 
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "qemu/error-report.h"
24 #include "qemu/main-loop.h"
25 #include "qapi/error.h"
26 #include "hw/sysbus.h"
27 #include "intel_iommu_internal.h"
28 #include "hw/pci/pci.h"
29 #include "hw/pci/pci_bus.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/i386/pc.h"
32 #include "hw/i386/apic-msidef.h"
33 #include "hw/i386/x86-iommu.h"
34 #include "hw/pci-host/q35.h"
35 #include "sysemu/kvm.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/sysemu.h"
38 #include "hw/i386/apic_internal.h"
39 #include "kvm/kvm_i386.h"
40 #include "migration/vmstate.h"
41 #include "trace.h"
42 
/*
 * Context-entry accessors: extract the RID2PASID field from val[1] and the
 * PASID directory table base from val[0] of a context entry.
 */
#define VTD_CE_GET_RID2PASID(ce) \
    ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
    ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)

/*
 * PASID-entry (pe) accessors.
 *
 * VTD_PE_GET_TYPE yields the masked PGTT field of val[0]; VTD_PE_GET_LEVEL
 * yields 2 plus the AW field found at bit 2 of val[0].
 *
 * VTD_PE_GET_FPD_ERR is a statement macro with hidden control flow: when
 * @ret_fr is non-zero it negates @ret_fr in place, then either only traces
 * (if @is_fpd_set and the fault is a qualified one) or reports a DMAR fault,
 * and finally jumps to a label named "error" that the expanding function
 * must provide.  It does nothing when @ret_fr is zero.
 */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
    if (ret_fr) {                                                             \
        ret_fr = -ret_fr;                                                     \
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
            trace_vtd_fault_disabled();                                       \
        } else {                                                              \
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
        }                                                                     \
        goto error;                                                           \
    }                                                                         \
}
63 
/* Forward declarations of file-local (static) helpers. */
static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
66 
/*
 * Print the "caching-mode=on is required" error and terminate the process
 * with exit status 1.  Never returns.
 */
static void vtd_panic_require_caching_mode(void)
{
    error_report(
        "We need to set caching-mode=on for intel-iommu to enable "
        "device assignment with IOMMU protection.");

    exit(1);
}
73 
74 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
75                             uint64_t wmask, uint64_t w1cmask)
76 {
77     stq_le_p(&s->csr[addr], val);
78     stq_le_p(&s->wmask[addr], wmask);
79     stq_le_p(&s->w1cmask[addr], w1cmask);
80 }
81 
82 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
83 {
84     stq_le_p(&s->womask[addr], mask);
85 }
86 
87 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
88                             uint32_t wmask, uint32_t w1cmask)
89 {
90     stl_le_p(&s->csr[addr], val);
91     stl_le_p(&s->wmask[addr], wmask);
92     stl_le_p(&s->w1cmask[addr], w1cmask);
93 }
94 
95 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
96 {
97     stl_le_p(&s->womask[addr], mask);
98 }
99 
100 /* "External" get/set operations */
101 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
102 {
103     uint64_t oldval = ldq_le_p(&s->csr[addr]);
104     uint64_t wmask = ldq_le_p(&s->wmask[addr]);
105     uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
106     stq_le_p(&s->csr[addr],
107              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
108 }
109 
110 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
111 {
112     uint32_t oldval = ldl_le_p(&s->csr[addr]);
113     uint32_t wmask = ldl_le_p(&s->wmask[addr]);
114     uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
115     stl_le_p(&s->csr[addr],
116              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
117 }
118 
119 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
120 {
121     uint64_t val = ldq_le_p(&s->csr[addr]);
122     uint64_t womask = ldq_le_p(&s->womask[addr]);
123     return val & ~womask;
124 }
125 
126 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
127 {
128     uint32_t val = ldl_le_p(&s->csr[addr]);
129     uint32_t womask = ldl_le_p(&s->womask[addr]);
130     return val & ~womask;
131 }
132 
133 /* "Internal" get/set operations */
134 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
135 {
136     return ldq_le_p(&s->csr[addr]);
137 }
138 
139 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
140 {
141     return ldl_le_p(&s->csr[addr]);
142 }
143 
144 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
145 {
146     stq_le_p(&s->csr[addr], val);
147 }
148 
149 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
150                                         uint32_t clear, uint32_t mask)
151 {
152     uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
153     stl_le_p(&s->csr[addr], new_val);
154     return new_val;
155 }
156 
157 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
158                                         uint64_t clear, uint64_t mask)
159 {
160     uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
161     stq_le_p(&s->csr[addr], new_val);
162     return new_val;
163 }
164 
165 static inline void vtd_iommu_lock(IntelIOMMUState *s)
166 {
167     qemu_mutex_lock(&s->iommu_lock);
168 }
169 
170 static inline void vtd_iommu_unlock(IntelIOMMUState *s)
171 {
172     qemu_mutex_unlock(&s->iommu_lock);
173 }
174 
175 static void vtd_update_scalable_state(IntelIOMMUState *s)
176 {
177     uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
178 
179     if (s->scalable_mode) {
180         s->root_scalable = val & VTD_RTADDR_SMT;
181     }
182 }
183 
184 /* Whether the address space needs to notify new mappings */
185 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
186 {
187     return as->notifier_flags & IOMMU_NOTIFIER_MAP;
188 }
189 
190 /* GHashTable functions */
191 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
192 {
193     return *((const uint64_t *)v1) == *((const uint64_t *)v2);
194 }
195 
196 static guint vtd_uint64_hash(gconstpointer v)
197 {
198     return (guint)*(const uint64_t *)v;
199 }
200 
201 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
202                                           gpointer user_data)
203 {
204     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
205     uint16_t domain_id = *(uint16_t *)user_data;
206     return entry->domain_id == domain_id;
207 }
208 
209 /* The shift of an addr for a certain level of paging structure */
210 static inline uint32_t vtd_slpt_level_shift(uint32_t level)
211 {
212     assert(level != 0);
213     return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
214 }
215 
/* Mask that keeps the page-frame bits of an address mapped at @level. */
static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
{
    uint64_t page_size = 1ULL << vtd_slpt_level_shift(level);

    return ~(page_size - 1);
}
220 
221 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
222                                         gpointer user_data)
223 {
224     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
225     VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
226     uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
227     uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
228     return (entry->domain_id == info->domain_id) &&
229             (((entry->gfn & info->mask) == gfn) ||
230              (entry->gfn == gfn_tlb));
231 }
232 
233 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
234  * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
235  */
236 static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
237 {
238     VTDAddressSpace *vtd_as;
239     VTDBus *vtd_bus;
240     GHashTableIter bus_it;
241     uint32_t devfn_it;
242 
243     trace_vtd_context_cache_reset();
244 
245     g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
246 
247     while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
248         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
249             vtd_as = vtd_bus->dev_as[devfn_it];
250             if (!vtd_as) {
251                 continue;
252             }
253             vtd_as->context_cache_entry.context_cache_gen = 0;
254         }
255     }
256     s->context_cache_gen = 1;
257 }
258 
259 /* Must be called with IOMMU lock held. */
260 static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
261 {
262     assert(s->iotlb);
263     g_hash_table_remove_all(s->iotlb);
264 }
265 
266 static void vtd_reset_iotlb(IntelIOMMUState *s)
267 {
268     vtd_iommu_lock(s);
269     vtd_reset_iotlb_locked(s);
270     vtd_iommu_unlock(s);
271 }
272 
273 static void vtd_reset_caches(IntelIOMMUState *s)
274 {
275     vtd_iommu_lock(s);
276     vtd_reset_iotlb_locked(s);
277     vtd_reset_context_cache_locked(s);
278     vtd_iommu_unlock(s);
279 }
280 
281 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
282                                   uint32_t level)
283 {
284     return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
285            ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
286 }
287 
288 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
289 {
290     return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
291 }
292 
293 /* Must be called with IOMMU lock held */
294 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
295                                        hwaddr addr)
296 {
297     VTDIOTLBEntry *entry;
298     uint64_t key;
299     int level;
300 
301     for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
302         key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
303                                 source_id, level);
304         entry = g_hash_table_lookup(s->iotlb, &key);
305         if (entry) {
306             goto out;
307         }
308     }
309 
310 out:
311     return entry;
312 }
313 
314 /* Must be with IOMMU lock held */
315 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
316                              uint16_t domain_id, hwaddr addr, uint64_t slpte,
317                              uint8_t access_flags, uint32_t level)
318 {
319     VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
320     uint64_t *key = g_malloc(sizeof(*key));
321     uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
322 
323     trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
324     if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
325         trace_vtd_iotlb_reset("iotlb exceeds size limit");
326         vtd_reset_iotlb_locked(s);
327     }
328 
329     entry->gfn = gfn;
330     entry->domain_id = domain_id;
331     entry->slpte = slpte;
332     entry->access_flags = access_flags;
333     entry->mask = vtd_slpt_level_page_mask(level);
334     *key = vtd_get_iotlb_key(gfn, source_id, level);
335     g_hash_table_replace(s->iotlb, key, entry);
336 }
337 
338 /* Given the reg addr of both the message data and address, generate an
339  * interrupt via MSI.
340  */
341 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
342                                    hwaddr mesg_data_reg)
343 {
344     MSIMessage msi;
345 
346     assert(mesg_data_reg < DMAR_REG_SIZE);
347     assert(mesg_addr_reg < DMAR_REG_SIZE);
348 
349     msi.address = vtd_get_long_raw(s, mesg_addr_reg);
350     msi.data = vtd_get_long_raw(s, mesg_data_reg);
351 
352     trace_vtd_irq_generate(msi.address, msi.data);
353 
354     apic_get_class()->send_msi(&msi);
355 }
356 
357 /* Generate a fault event to software via MSI if conditions are met.
358  * Notice that the value of FSTS_REG being passed to it should be the one
359  * before any update.
360  */
361 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
362 {
363     if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
364         pre_fsts & VTD_FSTS_IQE) {
365         error_report_once("There are previous interrupt conditions "
366                           "to be serviced by software, fault event "
367                           "is not generated");
368         return;
369     }
370     vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
371     if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
372         error_report_once("Interrupt Mask set, irq is not generated");
373     } else {
374         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
375         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
376     }
377 }
378 
379 /* Check if the Fault (F) field of the Fault Recording Register referenced by
380  * @index is Set.
381  */
382 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
383 {
384     /* Each reg is 128-bit */
385     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
386     addr += 8; /* Access the high 64-bit half */
387 
388     assert(index < DMAR_FRCD_REG_NR);
389 
390     return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
391 }
392 
393 /* Update the PPF field of Fault Status Register.
394  * Should be called whenever change the F field of any fault recording
395  * registers.
396  */
397 static void vtd_update_fsts_ppf(IntelIOMMUState *s)
398 {
399     uint32_t i;
400     uint32_t ppf_mask = 0;
401 
402     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
403         if (vtd_is_frcd_set(s, i)) {
404             ppf_mask = VTD_FSTS_PPF;
405             break;
406         }
407     }
408     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
409     trace_vtd_fsts_ppf(!!ppf_mask);
410 }
411 
412 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
413 {
414     /* Each reg is 128-bit */
415     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
416     addr += 8; /* Access the high 64-bit half */
417 
418     assert(index < DMAR_FRCD_REG_NR);
419 
420     vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
421     vtd_update_fsts_ppf(s);
422 }
423 
424 /* Must not update F field now, should be done later */
425 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
426                             uint16_t source_id, hwaddr addr,
427                             VTDFaultReason fault, bool is_write)
428 {
429     uint64_t hi = 0, lo;
430     hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
431 
432     assert(index < DMAR_FRCD_REG_NR);
433 
434     lo = VTD_FRCD_FI(addr);
435     hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
436     if (!is_write) {
437         hi |= VTD_FRCD_T;
438     }
439     vtd_set_quad_raw(s, frcd_reg_addr, lo);
440     vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
441 
442     trace_vtd_frr_new(index, hi, lo);
443 }
444 
445 /* Try to collapse multiple pending faults from the same requester */
446 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
447 {
448     uint32_t i;
449     uint64_t frcd_reg;
450     hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
451 
452     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
453         frcd_reg = vtd_get_quad_raw(s, addr);
454         if ((frcd_reg & VTD_FRCD_F) &&
455             ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
456             return true;
457         }
458         addr += 16; /* 128-bit for each */
459     }
460     return false;
461 }
462 
463 /* Log and report an DMAR (address translation) fault to software */
464 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
465                                   hwaddr addr, VTDFaultReason fault,
466                                   bool is_write)
467 {
468     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
469 
470     assert(fault < VTD_FR_MAX);
471 
472     if (fault == VTD_FR_RESERVED_ERR) {
473         /* This is not a normal fault reason case. Drop it. */
474         return;
475     }
476 
477     trace_vtd_dmar_fault(source_id, fault, addr, is_write);
478 
479     if (fsts_reg & VTD_FSTS_PFO) {
480         error_report_once("New fault is not recorded due to "
481                           "Primary Fault Overflow");
482         return;
483     }
484 
485     if (vtd_try_collapse_fault(s, source_id)) {
486         error_report_once("New fault is not recorded due to "
487                           "compression of faults");
488         return;
489     }
490 
491     if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
492         error_report_once("Next Fault Recording Reg is used, "
493                           "new fault is not recorded, set PFO field");
494         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
495         return;
496     }
497 
498     vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
499 
500     if (fsts_reg & VTD_FSTS_PPF) {
501         error_report_once("There are pending faults already, "
502                           "fault event is not generated");
503         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
504         s->next_frcd_reg++;
505         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
506             s->next_frcd_reg = 0;
507         }
508     } else {
509         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
510                                 VTD_FSTS_FRI(s->next_frcd_reg));
511         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
512         s->next_frcd_reg++;
513         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
514             s->next_frcd_reg = 0;
515         }
516         /* This case actually cause the PPF to be Set.
517          * So generate fault event (interrupt).
518          */
519          vtd_generate_fault_event(s, fsts_reg);
520     }
521 }
522 
523 /* Handle Invalidation Queue Errors of queued invalidation interface error
524  * conditions.
525  */
526 static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
527 {
528     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
529 
530     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
531     vtd_generate_fault_event(s, fsts_reg);
532 }
533 
534 /* Set the IWC field and try to generate an invalidation completion interrupt */
535 static void vtd_generate_completion_event(IntelIOMMUState *s)
536 {
537     if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
538         trace_vtd_inv_desc_wait_irq("One pending, skip current");
539         return;
540     }
541     vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
542     vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
543     if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
544         trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
545                                     "new event not generated");
546         return;
547     } else {
548         /* Generate the interrupt event */
549         trace_vtd_inv_desc_wait_irq("Generating complete event");
550         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
551         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
552     }
553 }
554 
555 static inline bool vtd_root_entry_present(IntelIOMMUState *s,
556                                           VTDRootEntry *re,
557                                           uint8_t devfn)
558 {
559     if (s->root_scalable && devfn > UINT8_MAX / 2) {
560         return re->hi & VTD_ROOT_ENTRY_P;
561     }
562 
563     return re->lo & VTD_ROOT_ENTRY_P;
564 }
565 
566 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
567                               VTDRootEntry *re)
568 {
569     dma_addr_t addr;
570 
571     addr = s->root + index * sizeof(*re);
572     if (dma_memory_read(&address_space_memory, addr,
573                         re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) {
574         re->lo = 0;
575         return -VTD_FR_ROOT_TABLE_INV;
576     }
577     re->lo = le64_to_cpu(re->lo);
578     re->hi = le64_to_cpu(re->hi);
579     return 0;
580 }
581 
582 static inline bool vtd_ce_present(VTDContextEntry *context)
583 {
584     return context->lo & VTD_CONTEXT_ENTRY_P;
585 }
586 
587 static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
588                                            VTDRootEntry *re,
589                                            uint8_t index,
590                                            VTDContextEntry *ce)
591 {
592     dma_addr_t addr, ce_size;
593 
594     /* we have checked that root entry is present */
595     ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
596               VTD_CTX_ENTRY_LEGACY_SIZE;
597 
598     if (s->root_scalable && index > UINT8_MAX / 2) {
599         index = index & (~VTD_DEVFN_CHECK_MASK);
600         addr = re->hi & VTD_ROOT_ENTRY_CTP;
601     } else {
602         addr = re->lo & VTD_ROOT_ENTRY_CTP;
603     }
604 
605     addr = addr + index * ce_size;
606     if (dma_memory_read(&address_space_memory, addr,
607                         ce, ce_size, MEMTXATTRS_UNSPECIFIED)) {
608         return -VTD_FR_CONTEXT_TABLE_INV;
609     }
610 
611     ce->lo = le64_to_cpu(ce->lo);
612     ce->hi = le64_to_cpu(ce->hi);
613     if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
614         ce->val[2] = le64_to_cpu(ce->val[2]);
615         ce->val[3] = le64_to_cpu(ce->val[3]);
616     }
617     return 0;
618 }
619 
620 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
621 {
622     return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
623 }
624 
/* Address bits of @slpte, masked for an address width of @aw. */
static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
    const uint64_t addr_bits = slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);

    return addr_bits;
}
629 
630 /* Whether the pte indicates the address of the page frame */
631 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
632 {
633     return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
634 }
635 
636 /* Get the content of a spte located in @base_addr[@index] */
637 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
638 {
639     uint64_t slpte;
640 
641     assert(index < VTD_SL_PT_ENTRY_NR);
642 
643     if (dma_memory_read(&address_space_memory,
644                         base_addr + index * sizeof(slpte),
645                         &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) {
646         slpte = (uint64_t)-1;
647         return slpte;
648     }
649     slpte = le64_to_cpu(slpte);
650     return slpte;
651 }
652 
653 /* Given an iova and the level of paging structure, return the offset
654  * of current level.
655  */
656 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
657 {
658     return (iova >> vtd_slpt_level_shift(level)) &
659             ((1ULL << VTD_SL_LEVEL_BITS) - 1);
660 }
661 
662 /* Check Capability Register to see if the @level of page-table is supported */
663 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
664 {
665     return VTD_CAP_SAGAW_MASK & s->cap &
666            (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
667 }
668 
669 /* Return true if check passed, otherwise false */
670 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
671                                      VTDPASIDEntry *pe)
672 {
673     switch (VTD_PE_GET_TYPE(pe)) {
674     case VTD_SM_PASID_ENTRY_FLT:
675     case VTD_SM_PASID_ENTRY_SLT:
676     case VTD_SM_PASID_ENTRY_NESTED:
677         break;
678     case VTD_SM_PASID_ENTRY_PT:
679         if (!x86_iommu->pt_supported) {
680             return false;
681         }
682         break;
683     default:
684         /* Unknown type */
685         return false;
686     }
687     return true;
688 }
689 
690 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
691 {
692     return pdire->val & 1;
693 }
694 
695 /**
696  * Caller of this function should check present bit if wants
697  * to use pdir entry for further usage except for fpd bit check.
698  */
699 static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
700                                          uint32_t pasid,
701                                          VTDPASIDDirEntry *pdire)
702 {
703     uint32_t index;
704     dma_addr_t addr, entry_size;
705 
706     index = VTD_PASID_DIR_INDEX(pasid);
707     entry_size = VTD_PASID_DIR_ENTRY_SIZE;
708     addr = pasid_dir_base + index * entry_size;
709     if (dma_memory_read(&address_space_memory, addr,
710                         pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
711         return -VTD_FR_PASID_TABLE_INV;
712     }
713 
714     return 0;
715 }
716 
717 static inline bool vtd_pe_present(VTDPASIDEntry *pe)
718 {
719     return pe->val[0] & VTD_PASID_ENTRY_P;
720 }
721 
722 static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
723                                           uint32_t pasid,
724                                           dma_addr_t addr,
725                                           VTDPASIDEntry *pe)
726 {
727     uint32_t index;
728     dma_addr_t entry_size;
729     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
730 
731     index = VTD_PASID_TABLE_INDEX(pasid);
732     entry_size = VTD_PASID_ENTRY_SIZE;
733     addr = addr + index * entry_size;
734     if (dma_memory_read(&address_space_memory, addr,
735                         pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
736         return -VTD_FR_PASID_TABLE_INV;
737     }
738 
739     /* Do translation type check */
740     if (!vtd_pe_type_check(x86_iommu, pe)) {
741         return -VTD_FR_PASID_TABLE_INV;
742     }
743 
744     if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
745         return -VTD_FR_PASID_TABLE_INV;
746     }
747 
748     return 0;
749 }
750 
751 /**
752  * Caller of this function should check present bit if wants
753  * to use pasid entry for further usage except for fpd bit check.
754  */
755 static int vtd_get_pe_from_pdire(IntelIOMMUState *s,
756                                  uint32_t pasid,
757                                  VTDPASIDDirEntry *pdire,
758                                  VTDPASIDEntry *pe)
759 {
760     dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
761 
762     return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe);
763 }
764 
765 /**
766  * This function gets a pasid entry from a specified pasid
767  * table (includes dir and leaf table) with a specified pasid.
768  * Sanity check should be done to ensure return a present
769  * pasid entry to caller.
770  */
771 static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
772                                        dma_addr_t pasid_dir_base,
773                                        uint32_t pasid,
774                                        VTDPASIDEntry *pe)
775 {
776     int ret;
777     VTDPASIDDirEntry pdire;
778 
779     ret = vtd_get_pdire_from_pdir_table(pasid_dir_base,
780                                         pasid, &pdire);
781     if (ret) {
782         return ret;
783     }
784 
785     if (!vtd_pdire_present(&pdire)) {
786         return -VTD_FR_PASID_TABLE_INV;
787     }
788 
789     ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
790     if (ret) {
791         return ret;
792     }
793 
794     if (!vtd_pe_present(pe)) {
795         return -VTD_FR_PASID_TABLE_INV;
796     }
797 
798     return 0;
799 }
800 
801 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
802                                       VTDContextEntry *ce,
803                                       VTDPASIDEntry *pe)
804 {
805     uint32_t pasid;
806     dma_addr_t pasid_dir_base;
807     int ret = 0;
808 
809     pasid = VTD_CE_GET_RID2PASID(ce);
810     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
811     ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);
812 
813     return ret;
814 }
815 
816 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
817                                 VTDContextEntry *ce,
818                                 bool *pe_fpd_set)
819 {
820     int ret;
821     uint32_t pasid;
822     dma_addr_t pasid_dir_base;
823     VTDPASIDDirEntry pdire;
824     VTDPASIDEntry pe;
825 
826     pasid = VTD_CE_GET_RID2PASID(ce);
827     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
828 
829     /*
830      * No present bit check since fpd is meaningful even
831      * if the present bit is clear.
832      */
833     ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire);
834     if (ret) {
835         return ret;
836     }
837 
838     if (pdire.val & VTD_PASID_DIR_FPD) {
839         *pe_fpd_set = true;
840         return 0;
841     }
842 
843     if (!vtd_pdire_present(&pdire)) {
844         return -VTD_FR_PASID_TABLE_INV;
845     }
846 
847     /*
848      * No present bit check since fpd is meaningful even
849      * if the present bit is clear.
850      */
851     ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe);
852     if (ret) {
853         return ret;
854     }
855 
856     if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
857         *pe_fpd_set = true;
858     }
859 
860     return 0;
861 }
862 
863 /* Get the page-table level that hardware should use for the second-level
864  * page-table walk from the Address Width field of context-entry.
865  */
866 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
867 {
868     return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
869 }
870 
871 static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
872                                    VTDContextEntry *ce)
873 {
874     VTDPASIDEntry pe;
875 
876     if (s->root_scalable) {
877         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
878         return VTD_PE_GET_LEVEL(&pe);
879     }
880 
881     return vtd_ce_get_level(ce);
882 }
883 
884 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
885 {
886     return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
887 }
888 
889 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
890                                   VTDContextEntry *ce)
891 {
892     VTDPASIDEntry pe;
893 
894     if (s->root_scalable) {
895         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
896         return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
897     }
898 
899     return vtd_ce_get_agaw(ce);
900 }
901 
902 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
903 {
904     return ce->lo & VTD_CONTEXT_ENTRY_TT;
905 }
906 
907 /* Only for Legacy Mode. Return true if check passed, otherwise false */
908 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
909                                      VTDContextEntry *ce)
910 {
911     switch (vtd_ce_get_type(ce)) {
912     case VTD_CONTEXT_TT_MULTI_LEVEL:
913         /* Always supported */
914         break;
915     case VTD_CONTEXT_TT_DEV_IOTLB:
916         if (!x86_iommu->dt_supported) {
917             error_report_once("%s: DT specified but not supported", __func__);
918             return false;
919         }
920         break;
921     case VTD_CONTEXT_TT_PASS_THROUGH:
922         if (!x86_iommu->pt_supported) {
923             error_report_once("%s: PT specified but not supported", __func__);
924             return false;
925         }
926         break;
927     default:
928         /* Unknown type */
929         error_report_once("%s: unknown ce type: %"PRIu32, __func__,
930                           vtd_ce_get_type(ce));
931         return false;
932     }
933     return true;
934 }
935 
936 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
937                                       VTDContextEntry *ce, uint8_t aw)
938 {
939     uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
940     return 1ULL << MIN(ce_agaw, aw);
941 }
942 
943 /* Return true if IOVA passes range check, otherwise false. */
944 static inline bool vtd_iova_range_check(IntelIOMMUState *s,
945                                         uint64_t iova, VTDContextEntry *ce,
946                                         uint8_t aw)
947 {
948     /*
949      * Check if @iova is above 2^X-1, where X is the minimum of MGAW
950      * in CAP_REG and AW in context-entry.
951      */
952     return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
953 }
954 
955 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
956                                           VTDContextEntry *ce)
957 {
958     VTDPASIDEntry pe;
959 
960     if (s->root_scalable) {
961         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
962         return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
963     }
964 
965     return vtd_ce_get_slpt_base(ce);
966 }
967 
968 /*
969  * Rsvd field masks for spte:
970  *     vtd_spte_rsvd 4k pages
971  *     vtd_spte_rsvd_large large pages
972  */
973 static uint64_t vtd_spte_rsvd[5];
974 static uint64_t vtd_spte_rsvd_large[5];
975 
976 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
977 {
978     uint64_t rsvd_mask = vtd_spte_rsvd[level];
979 
980     if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
981         (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
982         /* large page */
983         rsvd_mask = vtd_spte_rsvd_large[level];
984     }
985 
986     return slpte & rsvd_mask;
987 }
988 
989 /* Find the VTD address space associated with a given bus number */
990 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
991 {
992     VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
993     GHashTableIter iter;
994 
995     if (vtd_bus) {
996         return vtd_bus;
997     }
998 
999     /*
1000      * Iterate over the registered buses to find the one which
1001      * currently holds this bus number and update the bus_num
1002      * lookup table.
1003      */
1004     g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1005     while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1006         if (pci_bus_num(vtd_bus->bus) == bus_num) {
1007             s->vtd_as_by_bus_num[bus_num] = vtd_bus;
1008             return vtd_bus;
1009         }
1010     }
1011 
1012     return NULL;
1013 }
1014 
1015 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
1016  * of the translation, can be used for deciding the size of large page.
1017  */
1018 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
1019                              uint64_t iova, bool is_write,
1020                              uint64_t *slptep, uint32_t *slpte_level,
1021                              bool *reads, bool *writes, uint8_t aw_bits)
1022 {
1023     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1024     uint32_t level = vtd_get_iova_level(s, ce);
1025     uint32_t offset;
1026     uint64_t slpte;
1027     uint64_t access_right_check;
1028 
1029     if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
1030         error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
1031                           __func__, iova);
1032         return -VTD_FR_ADDR_BEYOND_MGAW;
1033     }
1034 
1035     /* FIXME: what is the Atomics request here? */
1036     access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
1037 
1038     while (true) {
1039         offset = vtd_iova_level_offset(iova, level);
1040         slpte = vtd_get_slpte(addr, offset);
1041 
1042         if (slpte == (uint64_t)-1) {
1043             error_report_once("%s: detected read error on DMAR slpte "
1044                               "(iova=0x%" PRIx64 ")", __func__, iova);
1045             if (level == vtd_get_iova_level(s, ce)) {
1046                 /* Invalid programming of context-entry */
1047                 return -VTD_FR_CONTEXT_ENTRY_INV;
1048             } else {
1049                 return -VTD_FR_PAGING_ENTRY_INV;
1050             }
1051         }
1052         *reads = (*reads) && (slpte & VTD_SL_R);
1053         *writes = (*writes) && (slpte & VTD_SL_W);
1054         if (!(slpte & access_right_check)) {
1055             error_report_once("%s: detected slpte permission error "
1056                               "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
1057                               "slpte=0x%" PRIx64 ", write=%d)", __func__,
1058                               iova, level, slpte, is_write);
1059             return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
1060         }
1061         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1062             error_report_once("%s: detected splte reserve non-zero "
1063                               "iova=0x%" PRIx64 ", level=0x%" PRIx32
1064                               "slpte=0x%" PRIx64 ")", __func__, iova,
1065                               level, slpte);
1066             return -VTD_FR_PAGING_ENTRY_RSVD;
1067         }
1068 
1069         if (vtd_is_last_slpte(slpte, level)) {
1070             *slptep = slpte;
1071             *slpte_level = level;
1072             return 0;
1073         }
1074         addr = vtd_get_slpte_addr(slpte, aw_bits);
1075         level--;
1076     }
1077 }
1078 
/*
 * Callback invoked by the page walk for each event it decides to
 * deliver; receives the event and the walker's private pointer.
 */
typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);

/**
 * Constant information used during page walking
 *
 * @as: VT-d address space of the device
 * @hook_fn: hook func to be called when detected page
 * @private: private data to be passed into hook func
 * @notify_unmap: whether we should notify invalid entries
 * @aw: maximum address width
 * @domain_id: domain ID of the page walk
 */
typedef struct {
    VTDAddressSpace *as;
    vtd_page_walk_hook hook_fn;
    void *private;
    bool notify_unmap;
    uint8_t aw;
    uint16_t domain_id;
} vtd_page_walk_info;
1099 
1100 static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info)
1101 {
1102     VTDAddressSpace *as = info->as;
1103     vtd_page_walk_hook hook_fn = info->hook_fn;
1104     void *private = info->private;
1105     IOMMUTLBEntry *entry = &event->entry;
1106     DMAMap target = {
1107         .iova = entry->iova,
1108         .size = entry->addr_mask,
1109         .translated_addr = entry->translated_addr,
1110         .perm = entry->perm,
1111     };
1112     const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
1113 
1114     if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) {
1115         trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1116         return 0;
1117     }
1118 
1119     assert(hook_fn);
1120 
1121     /* Update local IOVA mapped ranges */
1122     if (event->type == IOMMU_NOTIFIER_MAP) {
1123         if (mapped) {
1124             /* If it's exactly the same translation, skip */
1125             if (!memcmp(mapped, &target, sizeof(target))) {
1126                 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
1127                                                  entry->translated_addr);
1128                 return 0;
1129             } else {
1130                 /*
1131                  * Translation changed.  Normally this should not
1132                  * happen, but it can happen when with buggy guest
1133                  * OSes.  Note that there will be a small window that
1134                  * we don't have map at all.  But that's the best
1135                  * effort we can do.  The ideal way to emulate this is
1136                  * atomically modify the PTE to follow what has
1137                  * changed, but we can't.  One example is that vfio
1138                  * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
1139                  * interface to modify a mapping (meanwhile it seems
1140                  * meaningless to even provide one).  Anyway, let's
1141                  * mark this as a TODO in case one day we'll have
1142                  * a better solution.
1143                  */
1144                 IOMMUAccessFlags cache_perm = entry->perm;
1145                 int ret;
1146 
1147                 /* Emulate an UNMAP */
1148                 event->type = IOMMU_NOTIFIER_UNMAP;
1149                 entry->perm = IOMMU_NONE;
1150                 trace_vtd_page_walk_one(info->domain_id,
1151                                         entry->iova,
1152                                         entry->translated_addr,
1153                                         entry->addr_mask,
1154                                         entry->perm);
1155                 ret = hook_fn(event, private);
1156                 if (ret) {
1157                     return ret;
1158                 }
1159                 /* Drop any existing mapping */
1160                 iova_tree_remove(as->iova_tree, &target);
1161                 /* Recover the correct type */
1162                 event->type = IOMMU_NOTIFIER_MAP;
1163                 entry->perm = cache_perm;
1164             }
1165         }
1166         iova_tree_insert(as->iova_tree, &target);
1167     } else {
1168         if (!mapped) {
1169             /* Skip since we didn't map this range at all */
1170             trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1171             return 0;
1172         }
1173         iova_tree_remove(as->iova_tree, &target);
1174     }
1175 
1176     trace_vtd_page_walk_one(info->domain_id, entry->iova,
1177                             entry->translated_addr, entry->addr_mask,
1178                             entry->perm);
1179     return hook_fn(event, private);
1180 }
1181 
1182 /**
1183  * vtd_page_walk_level - walk over specific level for IOVA range
1184  *
1185  * @addr: base GPA addr to start the walk
1186  * @start: IOVA range start address
1187  * @end: IOVA range end address (start <= addr < end)
1188  * @read: whether parent level has read permission
1189  * @write: whether parent level has write permission
1190  * @info: constant information for the page walk
1191  */
1192 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
1193                                uint64_t end, uint32_t level, bool read,
1194                                bool write, vtd_page_walk_info *info)
1195 {
1196     bool read_cur, write_cur, entry_valid;
1197     uint32_t offset;
1198     uint64_t slpte;
1199     uint64_t subpage_size, subpage_mask;
1200     IOMMUTLBEvent event;
1201     uint64_t iova = start;
1202     uint64_t iova_next;
1203     int ret = 0;
1204 
1205     trace_vtd_page_walk_level(addr, level, start, end);
1206 
1207     subpage_size = 1ULL << vtd_slpt_level_shift(level);
1208     subpage_mask = vtd_slpt_level_page_mask(level);
1209 
1210     while (iova < end) {
1211         iova_next = (iova & subpage_mask) + subpage_size;
1212 
1213         offset = vtd_iova_level_offset(iova, level);
1214         slpte = vtd_get_slpte(addr, offset);
1215 
1216         if (slpte == (uint64_t)-1) {
1217             trace_vtd_page_walk_skip_read(iova, iova_next);
1218             goto next;
1219         }
1220 
1221         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1222             trace_vtd_page_walk_skip_reserve(iova, iova_next);
1223             goto next;
1224         }
1225 
1226         /* Permissions are stacked with parents' */
1227         read_cur = read && (slpte & VTD_SL_R);
1228         write_cur = write && (slpte & VTD_SL_W);
1229 
1230         /*
1231          * As long as we have either read/write permission, this is a
1232          * valid entry. The rule works for both page entries and page
1233          * table entries.
1234          */
1235         entry_valid = read_cur | write_cur;
1236 
1237         if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
1238             /*
1239              * This is a valid PDE (or even bigger than PDE).  We need
1240              * to walk one further level.
1241              */
1242             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
1243                                       iova, MIN(iova_next, end), level - 1,
1244                                       read_cur, write_cur, info);
1245         } else {
1246             /*
1247              * This means we are either:
1248              *
1249              * (1) the real page entry (either 4K page, or huge page)
1250              * (2) the whole range is invalid
1251              *
1252              * In either case, we send an IOTLB notification down.
1253              */
1254             event.entry.target_as = &address_space_memory;
1255             event.entry.iova = iova & subpage_mask;
1256             event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
1257             event.entry.addr_mask = ~subpage_mask;
1258             /* NOTE: this is only meaningful if entry_valid == true */
1259             event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
1260             event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
1261                                             IOMMU_NOTIFIER_UNMAP;
1262             ret = vtd_page_walk_one(&event, info);
1263         }
1264 
1265         if (ret < 0) {
1266             return ret;
1267         }
1268 
1269 next:
1270         iova = iova_next;
1271     }
1272 
1273     return 0;
1274 }
1275 
1276 /**
1277  * vtd_page_walk - walk specific IOVA range, and call the hook
1278  *
1279  * @s: intel iommu state
1280  * @ce: context entry to walk upon
1281  * @start: IOVA address to start the walk
1282  * @end: IOVA range end address (start <= addr < end)
1283  * @info: page walking information struct
1284  */
1285 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
1286                          uint64_t start, uint64_t end,
1287                          vtd_page_walk_info *info)
1288 {
1289     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1290     uint32_t level = vtd_get_iova_level(s, ce);
1291 
1292     if (!vtd_iova_range_check(s, start, ce, info->aw)) {
1293         return -VTD_FR_ADDR_BEYOND_MGAW;
1294     }
1295 
1296     if (!vtd_iova_range_check(s, end, ce, info->aw)) {
1297         /* Fix end so that it reaches the maximum */
1298         end = vtd_iova_limit(s, ce, info->aw);
1299     }
1300 
1301     return vtd_page_walk_level(addr, start, end, level, true, true, info);
1302 }
1303 
1304 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
1305                                           VTDRootEntry *re)
1306 {
1307     /* Legacy Mode reserved bits check */
1308     if (!s->root_scalable &&
1309         (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1310         goto rsvd_err;
1311 
1312     /* Scalable Mode reserved bits check */
1313     if (s->root_scalable &&
1314         ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
1315          (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1316         goto rsvd_err;
1317 
1318     return 0;
1319 
1320 rsvd_err:
1321     error_report_once("%s: invalid root entry: hi=0x%"PRIx64
1322                       ", lo=0x%"PRIx64,
1323                       __func__, re->hi, re->lo);
1324     return -VTD_FR_ROOT_ENTRY_RSVD;
1325 }
1326 
1327 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
1328                                                     VTDContextEntry *ce)
1329 {
1330     if (!s->root_scalable &&
1331         (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
1332          ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
1333         error_report_once("%s: invalid context entry: hi=%"PRIx64
1334                           ", lo=%"PRIx64" (reserved nonzero)",
1335                           __func__, ce->hi, ce->lo);
1336         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1337     }
1338 
1339     if (s->root_scalable &&
1340         (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
1341          ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
1342          ce->val[2] ||
1343          ce->val[3])) {
1344         error_report_once("%s: invalid context entry: val[3]=%"PRIx64
1345                           ", val[2]=%"PRIx64
1346                           ", val[1]=%"PRIx64
1347                           ", val[0]=%"PRIx64" (reserved nonzero)",
1348                           __func__, ce->val[3], ce->val[2],
1349                           ce->val[1], ce->val[0]);
1350         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1351     }
1352 
1353     return 0;
1354 }
1355 
1356 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
1357                                   VTDContextEntry *ce)
1358 {
1359     VTDPASIDEntry pe;
1360 
1361     /*
1362      * Make sure in Scalable Mode, a present context entry
1363      * has valid rid2pasid setting, which includes valid
1364      * rid2pasid field and corresponding pasid entry setting
1365      */
1366     return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1367 }
1368 
1369 /* Map a device to its corresponding domain (context-entry) */
1370 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
1371                                     uint8_t devfn, VTDContextEntry *ce)
1372 {
1373     VTDRootEntry re;
1374     int ret_fr;
1375     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
1376 
1377     ret_fr = vtd_get_root_entry(s, bus_num, &re);
1378     if (ret_fr) {
1379         return ret_fr;
1380     }
1381 
1382     if (!vtd_root_entry_present(s, &re, devfn)) {
1383         /* Not error - it's okay we don't have root entry. */
1384         trace_vtd_re_not_present(bus_num);
1385         return -VTD_FR_ROOT_ENTRY_P;
1386     }
1387 
1388     ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
1389     if (ret_fr) {
1390         return ret_fr;
1391     }
1392 
1393     ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
1394     if (ret_fr) {
1395         return ret_fr;
1396     }
1397 
1398     if (!vtd_ce_present(ce)) {
1399         /* Not error - it's okay we don't have context entry. */
1400         trace_vtd_ce_not_present(bus_num, devfn);
1401         return -VTD_FR_CONTEXT_ENTRY_P;
1402     }
1403 
1404     ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
1405     if (ret_fr) {
1406         return ret_fr;
1407     }
1408 
1409     /* Check if the programming of context-entry is valid */
1410     if (!s->root_scalable &&
1411         !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
1412         error_report_once("%s: invalid context entry: hi=%"PRIx64
1413                           ", lo=%"PRIx64" (level %d not supported)",
1414                           __func__, ce->hi, ce->lo,
1415                           vtd_ce_get_level(ce));
1416         return -VTD_FR_CONTEXT_ENTRY_INV;
1417     }
1418 
1419     if (!s->root_scalable) {
1420         /* Do translation type check */
1421         if (!vtd_ce_type_check(x86_iommu, ce)) {
1422             /* Errors dumped in vtd_ce_type_check() */
1423             return -VTD_FR_CONTEXT_ENTRY_INV;
1424         }
1425     } else {
1426         /*
1427          * Check if the programming of context-entry.rid2pasid
1428          * and corresponding pasid setting is valid, and thus
1429          * avoids to check pasid entry fetching result in future
1430          * helper function calling.
1431          */
1432         ret_fr = vtd_ce_rid2pasid_check(s, ce);
1433         if (ret_fr) {
1434             return ret_fr;
1435         }
1436     }
1437 
1438     return 0;
1439 }
1440 
1441 static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event,
1442                                      void *private)
1443 {
1444     memory_region_notify_iommu(private, 0, *event);
1445     return 0;
1446 }
1447 
1448 static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
1449                                   VTDContextEntry *ce)
1450 {
1451     VTDPASIDEntry pe;
1452 
1453     if (s->root_scalable) {
1454         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1455         return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
1456     }
1457 
1458     return VTD_CONTEXT_ENTRY_DID(ce->hi);
1459 }
1460 
1461 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
1462                                             VTDContextEntry *ce,
1463                                             hwaddr addr, hwaddr size)
1464 {
1465     IntelIOMMUState *s = vtd_as->iommu_state;
1466     vtd_page_walk_info info = {
1467         .hook_fn = vtd_sync_shadow_page_hook,
1468         .private = (void *)&vtd_as->iommu,
1469         .notify_unmap = true,
1470         .aw = s->aw_bits,
1471         .as = vtd_as,
1472         .domain_id = vtd_get_domain_id(s, ce),
1473     };
1474 
1475     return vtd_page_walk(s, ce, addr, addr + size, &info);
1476 }
1477 
1478 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
1479 {
1480     int ret;
1481     VTDContextEntry ce;
1482     IOMMUNotifier *n;
1483 
1484     if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) {
1485         return 0;
1486     }
1487 
1488     ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
1489                                    pci_bus_num(vtd_as->bus),
1490                                    vtd_as->devfn, &ce);
1491     if (ret) {
1492         if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
1493             /*
1494              * It's a valid scenario to have a context entry that is
1495              * not present.  For example, when a device is removed
1496              * from an existing domain then the context entry will be
1497              * zeroed by the guest before it was put into another
1498              * domain.  When this happens, instead of synchronizing
1499              * the shadow pages we should invalidate all existing
1500              * mappings and notify the backends.
1501              */
1502             IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
1503                 vtd_address_space_unmap(vtd_as, n);
1504             }
1505             ret = 0;
1506         }
1507         return ret;
1508     }
1509 
1510     return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
1511 }
1512 
1513 /*
1514  * Check if specific device is configured to bypass address
1515  * translation for DMA requests. In Scalable Mode, bypass
1516  * 1st-level translation or 2nd-level translation, it depends
1517  * on PGTT setting.
1518  */
1519 static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce)
1520 {
1521     VTDPASIDEntry pe;
1522     int ret;
1523 
1524     if (s->root_scalable) {
1525         ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1526         if (ret) {
1527             error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
1528                               __func__, ret);
1529             return false;
1530         }
1531         return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
1532     }
1533 
1534     return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
1535 
1536 }
1537 
1538 static bool vtd_as_pt_enabled(VTDAddressSpace *as)
1539 {
1540     IntelIOMMUState *s;
1541     VTDContextEntry ce;
1542     int ret;
1543 
1544     assert(as);
1545 
1546     s = as->iommu_state;
1547     ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
1548                                    as->devfn, &ce);
1549     if (ret) {
1550         /*
1551          * Possibly failed to parse the context entry for some reason
1552          * (e.g., during init, or any guest configuration errors on
1553          * context entries). We should assume PT not enabled for
1554          * safety.
1555          */
1556         return false;
1557     }
1558 
1559     return vtd_dev_pt_enabled(s, &ce);
1560 }
1561 
1562 /* Return whether the device is using IOMMU translation. */
1563 static bool vtd_switch_address_space(VTDAddressSpace *as)
1564 {
1565     bool use_iommu;
1566     /* Whether we need to take the BQL on our own */
1567     bool take_bql = !qemu_mutex_iothread_locked();
1568 
1569     assert(as);
1570 
1571     use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
1572 
1573     trace_vtd_switch_address_space(pci_bus_num(as->bus),
1574                                    VTD_PCI_SLOT(as->devfn),
1575                                    VTD_PCI_FUNC(as->devfn),
1576                                    use_iommu);
1577 
1578     /*
1579      * It's possible that we reach here without BQL, e.g., when called
1580      * from vtd_pt_enable_fast_path(). However the memory APIs need
1581      * it. We'd better make sure we have had it already, or, take it.
1582      */
1583     if (take_bql) {
1584         qemu_mutex_lock_iothread();
1585     }
1586 
1587     /* Turn off first then on the other */
1588     if (use_iommu) {
1589         memory_region_set_enabled(&as->nodmar, false);
1590         memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1591     } else {
1592         memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1593         memory_region_set_enabled(&as->nodmar, true);
1594     }
1595 
1596     if (take_bql) {
1597         qemu_mutex_unlock_iothread();
1598     }
1599 
1600     return use_iommu;
1601 }
1602 
1603 static void vtd_switch_address_space_all(IntelIOMMUState *s)
1604 {
1605     GHashTableIter iter;
1606     VTDBus *vtd_bus;
1607     int i;
1608 
1609     g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1610     while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1611         for (i = 0; i < PCI_DEVFN_MAX; i++) {
1612             if (!vtd_bus->dev_as[i]) {
1613                 continue;
1614             }
1615             vtd_switch_address_space(vtd_bus->dev_as[i]);
1616         }
1617     }
1618 }
1619 
/*
 * Build a 16-bit source ID with the bus number in the upper byte and
 * the device/function number in the lower byte.
 */
static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
{
    unsigned long bus_part = (bus_num & 0xffUL) << 8;
    unsigned long devfn_part = devfn & 0xffUL;

    return bus_part | devfn_part;
}
1624 
1625 static const bool vtd_qualified_faults[] = {
1626     [VTD_FR_RESERVED] = false,
1627     [VTD_FR_ROOT_ENTRY_P] = false,
1628     [VTD_FR_CONTEXT_ENTRY_P] = true,
1629     [VTD_FR_CONTEXT_ENTRY_INV] = true,
1630     [VTD_FR_ADDR_BEYOND_MGAW] = true,
1631     [VTD_FR_WRITE] = true,
1632     [VTD_FR_READ] = true,
1633     [VTD_FR_PAGING_ENTRY_INV] = true,
1634     [VTD_FR_ROOT_TABLE_INV] = false,
1635     [VTD_FR_CONTEXT_TABLE_INV] = false,
1636     [VTD_FR_ROOT_ENTRY_RSVD] = false,
1637     [VTD_FR_PAGING_ENTRY_RSVD] = true,
1638     [VTD_FR_CONTEXT_ENTRY_TT] = true,
1639     [VTD_FR_PASID_TABLE_INV] = false,
1640     [VTD_FR_RESERVED_ERR] = false,
1641     [VTD_FR_MAX] = false,
1642 };
1643 
1644 /* To see if a fault condition is "qualified", which is reported to software
1645  * only if the FPD field in the context-entry used to process the faulting
1646  * request is 0.
1647  */
1648 static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1649 {
1650     return vtd_qualified_faults[fault];
1651 }
1652 
1653 static inline bool vtd_is_interrupt_addr(hwaddr addr)
1654 {
1655     return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1656 }
1657 
1658 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1659 {
1660     VTDBus *vtd_bus;
1661     VTDAddressSpace *vtd_as;
1662     bool success = false;
1663 
1664     vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1665     if (!vtd_bus) {
1666         goto out;
1667     }
1668 
1669     vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1670     if (!vtd_as) {
1671         goto out;
1672     }
1673 
1674     if (vtd_switch_address_space(vtd_as) == false) {
1675         /* We switched off IOMMU region successfully. */
1676         success = true;
1677     }
1678 
1679 out:
1680     trace_vtd_pt_enable_fast_path(source_id, success);
1681 }
1682 
1683 /* Map dev to context-entry then do a paging-structures walk to do a iommu
1684  * translation.
1685  *
1686  * Called from RCU critical section.
1687  *
1688  * @bus_num: The bus number
1689  * @devfn: The devfn, which is the  combined of device and function number
1690  * @is_write: The access is a write operation
1691  * @entry: IOMMUTLBEntry that contain the addr to be translated and result
1692  *
1693  * Returns true if translation is successful, otherwise false.
1694  */
1695 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
1696                                    uint8_t devfn, hwaddr addr, bool is_write,
1697                                    IOMMUTLBEntry *entry)
1698 {
1699     IntelIOMMUState *s = vtd_as->iommu_state;
1700     VTDContextEntry ce;
1701     uint8_t bus_num = pci_bus_num(bus);
1702     VTDContextCacheEntry *cc_entry;
1703     uint64_t slpte, page_mask;
1704     uint32_t level;
1705     uint16_t source_id = vtd_make_source_id(bus_num, devfn);
1706     int ret_fr;
1707     bool is_fpd_set = false;
1708     bool reads = true;
1709     bool writes = true;
1710     uint8_t access_flags;
1711     VTDIOTLBEntry *iotlb_entry;
1712 
1713     /*
1714      * We have standalone memory region for interrupt addresses, we
1715      * should never receive translation requests in this region.
1716      */
1717     assert(!vtd_is_interrupt_addr(addr));
1718 
1719     vtd_iommu_lock(s);
1720 
1721     cc_entry = &vtd_as->context_cache_entry;
1722 
1723     /* Try to fetch slpte form IOTLB */
1724     iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
1725     if (iotlb_entry) {
1726         trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
1727                                  iotlb_entry->domain_id);
1728         slpte = iotlb_entry->slpte;
1729         access_flags = iotlb_entry->access_flags;
1730         page_mask = iotlb_entry->mask;
1731         goto out;
1732     }
1733 
1734     /* Try to fetch context-entry from cache first */
1735     if (cc_entry->context_cache_gen == s->context_cache_gen) {
1736         trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
1737                                cc_entry->context_entry.lo,
1738                                cc_entry->context_cache_gen);
1739         ce = cc_entry->context_entry;
1740         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1741         if (!is_fpd_set && s->root_scalable) {
1742             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1743             VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1744         }
1745     } else {
1746         ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
1747         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1748         if (!ret_fr && !is_fpd_set && s->root_scalable) {
1749             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1750         }
1751         VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1752         /* Update context-cache */
1753         trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
1754                                   cc_entry->context_cache_gen,
1755                                   s->context_cache_gen);
1756         cc_entry->context_entry = ce;
1757         cc_entry->context_cache_gen = s->context_cache_gen;
1758     }
1759 
1760     /*
1761      * We don't need to translate for pass-through context entries.
1762      * Also, let's ignore IOTLB caching as well for PT devices.
1763      */
1764     if (vtd_dev_pt_enabled(s, &ce)) {
1765         entry->iova = addr & VTD_PAGE_MASK_4K;
1766         entry->translated_addr = entry->iova;
1767         entry->addr_mask = ~VTD_PAGE_MASK_4K;
1768         entry->perm = IOMMU_RW;
1769         trace_vtd_translate_pt(source_id, entry->iova);
1770 
1771         /*
1772          * When this happens, it means firstly caching-mode is not
1773          * enabled, and this is the first passthrough translation for
1774          * the device. Let's enable the fast path for passthrough.
1775          *
1776          * When passthrough is disabled again for the device, we can
1777          * capture it via the context entry invalidation, then the
1778          * IOMMU region can be swapped back.
1779          */
1780         vtd_pt_enable_fast_path(s, source_id);
1781         vtd_iommu_unlock(s);
1782         return true;
1783     }
1784 
1785     ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
1786                                &reads, &writes, s->aw_bits);
1787     VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1788 
1789     page_mask = vtd_slpt_level_page_mask(level);
1790     access_flags = IOMMU_ACCESS_FLAG(reads, writes);
1791     vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
1792                      access_flags, level);
1793 out:
1794     vtd_iommu_unlock(s);
1795     entry->iova = addr & page_mask;
1796     entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
1797     entry->addr_mask = ~page_mask;
1798     entry->perm = access_flags;
1799     return true;
1800 
1801 error:
1802     vtd_iommu_unlock(s);
1803     entry->iova = 0;
1804     entry->translated_addr = 0;
1805     entry->addr_mask = 0;
1806     entry->perm = IOMMU_NONE;
1807     return false;
1808 }
1809 
1810 static void vtd_root_table_setup(IntelIOMMUState *s)
1811 {
1812     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1813     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1814 
1815     vtd_update_scalable_state(s);
1816 
1817     trace_vtd_reg_dmar_root(s->root, s->root_scalable);
1818 }
1819 
1820 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1821                                uint32_t index, uint32_t mask)
1822 {
1823     x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1824 }
1825 
1826 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1827 {
1828     uint64_t value = 0;
1829     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1830     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1831     s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1832     s->intr_eime = value & VTD_IRTA_EIME;
1833 
1834     /* Notify global invalidation */
1835     vtd_iec_notify_all(s, true, 0, 0);
1836 
1837     trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1838 }
1839 
1840 static void vtd_iommu_replay_all(IntelIOMMUState *s)
1841 {
1842     VTDAddressSpace *vtd_as;
1843 
1844     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1845         vtd_sync_shadow_page_table(vtd_as);
1846     }
1847 }
1848 
1849 static void vtd_context_global_invalidate(IntelIOMMUState *s)
1850 {
1851     trace_vtd_inv_desc_cc_global();
1852     /* Protects context cache */
1853     vtd_iommu_lock(s);
1854     s->context_cache_gen++;
1855     if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1856         vtd_reset_context_cache_locked(s);
1857     }
1858     vtd_iommu_unlock(s);
1859     vtd_address_space_refresh_all(s);
1860     /*
1861      * From VT-d spec 6.5.2.1, a global context entry invalidation
1862      * should be followed by a IOTLB global invalidation, so we should
1863      * be safe even without this. Hoewever, let's replay the region as
1864      * well to be safer, and go back here when we need finer tunes for
1865      * VT-d emulation codes.
1866      */
1867     vtd_iommu_replay_all(s);
1868 }
1869 
1870 /* Do a context-cache device-selective invalidation.
1871  * @func_mask: FM field after shifting
1872  */
1873 static void vtd_context_device_invalidate(IntelIOMMUState *s,
1874                                           uint16_t source_id,
1875                                           uint16_t func_mask)
1876 {
1877     uint16_t mask;
1878     VTDBus *vtd_bus;
1879     VTDAddressSpace *vtd_as;
1880     uint8_t bus_n, devfn;
1881     uint16_t devfn_it;
1882 
1883     trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1884 
1885     switch (func_mask & 3) {
1886     case 0:
1887         mask = 0;   /* No bits in the SID field masked */
1888         break;
1889     case 1:
1890         mask = 4;   /* Mask bit 2 in the SID field */
1891         break;
1892     case 2:
1893         mask = 6;   /* Mask bit 2:1 in the SID field */
1894         break;
1895     case 3:
1896         mask = 7;   /* Mask bit 2:0 in the SID field */
1897         break;
1898     default:
1899         g_assert_not_reached();
1900     }
1901     mask = ~mask;
1902 
1903     bus_n = VTD_SID_TO_BUS(source_id);
1904     vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1905     if (vtd_bus) {
1906         devfn = VTD_SID_TO_DEVFN(source_id);
1907         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1908             vtd_as = vtd_bus->dev_as[devfn_it];
1909             if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1910                 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1911                                              VTD_PCI_FUNC(devfn_it));
1912                 vtd_iommu_lock(s);
1913                 vtd_as->context_cache_entry.context_cache_gen = 0;
1914                 vtd_iommu_unlock(s);
1915                 /*
1916                  * Do switch address space when needed, in case if the
1917                  * device passthrough bit is switched.
1918                  */
1919                 vtd_switch_address_space(vtd_as);
1920                 /*
1921                  * So a device is moving out of (or moving into) a
1922                  * domain, resync the shadow page table.
1923                  * This won't bring bad even if we have no such
1924                  * notifier registered - the IOMMU notification
1925                  * framework will skip MAP notifications if that
1926                  * happened.
1927                  */
1928                 vtd_sync_shadow_page_table(vtd_as);
1929             }
1930         }
1931     }
1932 }
1933 
1934 /* Context-cache invalidation
1935  * Returns the Context Actual Invalidation Granularity.
1936  * @val: the content of the CCMD_REG
1937  */
1938 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1939 {
1940     uint64_t caig;
1941     uint64_t type = val & VTD_CCMD_CIRG_MASK;
1942 
1943     switch (type) {
1944     case VTD_CCMD_DOMAIN_INVL:
1945         /* Fall through */
1946     case VTD_CCMD_GLOBAL_INVL:
1947         caig = VTD_CCMD_GLOBAL_INVL_A;
1948         vtd_context_global_invalidate(s);
1949         break;
1950 
1951     case VTD_CCMD_DEVICE_INVL:
1952         caig = VTD_CCMD_DEVICE_INVL_A;
1953         vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1954         break;
1955 
1956     default:
1957         error_report_once("%s: invalid context: 0x%" PRIx64,
1958                           __func__, val);
1959         caig = 0;
1960     }
1961     return caig;
1962 }
1963 
1964 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1965 {
1966     trace_vtd_inv_desc_iotlb_global();
1967     vtd_reset_iotlb(s);
1968     vtd_iommu_replay_all(s);
1969 }
1970 
1971 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
1972 {
1973     VTDContextEntry ce;
1974     VTDAddressSpace *vtd_as;
1975 
1976     trace_vtd_inv_desc_iotlb_domain(domain_id);
1977 
1978     vtd_iommu_lock(s);
1979     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
1980                                 &domain_id);
1981     vtd_iommu_unlock(s);
1982 
1983     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1984         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1985                                       vtd_as->devfn, &ce) &&
1986             domain_id == vtd_get_domain_id(s, &ce)) {
1987             vtd_sync_shadow_page_table(vtd_as);
1988         }
1989     }
1990 }
1991 
1992 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
1993                                            uint16_t domain_id, hwaddr addr,
1994                                            uint8_t am)
1995 {
1996     VTDAddressSpace *vtd_as;
1997     VTDContextEntry ce;
1998     int ret;
1999     hwaddr size = (1 << am) * VTD_PAGE_SIZE;
2000 
2001     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
2002         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
2003                                        vtd_as->devfn, &ce);
2004         if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
2005             if (vtd_as_has_map_notifier(vtd_as)) {
2006                 /*
2007                  * As long as we have MAP notifications registered in
2008                  * any of our IOMMU notifiers, we need to sync the
2009                  * shadow page table.
2010                  */
2011                 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
2012             } else {
2013                 /*
2014                  * For UNMAP-only notifiers, we don't need to walk the
2015                  * page tables.  We just deliver the PSI down to
2016                  * invalidate caches.
2017                  */
2018                 IOMMUTLBEvent event = {
2019                     .type = IOMMU_NOTIFIER_UNMAP,
2020                     .entry = {
2021                         .target_as = &address_space_memory,
2022                         .iova = addr,
2023                         .translated_addr = 0,
2024                         .addr_mask = size - 1,
2025                         .perm = IOMMU_NONE,
2026                     },
2027                 };
2028                 memory_region_notify_iommu(&vtd_as->iommu, 0, event);
2029             }
2030         }
2031     }
2032 }
2033 
2034 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
2035                                       hwaddr addr, uint8_t am)
2036 {
2037     VTDIOTLBPageInvInfo info;
2038 
2039     trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
2040 
2041     assert(am <= VTD_MAMV);
2042     info.domain_id = domain_id;
2043     info.addr = addr;
2044     info.mask = ~((1 << am) - 1);
2045     vtd_iommu_lock(s);
2046     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
2047     vtd_iommu_unlock(s);
2048     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
2049 }
2050 
2051 /* Flush IOTLB
2052  * Returns the IOTLB Actual Invalidation Granularity.
2053  * @val: the content of the IOTLB_REG
2054  */
2055 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
2056 {
2057     uint64_t iaig;
2058     uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
2059     uint16_t domain_id;
2060     hwaddr addr;
2061     uint8_t am;
2062 
2063     switch (type) {
2064     case VTD_TLB_GLOBAL_FLUSH:
2065         iaig = VTD_TLB_GLOBAL_FLUSH_A;
2066         vtd_iotlb_global_invalidate(s);
2067         break;
2068 
2069     case VTD_TLB_DSI_FLUSH:
2070         domain_id = VTD_TLB_DID(val);
2071         iaig = VTD_TLB_DSI_FLUSH_A;
2072         vtd_iotlb_domain_invalidate(s, domain_id);
2073         break;
2074 
2075     case VTD_TLB_PSI_FLUSH:
2076         domain_id = VTD_TLB_DID(val);
2077         addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
2078         am = VTD_IVA_AM(addr);
2079         addr = VTD_IVA_ADDR(addr);
2080         if (am > VTD_MAMV) {
2081             error_report_once("%s: address mask overflow: 0x%" PRIx64,
2082                               __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
2083             iaig = 0;
2084             break;
2085         }
2086         iaig = VTD_TLB_PSI_FLUSH_A;
2087         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2088         break;
2089 
2090     default:
2091         error_report_once("%s: invalid granularity: 0x%" PRIx64,
2092                           __func__, val);
2093         iaig = 0;
2094     }
2095     return iaig;
2096 }
2097 
2098 static void vtd_fetch_inv_desc(IntelIOMMUState *s);
2099 
2100 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
2101 {
2102     return s->qi_enabled && (s->iq_tail == s->iq_head) &&
2103            (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
2104 }
2105 
2106 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
2107 {
2108     uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
2109 
2110     trace_vtd_inv_qi_enable(en);
2111 
2112     if (en) {
2113         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
2114         /* 2^(x+8) entries */
2115         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
2116         s->qi_enabled = true;
2117         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
2118         /* Ok - report back to driver */
2119         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
2120 
2121         if (s->iq_tail != 0) {
2122             /*
2123              * This is a spec violation but Windows guests are known to set up
2124              * Queued Invalidation this way so we allow the write and process
2125              * Invalidation Descriptors right away.
2126              */
2127             trace_vtd_warn_invalid_qi_tail(s->iq_tail);
2128             if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2129                 vtd_fetch_inv_desc(s);
2130             }
2131         }
2132     } else {
2133         if (vtd_queued_inv_disable_check(s)) {
2134             /* disable Queued Invalidation */
2135             vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
2136             s->iq_head = 0;
2137             s->qi_enabled = false;
2138             /* Ok - report back to driver */
2139             vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
2140         } else {
2141             error_report_once("%s: detected improper state when disable QI "
2142                               "(head=0x%x, tail=0x%x, last_type=%d)",
2143                               __func__,
2144                               s->iq_head, s->iq_tail, s->iq_last_desc_type);
2145         }
2146     }
2147 }
2148 
2149 /* Set Root Table Pointer */
2150 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
2151 {
2152     vtd_root_table_setup(s);
2153     /* Ok - report back to driver */
2154     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
2155     vtd_reset_caches(s);
2156     vtd_address_space_refresh_all(s);
2157 }
2158 
2159 /* Set Interrupt Remap Table Pointer */
2160 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
2161 {
2162     vtd_interrupt_remap_table_setup(s);
2163     /* Ok - report back to driver */
2164     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
2165 }
2166 
2167 /* Handle Translation Enable/Disable */
2168 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
2169 {
2170     if (s->dmar_enabled == en) {
2171         return;
2172     }
2173 
2174     trace_vtd_dmar_enable(en);
2175 
2176     if (en) {
2177         s->dmar_enabled = true;
2178         /* Ok - report back to driver */
2179         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
2180     } else {
2181         s->dmar_enabled = false;
2182 
2183         /* Clear the index of Fault Recording Register */
2184         s->next_frcd_reg = 0;
2185         /* Ok - report back to driver */
2186         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
2187     }
2188 
2189     vtd_reset_caches(s);
2190     vtd_address_space_refresh_all(s);
2191 }
2192 
2193 /* Handle Interrupt Remap Enable/Disable */
2194 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
2195 {
2196     trace_vtd_ir_enable(en);
2197 
2198     if (en) {
2199         s->intr_enabled = true;
2200         /* Ok - report back to driver */
2201         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
2202     } else {
2203         s->intr_enabled = false;
2204         /* Ok - report back to driver */
2205         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
2206     }
2207 }
2208 
2209 /* Handle write to Global Command Register */
2210 static void vtd_handle_gcmd_write(IntelIOMMUState *s)
2211 {
2212     uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
2213     uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
2214     uint32_t changed = status ^ val;
2215 
2216     trace_vtd_reg_write_gcmd(status, val);
2217     if (changed & VTD_GCMD_TE) {
2218         /* Translation enable/disable */
2219         vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
2220     }
2221     if (val & VTD_GCMD_SRTP) {
2222         /* Set/update the root-table pointer */
2223         vtd_handle_gcmd_srtp(s);
2224     }
2225     if (changed & VTD_GCMD_QIE) {
2226         /* Queued Invalidation Enable */
2227         vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
2228     }
2229     if (val & VTD_GCMD_SIRTP) {
2230         /* Set/update the interrupt remapping root-table pointer */
2231         vtd_handle_gcmd_sirtp(s);
2232     }
2233     if (changed & VTD_GCMD_IRE) {
2234         /* Interrupt remap enable/disable */
2235         vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
2236     }
2237 }
2238 
2239 /* Handle write to Context Command Register */
2240 static void vtd_handle_ccmd_write(IntelIOMMUState *s)
2241 {
2242     uint64_t ret;
2243     uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
2244 
2245     /* Context-cache invalidation request */
2246     if (val & VTD_CCMD_ICC) {
2247         if (s->qi_enabled) {
2248             error_report_once("Queued Invalidation enabled, "
2249                               "should not use register-based invalidation");
2250             return;
2251         }
2252         ret = vtd_context_cache_invalidate(s, val);
2253         /* Invalidation completed. Change something to show */
2254         vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
2255         ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
2256                                       ret);
2257     }
2258 }
2259 
2260 /* Handle write to IOTLB Invalidation Register */
2261 static void vtd_handle_iotlb_write(IntelIOMMUState *s)
2262 {
2263     uint64_t ret;
2264     uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
2265 
2266     /* IOTLB invalidation request */
2267     if (val & VTD_TLB_IVT) {
2268         if (s->qi_enabled) {
2269             error_report_once("Queued Invalidation enabled, "
2270                               "should not use register-based invalidation");
2271             return;
2272         }
2273         ret = vtd_iotlb_flush(s, val);
2274         /* Invalidation completed. Change something to show */
2275         vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
2276         ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
2277                                       VTD_TLB_FLUSH_GRANU_MASK_A, ret);
2278     }
2279 }
2280 
2281 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
2282 static bool vtd_get_inv_desc(IntelIOMMUState *s,
2283                              VTDInvDesc *inv_desc)
2284 {
2285     dma_addr_t base_addr = s->iq;
2286     uint32_t offset = s->iq_head;
2287     uint32_t dw = s->iq_dw ? 32 : 16;
2288     dma_addr_t addr = base_addr + offset * dw;
2289 
2290     if (dma_memory_read(&address_space_memory, addr,
2291                         inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) {
2292         error_report_once("Read INV DESC failed.");
2293         return false;
2294     }
2295     inv_desc->lo = le64_to_cpu(inv_desc->lo);
2296     inv_desc->hi = le64_to_cpu(inv_desc->hi);
2297     if (dw == 32) {
2298         inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
2299         inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
2300     }
2301     return true;
2302 }
2303 
2304 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2305 {
2306     if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
2307         (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
2308         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2309                           " (reserved nonzero)", __func__, inv_desc->hi,
2310                           inv_desc->lo);
2311         return false;
2312     }
2313     if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
2314         /* Status Write */
2315         uint32_t status_data = (uint32_t)(inv_desc->lo >>
2316                                VTD_INV_DESC_WAIT_DATA_SHIFT);
2317 
2318         assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
2319 
2320         /* FIXME: need to be masked with HAW? */
2321         dma_addr_t status_addr = inv_desc->hi;
2322         trace_vtd_inv_desc_wait_sw(status_addr, status_data);
2323         status_data = cpu_to_le32(status_data);
2324         if (dma_memory_write(&address_space_memory, status_addr,
2325                              &status_data, sizeof(status_data),
2326                              MEMTXATTRS_UNSPECIFIED)) {
2327             trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
2328             return false;
2329         }
2330     } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
2331         /* Interrupt flag */
2332         vtd_generate_completion_event(s);
2333     } else {
2334         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2335                           " (unknown type)", __func__, inv_desc->hi,
2336                           inv_desc->lo);
2337         return false;
2338     }
2339     return true;
2340 }
2341 
2342 static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
2343                                            VTDInvDesc *inv_desc)
2344 {
2345     uint16_t sid, fmask;
2346 
2347     if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
2348         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2349                           " (reserved nonzero)", __func__, inv_desc->hi,
2350                           inv_desc->lo);
2351         return false;
2352     }
2353     switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
2354     case VTD_INV_DESC_CC_DOMAIN:
2355         trace_vtd_inv_desc_cc_domain(
2356             (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
2357         /* Fall through */
2358     case VTD_INV_DESC_CC_GLOBAL:
2359         vtd_context_global_invalidate(s);
2360         break;
2361 
2362     case VTD_INV_DESC_CC_DEVICE:
2363         sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
2364         fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
2365         vtd_context_device_invalidate(s, sid, fmask);
2366         break;
2367 
2368     default:
2369         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2370                           " (invalid type)", __func__, inv_desc->hi,
2371                           inv_desc->lo);
2372         return false;
2373     }
2374     return true;
2375 }
2376 
2377 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2378 {
2379     uint16_t domain_id;
2380     uint8_t am;
2381     hwaddr addr;
2382 
2383     if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
2384         (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
2385         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2386                           ", lo=0x%"PRIx64" (reserved bits unzero)",
2387                           __func__, inv_desc->hi, inv_desc->lo);
2388         return false;
2389     }
2390 
2391     switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
2392     case VTD_INV_DESC_IOTLB_GLOBAL:
2393         vtd_iotlb_global_invalidate(s);
2394         break;
2395 
2396     case VTD_INV_DESC_IOTLB_DOMAIN:
2397         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2398         vtd_iotlb_domain_invalidate(s, domain_id);
2399         break;
2400 
2401     case VTD_INV_DESC_IOTLB_PAGE:
2402         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2403         addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
2404         am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
2405         if (am > VTD_MAMV) {
2406             error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2407                               ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)",
2408                               __func__, inv_desc->hi, inv_desc->lo,
2409                               am, (unsigned)VTD_MAMV);
2410             return false;
2411         }
2412         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2413         break;
2414 
2415     default:
2416         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2417                           ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
2418                           __func__, inv_desc->hi, inv_desc->lo,
2419                           inv_desc->lo & VTD_INV_DESC_IOTLB_G);
2420         return false;
2421     }
2422     return true;
2423 }
2424 
2425 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
2426                                      VTDInvDesc *inv_desc)
2427 {
2428     trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
2429                            inv_desc->iec.index,
2430                            inv_desc->iec.index_mask);
2431 
2432     vtd_iec_notify_all(s, !inv_desc->iec.granularity,
2433                        inv_desc->iec.index,
2434                        inv_desc->iec.index_mask);
2435     return true;
2436 }
2437 
2438 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
2439                                           VTDInvDesc *inv_desc)
2440 {
2441     VTDAddressSpace *vtd_dev_as;
2442     IOMMUTLBEvent event;
2443     struct VTDBus *vtd_bus;
2444     hwaddr addr;
2445     uint64_t sz;
2446     uint16_t sid;
2447     uint8_t devfn;
2448     bool size;
2449     uint8_t bus_num;
2450 
2451     addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
2452     sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
2453     devfn = sid & 0xff;
2454     bus_num = sid >> 8;
2455     size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
2456 
2457     if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
2458         (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
2459         error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
2460                           ", lo=%"PRIx64" (reserved nonzero)", __func__,
2461                           inv_desc->hi, inv_desc->lo);
2462         return false;
2463     }
2464 
2465     vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2466     if (!vtd_bus) {
2467         goto done;
2468     }
2469 
2470     vtd_dev_as = vtd_bus->dev_as[devfn];
2471     if (!vtd_dev_as) {
2472         goto done;
2473     }
2474 
2475     /* According to ATS spec table 2.4:
2476      * S = 0, bits 15:12 = xxxx     range size: 4K
2477      * S = 1, bits 15:12 = xxx0     range size: 8K
2478      * S = 1, bits 15:12 = xx01     range size: 16K
2479      * S = 1, bits 15:12 = x011     range size: 32K
2480      * S = 1, bits 15:12 = 0111     range size: 64K
2481      * ...
2482      */
2483     if (size) {
2484         sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2485         addr &= ~(sz - 1);
2486     } else {
2487         sz = VTD_PAGE_SIZE;
2488     }
2489 
2490     event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
2491     event.entry.target_as = &vtd_dev_as->as;
2492     event.entry.addr_mask = sz - 1;
2493     event.entry.iova = addr;
2494     event.entry.perm = IOMMU_NONE;
2495     event.entry.translated_addr = 0;
2496     memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
2497 
2498 done:
2499     return true;
2500 }
2501 
2502 static bool vtd_process_inv_desc(IntelIOMMUState *s)
2503 {
2504     VTDInvDesc inv_desc;
2505     uint8_t desc_type;
2506 
2507     trace_vtd_inv_qi_head(s->iq_head);
2508     if (!vtd_get_inv_desc(s, &inv_desc)) {
2509         s->iq_last_desc_type = VTD_INV_DESC_NONE;
2510         return false;
2511     }
2512 
2513     desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2514     /* FIXME: should update at first or at last? */
2515     s->iq_last_desc_type = desc_type;
2516 
2517     switch (desc_type) {
2518     case VTD_INV_DESC_CC:
2519         trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2520         if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2521             return false;
2522         }
2523         break;
2524 
2525     case VTD_INV_DESC_IOTLB:
2526         trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2527         if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2528             return false;
2529         }
2530         break;
2531 
2532     /*
2533      * TODO: the entity of below two cases will be implemented in future series.
2534      * To make guest (which integrates scalable mode support patch set in
2535      * iommu driver) work, just return true is enough so far.
2536      */
2537     case VTD_INV_DESC_PC:
2538         break;
2539 
2540     case VTD_INV_DESC_PIOTLB:
2541         break;
2542 
2543     case VTD_INV_DESC_WAIT:
2544         trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2545         if (!vtd_process_wait_desc(s, &inv_desc)) {
2546             return false;
2547         }
2548         break;
2549 
2550     case VTD_INV_DESC_IEC:
2551         trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2552         if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2553             return false;
2554         }
2555         break;
2556 
2557     case VTD_INV_DESC_DEVICE:
2558         trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2559         if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2560             return false;
2561         }
2562         break;
2563 
2564     default:
2565         error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
2566                           " (unknown type)", __func__, inv_desc.hi,
2567                           inv_desc.lo);
2568         return false;
2569     }
2570     s->iq_head++;
2571     if (s->iq_head == s->iq_size) {
2572         s->iq_head = 0;
2573     }
2574     return true;
2575 }
2576 
2577 /* Try to fetch and process more Invalidation Descriptors */
2578 static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2579 {
2580     int qi_shift;
2581 
2582     /* Refer to 10.4.23 of VT-d spec 3.0 */
2583     qi_shift = s->iq_dw ? VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4;
2584 
2585     trace_vtd_inv_qi_fetch();
2586 
2587     if (s->iq_tail >= s->iq_size) {
2588         /* Detects an invalid Tail pointer */
2589         error_report_once("%s: detected invalid QI tail "
2590                           "(tail=0x%x, size=0x%x)",
2591                           __func__, s->iq_tail, s->iq_size);
2592         vtd_handle_inv_queue_error(s);
2593         return;
2594     }
2595     while (s->iq_head != s->iq_tail) {
2596         if (!vtd_process_inv_desc(s)) {
2597             /* Invalidation Queue Errors */
2598             vtd_handle_inv_queue_error(s);
2599             break;
2600         }
2601         /* Must update the IQH_REG in time */
2602         vtd_set_quad_raw(s, DMAR_IQH_REG,
2603                          (((uint64_t)(s->iq_head)) << qi_shift) &
2604                          VTD_IQH_QH_MASK);
2605     }
2606 }
2607 
2608 /* Handle write to Invalidation Queue Tail Register */
2609 static void vtd_handle_iqt_write(IntelIOMMUState *s)
2610 {
2611     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2612 
2613     if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
2614         error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
2615                           __func__, val);
2616         return;
2617     }
2618     s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
2619     trace_vtd_inv_qi_tail(s->iq_tail);
2620 
2621     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2622         /* Process Invalidation Queue here */
2623         vtd_fetch_inv_desc(s);
2624     }
2625 }
2626 
2627 static void vtd_handle_fsts_write(IntelIOMMUState *s)
2628 {
2629     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2630     uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2631     uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2632 
2633     if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2634         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2635         trace_vtd_fsts_clear_ip();
2636     }
2637     /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2638      * Descriptors if there are any when Queued Invalidation is enabled?
2639      */
2640 }
2641 
2642 static void vtd_handle_fectl_write(IntelIOMMUState *s)
2643 {
2644     uint32_t fectl_reg;
2645     /* FIXME: when software clears the IM field, check the IP field. But do we
2646      * need to compare the old value and the new value to conclude that
2647      * software clears the IM field? Or just check if the IM field is zero?
2648      */
2649     fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2650 
2651     trace_vtd_reg_write_fectl(fectl_reg);
2652 
2653     if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2654         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2655         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2656     }
2657 }
2658 
2659 static void vtd_handle_ics_write(IntelIOMMUState *s)
2660 {
2661     uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2662     uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2663 
2664     if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2665         trace_vtd_reg_ics_clear_ip();
2666         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2667     }
2668 }
2669 
2670 static void vtd_handle_iectl_write(IntelIOMMUState *s)
2671 {
2672     uint32_t iectl_reg;
2673     /* FIXME: when software clears the IM field, check the IP field. But do we
2674      * need to compare the old value and the new value to conclude that
2675      * software clears the IM field? Or just check if the IM field is zero?
2676      */
2677     iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2678 
2679     trace_vtd_reg_write_iectl(iectl_reg);
2680 
2681     if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2682         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2683         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2684     }
2685 }
2686 
2687 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2688 {
2689     IntelIOMMUState *s = opaque;
2690     uint64_t val;
2691 
2692     trace_vtd_reg_read(addr, size);
2693 
2694     if (addr + size > DMAR_REG_SIZE) {
2695         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2696                           " size=0x%x", __func__, addr, size);
2697         return (uint64_t)-1;
2698     }
2699 
2700     switch (addr) {
2701     /* Root Table Address Register, 64-bit */
2702     case DMAR_RTADDR_REG:
2703         val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
2704         if (size == 4) {
2705             val = val & ((1ULL << 32) - 1);
2706         }
2707         break;
2708 
2709     case DMAR_RTADDR_REG_HI:
2710         assert(size == 4);
2711         val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32;
2712         break;
2713 
2714     /* Invalidation Queue Address Register, 64-bit */
2715     case DMAR_IQA_REG:
2716         val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2717         if (size == 4) {
2718             val = val & ((1ULL << 32) - 1);
2719         }
2720         break;
2721 
2722     case DMAR_IQA_REG_HI:
2723         assert(size == 4);
2724         val = s->iq >> 32;
2725         break;
2726 
2727     default:
2728         if (size == 4) {
2729             val = vtd_get_long(s, addr);
2730         } else {
2731             val = vtd_get_quad(s, addr);
2732         }
2733     }
2734 
2735     return val;
2736 }
2737 
2738 static void vtd_mem_write(void *opaque, hwaddr addr,
2739                           uint64_t val, unsigned size)
2740 {
2741     IntelIOMMUState *s = opaque;
2742 
2743     trace_vtd_reg_write(addr, size, val);
2744 
2745     if (addr + size > DMAR_REG_SIZE) {
2746         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2747                           " size=0x%x", __func__, addr, size);
2748         return;
2749     }
2750 
2751     switch (addr) {
2752     /* Global Command Register, 32-bit */
2753     case DMAR_GCMD_REG:
2754         vtd_set_long(s, addr, val);
2755         vtd_handle_gcmd_write(s);
2756         break;
2757 
2758     /* Context Command Register, 64-bit */
2759     case DMAR_CCMD_REG:
2760         if (size == 4) {
2761             vtd_set_long(s, addr, val);
2762         } else {
2763             vtd_set_quad(s, addr, val);
2764             vtd_handle_ccmd_write(s);
2765         }
2766         break;
2767 
2768     case DMAR_CCMD_REG_HI:
2769         assert(size == 4);
2770         vtd_set_long(s, addr, val);
2771         vtd_handle_ccmd_write(s);
2772         break;
2773 
2774     /* IOTLB Invalidation Register, 64-bit */
2775     case DMAR_IOTLB_REG:
2776         if (size == 4) {
2777             vtd_set_long(s, addr, val);
2778         } else {
2779             vtd_set_quad(s, addr, val);
2780             vtd_handle_iotlb_write(s);
2781         }
2782         break;
2783 
2784     case DMAR_IOTLB_REG_HI:
2785         assert(size == 4);
2786         vtd_set_long(s, addr, val);
2787         vtd_handle_iotlb_write(s);
2788         break;
2789 
2790     /* Invalidate Address Register, 64-bit */
2791     case DMAR_IVA_REG:
2792         if (size == 4) {
2793             vtd_set_long(s, addr, val);
2794         } else {
2795             vtd_set_quad(s, addr, val);
2796         }
2797         break;
2798 
2799     case DMAR_IVA_REG_HI:
2800         assert(size == 4);
2801         vtd_set_long(s, addr, val);
2802         break;
2803 
2804     /* Fault Status Register, 32-bit */
2805     case DMAR_FSTS_REG:
2806         assert(size == 4);
2807         vtd_set_long(s, addr, val);
2808         vtd_handle_fsts_write(s);
2809         break;
2810 
2811     /* Fault Event Control Register, 32-bit */
2812     case DMAR_FECTL_REG:
2813         assert(size == 4);
2814         vtd_set_long(s, addr, val);
2815         vtd_handle_fectl_write(s);
2816         break;
2817 
2818     /* Fault Event Data Register, 32-bit */
2819     case DMAR_FEDATA_REG:
2820         assert(size == 4);
2821         vtd_set_long(s, addr, val);
2822         break;
2823 
2824     /* Fault Event Address Register, 32-bit */
2825     case DMAR_FEADDR_REG:
2826         if (size == 4) {
2827             vtd_set_long(s, addr, val);
2828         } else {
2829             /*
2830              * While the register is 32-bit only, some guests (Xen...) write to
2831              * it with 64-bit.
2832              */
2833             vtd_set_quad(s, addr, val);
2834         }
2835         break;
2836 
2837     /* Fault Event Upper Address Register, 32-bit */
2838     case DMAR_FEUADDR_REG:
2839         assert(size == 4);
2840         vtd_set_long(s, addr, val);
2841         break;
2842 
2843     /* Protected Memory Enable Register, 32-bit */
2844     case DMAR_PMEN_REG:
2845         assert(size == 4);
2846         vtd_set_long(s, addr, val);
2847         break;
2848 
2849     /* Root Table Address Register, 64-bit */
2850     case DMAR_RTADDR_REG:
2851         if (size == 4) {
2852             vtd_set_long(s, addr, val);
2853         } else {
2854             vtd_set_quad(s, addr, val);
2855         }
2856         break;
2857 
2858     case DMAR_RTADDR_REG_HI:
2859         assert(size == 4);
2860         vtd_set_long(s, addr, val);
2861         break;
2862 
2863     /* Invalidation Queue Tail Register, 64-bit */
2864     case DMAR_IQT_REG:
2865         if (size == 4) {
2866             vtd_set_long(s, addr, val);
2867         } else {
2868             vtd_set_quad(s, addr, val);
2869         }
2870         vtd_handle_iqt_write(s);
2871         break;
2872 
2873     case DMAR_IQT_REG_HI:
2874         assert(size == 4);
2875         vtd_set_long(s, addr, val);
2876         /* 19:63 of IQT_REG is RsvdZ, do nothing here */
2877         break;
2878 
2879     /* Invalidation Queue Address Register, 64-bit */
2880     case DMAR_IQA_REG:
2881         if (size == 4) {
2882             vtd_set_long(s, addr, val);
2883         } else {
2884             vtd_set_quad(s, addr, val);
2885         }
2886         if (s->ecap & VTD_ECAP_SMTS &&
2887             val & VTD_IQA_DW_MASK) {
2888             s->iq_dw = true;
2889         } else {
2890             s->iq_dw = false;
2891         }
2892         break;
2893 
2894     case DMAR_IQA_REG_HI:
2895         assert(size == 4);
2896         vtd_set_long(s, addr, val);
2897         break;
2898 
2899     /* Invalidation Completion Status Register, 32-bit */
2900     case DMAR_ICS_REG:
2901         assert(size == 4);
2902         vtd_set_long(s, addr, val);
2903         vtd_handle_ics_write(s);
2904         break;
2905 
2906     /* Invalidation Event Control Register, 32-bit */
2907     case DMAR_IECTL_REG:
2908         assert(size == 4);
2909         vtd_set_long(s, addr, val);
2910         vtd_handle_iectl_write(s);
2911         break;
2912 
2913     /* Invalidation Event Data Register, 32-bit */
2914     case DMAR_IEDATA_REG:
2915         assert(size == 4);
2916         vtd_set_long(s, addr, val);
2917         break;
2918 
2919     /* Invalidation Event Address Register, 32-bit */
2920     case DMAR_IEADDR_REG:
2921         assert(size == 4);
2922         vtd_set_long(s, addr, val);
2923         break;
2924 
2925     /* Invalidation Event Upper Address Register, 32-bit */
2926     case DMAR_IEUADDR_REG:
2927         assert(size == 4);
2928         vtd_set_long(s, addr, val);
2929         break;
2930 
2931     /* Fault Recording Registers, 128-bit */
2932     case DMAR_FRCD_REG_0_0:
2933         if (size == 4) {
2934             vtd_set_long(s, addr, val);
2935         } else {
2936             vtd_set_quad(s, addr, val);
2937         }
2938         break;
2939 
2940     case DMAR_FRCD_REG_0_1:
2941         assert(size == 4);
2942         vtd_set_long(s, addr, val);
2943         break;
2944 
2945     case DMAR_FRCD_REG_0_2:
2946         if (size == 4) {
2947             vtd_set_long(s, addr, val);
2948         } else {
2949             vtd_set_quad(s, addr, val);
2950             /* May clear bit 127 (Fault), update PPF */
2951             vtd_update_fsts_ppf(s);
2952         }
2953         break;
2954 
2955     case DMAR_FRCD_REG_0_3:
2956         assert(size == 4);
2957         vtd_set_long(s, addr, val);
2958         /* May clear bit 127 (Fault), update PPF */
2959         vtd_update_fsts_ppf(s);
2960         break;
2961 
2962     case DMAR_IRTA_REG:
2963         if (size == 4) {
2964             vtd_set_long(s, addr, val);
2965         } else {
2966             vtd_set_quad(s, addr, val);
2967         }
2968         break;
2969 
2970     case DMAR_IRTA_REG_HI:
2971         assert(size == 4);
2972         vtd_set_long(s, addr, val);
2973         break;
2974 
2975     default:
2976         if (size == 4) {
2977             vtd_set_long(s, addr, val);
2978         } else {
2979             vtd_set_quad(s, addr, val);
2980         }
2981     }
2982 }
2983 
2984 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
2985                                          IOMMUAccessFlags flag, int iommu_idx)
2986 {
2987     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2988     IntelIOMMUState *s = vtd_as->iommu_state;
2989     IOMMUTLBEntry iotlb = {
2990         /* We'll fill in the rest later. */
2991         .target_as = &address_space_memory,
2992     };
2993     bool success;
2994 
2995     if (likely(s->dmar_enabled)) {
2996         success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
2997                                          addr, flag & IOMMU_WO, &iotlb);
2998     } else {
2999         /* DMAR disabled, passthrough, use 4k-page*/
3000         iotlb.iova = addr & VTD_PAGE_MASK_4K;
3001         iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
3002         iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
3003         iotlb.perm = IOMMU_RW;
3004         success = true;
3005     }
3006 
3007     if (likely(success)) {
3008         trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
3009                                  VTD_PCI_SLOT(vtd_as->devfn),
3010                                  VTD_PCI_FUNC(vtd_as->devfn),
3011                                  iotlb.iova, iotlb.translated_addr,
3012                                  iotlb.addr_mask);
3013     } else {
3014         error_report_once("%s: detected translation failure "
3015                           "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
3016                           __func__, pci_bus_num(vtd_as->bus),
3017                           VTD_PCI_SLOT(vtd_as->devfn),
3018                           VTD_PCI_FUNC(vtd_as->devfn),
3019                           addr);
3020     }
3021 
3022     return iotlb;
3023 }
3024 
3025 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
3026                                          IOMMUNotifierFlag old,
3027                                          IOMMUNotifierFlag new,
3028                                          Error **errp)
3029 {
3030     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
3031     IntelIOMMUState *s = vtd_as->iommu_state;
3032 
3033     /* TODO: add support for VFIO and vhost users */
3034     if (s->snoop_control) {
3035         error_setg_errno(errp, -ENOTSUP,
3036                          "Snoop Control with vhost or VFIO is not supported");
3037         return -ENOTSUP;
3038     }
3039 
3040     /* Update per-address-space notifier flags */
3041     vtd_as->notifier_flags = new;
3042 
3043     if (old == IOMMU_NOTIFIER_NONE) {
3044         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
3045     } else if (new == IOMMU_NOTIFIER_NONE) {
3046         QLIST_REMOVE(vtd_as, next);
3047     }
3048     return 0;
3049 }
3050 
3051 static int vtd_post_load(void *opaque, int version_id)
3052 {
3053     IntelIOMMUState *iommu = opaque;
3054 
3055     /*
3056      * Memory regions are dynamically turned on/off depending on
3057      * context entry configurations from the guest. After migration,
3058      * we need to make sure the memory regions are still correct.
3059      */
3060     vtd_switch_address_space_all(iommu);
3061 
3062     /*
3063      * We don't need to migrate the root_scalable because we can
3064      * simply do the calculation after the loading is complete.  We
3065      * can actually do similar things with root, dmar_enabled, etc.
3066      * however since we've had them already so we'd better keep them
3067      * for compatibility of migration.
3068      */
3069     vtd_update_scalable_state(iommu);
3070 
3071     return 0;
3072 }
3073 
/*
 * Migration description of the Intel IOMMU device state.
 *
 * vtd_post_load() runs after the fields below have been loaded and
 * recomputes derived state.
 *
 * NOTE(review): the order and width of the entries presumably define the
 * migration stream layout, so entries should not be reordered or resized
 * without a version bump -- confirm against the VMState documentation.
 */
static const VMStateDescription vtd_vmstate = {
    .name = "iommu-intel",
    .version_id = 1,
    .minimum_version_id = 1,
    .priority = MIG_PRI_IOMMU,
    .post_load = vtd_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(root, IntelIOMMUState),
        VMSTATE_UINT64(intr_root, IntelIOMMUState),
        VMSTATE_UINT64(iq, IntelIOMMUState),
        VMSTATE_UINT32(intr_size, IntelIOMMUState),
        VMSTATE_UINT16(iq_head, IntelIOMMUState),
        VMSTATE_UINT16(iq_tail, IntelIOMMUState),
        VMSTATE_UINT16(iq_size, IntelIOMMUState),
        VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
        /* Raw register file, DMAR_REG_SIZE bytes */
        VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
        VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
        /* One-byte placeholder kept where the obsolete field used to be */
        VMSTATE_UNUSED(1),      /* bool root_extended is obsolete by VT-d */
        VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
        VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
        VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
        VMSTATE_BOOL(intr_eime, IntelIOMMUState),
        VMSTATE_END_OF_LIST()
    }
};
3099 
3100 static const MemoryRegionOps vtd_mem_ops = {
3101     .read = vtd_mem_read,
3102     .write = vtd_mem_write,
3103     .endianness = DEVICE_LITTLE_ENDIAN,
3104     .impl = {
3105         .min_access_size = 4,
3106         .max_access_size = 8,
3107     },
3108     .valid = {
3109         .min_access_size = 4,
3110         .max_access_size = 8,
3111     },
3112 };
3113 
3114 static Property vtd_properties[] = {
3115     DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
3116     DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
3117                             ON_OFF_AUTO_AUTO),
3118     DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
3119     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
3120                       VTD_HOST_ADDRESS_WIDTH),
3121     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
3122     DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
3123     DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
3124     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
3125     DEFINE_PROP_END_OF_LIST(),
3126 };
3127 
3128 /* Read IRTE entry with specific index */
3129 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
3130                         VTD_IR_TableEntry *entry, uint16_t sid)
3131 {
3132     static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
3133         {0xffff, 0xfffb, 0xfff9, 0xfff8};
3134     dma_addr_t addr = 0x00;
3135     uint16_t mask, source_id;
3136     uint8_t bus, bus_max, bus_min;
3137 
3138     if (index >= iommu->intr_size) {
3139         error_report_once("%s: index too large: ind=0x%x",
3140                           __func__, index);
3141         return -VTD_FR_IR_INDEX_OVER;
3142     }
3143 
3144     addr = iommu->intr_root + index * sizeof(*entry);
3145     if (dma_memory_read(&address_space_memory, addr,
3146                         entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) {
3147         error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
3148                           __func__, index, addr);
3149         return -VTD_FR_IR_ROOT_INVAL;
3150     }
3151 
3152     trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
3153                           le64_to_cpu(entry->data[0]));
3154 
3155     if (!entry->irte.present) {
3156         error_report_once("%s: detected non-present IRTE "
3157                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3158                           __func__, index, le64_to_cpu(entry->data[1]),
3159                           le64_to_cpu(entry->data[0]));
3160         return -VTD_FR_IR_ENTRY_P;
3161     }
3162 
3163     if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
3164         entry->irte.__reserved_2) {
3165         error_report_once("%s: detected non-zero reserved IRTE "
3166                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3167                           __func__, index, le64_to_cpu(entry->data[1]),
3168                           le64_to_cpu(entry->data[0]));
3169         return -VTD_FR_IR_IRTE_RSVD;
3170     }
3171 
3172     if (sid != X86_IOMMU_SID_INVALID) {
3173         /* Validate IRTE SID */
3174         source_id = le32_to_cpu(entry->irte.source_id);
3175         switch (entry->irte.sid_vtype) {
3176         case VTD_SVT_NONE:
3177             break;
3178 
3179         case VTD_SVT_ALL:
3180             mask = vtd_svt_mask[entry->irte.sid_q];
3181             if ((source_id & mask) != (sid & mask)) {
3182                 error_report_once("%s: invalid IRTE SID "
3183                                   "(index=%u, sid=%u, source_id=%u)",
3184                                   __func__, index, sid, source_id);
3185                 return -VTD_FR_IR_SID_ERR;
3186             }
3187             break;
3188 
3189         case VTD_SVT_BUS:
3190             bus_max = source_id >> 8;
3191             bus_min = source_id & 0xff;
3192             bus = sid >> 8;
3193             if (bus > bus_max || bus < bus_min) {
3194                 error_report_once("%s: invalid SVT_BUS "
3195                                   "(index=%u, bus=%u, min=%u, max=%u)",
3196                                   __func__, index, bus, bus_min, bus_max);
3197                 return -VTD_FR_IR_SID_ERR;
3198             }
3199             break;
3200 
3201         default:
3202             error_report_once("%s: detected invalid IRTE SVT "
3203                               "(index=%u, type=%d)", __func__,
3204                               index, entry->irte.sid_vtype);
3205             /* Take this as verification failure. */
3206             return -VTD_FR_IR_SID_ERR;
3207         }
3208     }
3209 
3210     return 0;
3211 }
3212 
3213 /* Fetch IRQ information of specific IR index */
3214 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
3215                              X86IOMMUIrq *irq, uint16_t sid)
3216 {
3217     VTD_IR_TableEntry irte = {};
3218     int ret = 0;
3219 
3220     ret = vtd_irte_get(iommu, index, &irte, sid);
3221     if (ret) {
3222         return ret;
3223     }
3224 
3225     irq->trigger_mode = irte.irte.trigger_mode;
3226     irq->vector = irte.irte.vector;
3227     irq->delivery_mode = irte.irte.delivery_mode;
3228     irq->dest = le32_to_cpu(irte.irte.dest_id);
3229     if (!iommu->intr_eime) {
3230 #define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
3231 #define  VTD_IR_APIC_DEST_SHIFT        (8)
3232         irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
3233             VTD_IR_APIC_DEST_SHIFT;
3234     }
3235     irq->dest_mode = irte.irte.dest_mode;
3236     irq->redir_hint = irte.irte.redir_hint;
3237 
3238     trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
3239                        irq->delivery_mode, irq->dest, irq->dest_mode);
3240 
3241     return 0;
3242 }
3243 
3244 /* Interrupt remapping for MSI/MSI-X entry */
3245 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
3246                                    MSIMessage *origin,
3247                                    MSIMessage *translated,
3248                                    uint16_t sid)
3249 {
3250     int ret = 0;
3251     VTD_IR_MSIAddress addr;
3252     uint16_t index;
3253     X86IOMMUIrq irq = {};
3254 
3255     assert(origin && translated);
3256 
3257     trace_vtd_ir_remap_msi_req(origin->address, origin->data);
3258 
3259     if (!iommu || !iommu->intr_enabled) {
3260         memcpy(translated, origin, sizeof(*origin));
3261         goto out;
3262     }
3263 
3264     if (origin->address & VTD_MSI_ADDR_HI_MASK) {
3265         error_report_once("%s: MSI address high 32 bits non-zero detected: "
3266                           "address=0x%" PRIx64, __func__, origin->address);
3267         return -VTD_FR_IR_REQ_RSVD;
3268     }
3269 
3270     addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
3271     if (addr.addr.__head != 0xfee) {
3272         error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32,
3273                           __func__, addr.data);
3274         return -VTD_FR_IR_REQ_RSVD;
3275     }
3276 
3277     /* This is compatible mode. */
3278     if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
3279         memcpy(translated, origin, sizeof(*origin));
3280         goto out;
3281     }
3282 
3283     index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
3284 
3285 #define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
3286 #define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
3287 
3288     if (addr.addr.sub_valid) {
3289         /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
3290         index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
3291     }
3292 
3293     ret = vtd_remap_irq_get(iommu, index, &irq, sid);
3294     if (ret) {
3295         return ret;
3296     }
3297 
3298     if (addr.addr.sub_valid) {
3299         trace_vtd_ir_remap_type("MSI");
3300         if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
3301             error_report_once("%s: invalid IR MSI "
3302                               "(sid=%u, address=0x%" PRIx64
3303                               ", data=0x%" PRIx32 ")",
3304                               __func__, sid, origin->address, origin->data);
3305             return -VTD_FR_IR_REQ_RSVD;
3306         }
3307     } else {
3308         uint8_t vector = origin->data & 0xff;
3309         uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
3310 
3311         trace_vtd_ir_remap_type("IOAPIC");
3312         /* IOAPIC entry vector should be aligned with IRTE vector
3313          * (see vt-d spec 5.1.5.1). */
3314         if (vector != irq.vector) {
3315             trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
3316         }
3317 
3318         /* The Trigger Mode field must match the Trigger Mode in the IRTE.
3319          * (see vt-d spec 5.1.5.1). */
3320         if (trigger_mode != irq.trigger_mode) {
3321             trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
3322                                       irq.trigger_mode);
3323         }
3324     }
3325 
3326     /*
3327      * We'd better keep the last two bits, assuming that guest OS
3328      * might modify it. Keep it does not hurt after all.
3329      */
3330     irq.msi_addr_last_bits = addr.addr.__not_care;
3331 
3332     /* Translate X86IOMMUIrq to MSI message */
3333     x86_iommu_irq_to_msi_message(&irq, translated);
3334 
3335 out:
3336     trace_vtd_ir_remap_msi(origin->address, origin->data,
3337                            translated->address, translated->data);
3338     return 0;
3339 }
3340 
3341 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
3342                          MSIMessage *dst, uint16_t sid)
3343 {
3344     return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
3345                                    src, dst, sid);
3346 }
3347 
/*
 * Read handler of the interrupt remapping region.
 *
 * Reads always succeed; none of the arguments is used and *data is not
 * written by this function.
 */
static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
                                   uint64_t *data, unsigned size,
                                   MemTxAttrs attrs)
{
    return MEMTX_OK;
}
3354 
3355 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
3356                                     uint64_t value, unsigned size,
3357                                     MemTxAttrs attrs)
3358 {
3359     int ret = 0;
3360     MSIMessage from = {}, to = {};
3361     uint16_t sid = X86_IOMMU_SID_INVALID;
3362 
3363     from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
3364     from.data = (uint32_t) value;
3365 
3366     if (!attrs.unspecified) {
3367         /* We have explicit Source ID */
3368         sid = attrs.requester_id;
3369     }
3370 
3371     ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
3372     if (ret) {
3373         /* TODO: report error */
3374         /* Drop this interrupt */
3375         return MEMTX_ERROR;
3376     }
3377 
3378     apic_get_class()->send_msi(&to);
3379 
3380     return MEMTX_OK;
3381 }
3382 
3383 static const MemoryRegionOps vtd_mem_ir_ops = {
3384     .read_with_attrs = vtd_mem_ir_read,
3385     .write_with_attrs = vtd_mem_ir_write,
3386     .endianness = DEVICE_LITTLE_ENDIAN,
3387     .impl = {
3388         .min_access_size = 4,
3389         .max_access_size = 4,
3390     },
3391     .valid = {
3392         .min_access_size = 4,
3393         .max_access_size = 4,
3394     },
3395 };
3396 
3397 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
3398 {
3399     uintptr_t key = (uintptr_t)bus;
3400     VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
3401     VTDAddressSpace *vtd_dev_as;
3402     char name[128];
3403 
3404     if (!vtd_bus) {
3405         uintptr_t *new_key = g_malloc(sizeof(*new_key));
3406         *new_key = (uintptr_t)bus;
3407         /* No corresponding free() */
3408         vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
3409                             PCI_DEVFN_MAX);
3410         vtd_bus->bus = bus;
3411         g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
3412     }
3413 
3414     vtd_dev_as = vtd_bus->dev_as[devfn];
3415 
3416     if (!vtd_dev_as) {
3417         snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
3418                  PCI_FUNC(devfn));
3419         vtd_bus->dev_as[devfn] = vtd_dev_as = g_new0(VTDAddressSpace, 1);
3420 
3421         vtd_dev_as->bus = bus;
3422         vtd_dev_as->devfn = (uint8_t)devfn;
3423         vtd_dev_as->iommu_state = s;
3424         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
3425         vtd_dev_as->iova_tree = iova_tree_new();
3426 
3427         memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
3428         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
3429 
3430         /*
3431          * Build the DMAR-disabled container with aliases to the
3432          * shared MRs.  Note that aliasing to a shared memory region
3433          * could help the memory API to detect same FlatViews so we
3434          * can have devices to share the same FlatView when DMAR is
3435          * disabled (either by not providing "intel_iommu=on" or with
3436          * "iommu=pt").  It will greatly reduce the total number of
3437          * FlatViews of the system hence VM runs faster.
3438          */
3439         memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
3440                                  "vtd-nodmar", &s->mr_nodmar, 0,
3441                                  memory_region_size(&s->mr_nodmar));
3442 
3443         /*
3444          * Build the per-device DMAR-enabled container.
3445          *
3446          * TODO: currently we have per-device IOMMU memory region only
3447          * because we have per-device IOMMU notifiers for devices.  If
3448          * one day we can abstract the IOMMU notifiers out of the
3449          * memory regions then we can also share the same memory
3450          * region here just like what we've done above with the nodmar
3451          * region.
3452          */
3453         strcat(name, "-dmar");
3454         memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
3455                                  TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
3456                                  name, UINT64_MAX);
3457         memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
3458                                  &s->mr_ir, 0, memory_region_size(&s->mr_ir));
3459         memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
3460                                             VTD_INTERRUPT_ADDR_FIRST,
3461                                             &vtd_dev_as->iommu_ir, 1);
3462 
3463         /*
3464          * Hook both the containers under the root container, we
3465          * switch between DMAR & noDMAR by enable/disable
3466          * corresponding sub-containers
3467          */
3468         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3469                                             MEMORY_REGION(&vtd_dev_as->iommu),
3470                                             0);
3471         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3472                                             &vtd_dev_as->nodmar, 0);
3473 
3474         vtd_switch_address_space(vtd_dev_as);
3475     }
3476     return vtd_dev_as;
3477 }
3478 
3479 /* Unmap the whole range in the notifier's scope. */
3480 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
3481 {
3482     hwaddr size, remain;
3483     hwaddr start = n->start;
3484     hwaddr end = n->end;
3485     IntelIOMMUState *s = as->iommu_state;
3486     DMAMap map;
3487 
3488     /*
3489      * Note: all the codes in this function has a assumption that IOVA
3490      * bits are no more than VTD_MGAW bits (which is restricted by
3491      * VT-d spec), otherwise we need to consider overflow of 64 bits.
3492      */
3493 
3494     if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
3495         /*
3496          * Don't need to unmap regions that is bigger than the whole
3497          * VT-d supported address space size
3498          */
3499         end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
3500     }
3501 
3502     assert(start <= end);
3503     size = remain = end - start + 1;
3504 
3505     while (remain >= VTD_PAGE_SIZE) {
3506         IOMMUTLBEvent event;
3507         uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits);
3508         uint64_t size = mask + 1;
3509 
3510         assert(size);
3511 
3512         event.type = IOMMU_NOTIFIER_UNMAP;
3513         event.entry.iova = start;
3514         event.entry.addr_mask = mask;
3515         event.entry.target_as = &address_space_memory;
3516         event.entry.perm = IOMMU_NONE;
3517         /* This field is meaningless for unmap */
3518         event.entry.translated_addr = 0;
3519 
3520         memory_region_notify_iommu_one(n, &event);
3521 
3522         start += size;
3523         remain -= size;
3524     }
3525 
3526     assert(!remain);
3527 
3528     trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3529                              VTD_PCI_SLOT(as->devfn),
3530                              VTD_PCI_FUNC(as->devfn),
3531                              n->start, size);
3532 
3533     map.iova = n->start;
3534     map.size = size;
3535     iova_tree_remove(as->iova_tree, &map);
3536 }
3537 
3538 static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3539 {
3540     VTDAddressSpace *vtd_as;
3541     IOMMUNotifier *n;
3542 
3543     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3544         IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3545             vtd_address_space_unmap(vtd_as, n);
3546         }
3547     }
3548 }
3549 
3550 static void vtd_address_space_refresh_all(IntelIOMMUState *s)
3551 {
3552     vtd_address_space_unmap_all(s);
3553     vtd_switch_address_space_all(s);
3554 }
3555 
3556 static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
3557 {
3558     memory_region_notify_iommu_one(private, event);
3559     return 0;
3560 }
3561 
3562 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3563 {
3564     VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3565     IntelIOMMUState *s = vtd_as->iommu_state;
3566     uint8_t bus_n = pci_bus_num(vtd_as->bus);
3567     VTDContextEntry ce;
3568 
3569     /*
3570      * The replay can be triggered by either a invalidation or a newly
3571      * created entry. No matter what, we release existing mappings
3572      * (it means flushing caches for UNMAP-only registers).
3573      */
3574     vtd_address_space_unmap(vtd_as, n);
3575 
3576     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3577         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
3578                                   "legacy mode",
3579                                   bus_n, PCI_SLOT(vtd_as->devfn),
3580                                   PCI_FUNC(vtd_as->devfn),
3581                                   vtd_get_domain_id(s, &ce),
3582                                   ce.hi, ce.lo);
3583         if (vtd_as_has_map_notifier(vtd_as)) {
3584             /* This is required only for MAP typed notifiers */
3585             vtd_page_walk_info info = {
3586                 .hook_fn = vtd_replay_hook,
3587                 .private = (void *)n,
3588                 .notify_unmap = false,
3589                 .aw = s->aw_bits,
3590                 .as = vtd_as,
3591                 .domain_id = vtd_get_domain_id(s, &ce),
3592             };
3593 
3594             vtd_page_walk(s, &ce, 0, ~0ULL, &info);
3595         }
3596     } else {
3597         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3598                                     PCI_FUNC(vtd_as->devfn));
3599     }
3600 
3601     return;
3602 }
3603 
3604 /* Do the initialization. It will also be called when reset, so pay
3605  * attention when adding new initialization stuff.
3606  */
3607 static void vtd_init(IntelIOMMUState *s)
3608 {
3609     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3610 
3611     memset(s->csr, 0, DMAR_REG_SIZE);
3612     memset(s->wmask, 0, DMAR_REG_SIZE);
3613     memset(s->w1cmask, 0, DMAR_REG_SIZE);
3614     memset(s->womask, 0, DMAR_REG_SIZE);
3615 
3616     s->root = 0;
3617     s->root_scalable = false;
3618     s->dmar_enabled = false;
3619     s->intr_enabled = false;
3620     s->iq_head = 0;
3621     s->iq_tail = 0;
3622     s->iq = 0;
3623     s->iq_size = 0;
3624     s->qi_enabled = false;
3625     s->iq_last_desc_type = VTD_INV_DESC_NONE;
3626     s->iq_dw = false;
3627     s->next_frcd_reg = 0;
3628     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3629              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3630              VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
3631     if (s->dma_drain) {
3632         s->cap |= VTD_CAP_DRAIN;
3633     }
3634     if (s->aw_bits == VTD_HOST_AW_48BIT) {
3635         s->cap |= VTD_CAP_SAGAW_48bit;
3636     }
3637     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3638 
3639     /*
3640      * Rsvd field masks for spte
3641      */
3642     vtd_spte_rsvd[0] = ~0ULL;
3643     vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
3644                                                   x86_iommu->dt_supported);
3645     vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3646     vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3647     vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3648 
3649     vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
3650                                                          x86_iommu->dt_supported);
3651     vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
3652                                                          x86_iommu->dt_supported);
3653 
3654     if (s->scalable_mode || s->snoop_control) {
3655         vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
3656         vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
3657         vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
3658     }
3659 
3660     if (x86_iommu_ir_supported(x86_iommu)) {
3661         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3662         if (s->intr_eim == ON_OFF_AUTO_ON) {
3663             s->ecap |= VTD_ECAP_EIM;
3664         }
3665         assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3666     }
3667 
3668     if (x86_iommu->dt_supported) {
3669         s->ecap |= VTD_ECAP_DT;
3670     }
3671 
3672     if (x86_iommu->pt_supported) {
3673         s->ecap |= VTD_ECAP_PT;
3674     }
3675 
3676     if (s->caching_mode) {
3677         s->cap |= VTD_CAP_CM;
3678     }
3679 
3680     /* TODO: read cap/ecap from host to decide which cap to be exposed. */
3681     if (s->scalable_mode) {
3682         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
3683     }
3684 
3685     if (s->snoop_control) {
3686         s->ecap |= VTD_ECAP_SC;
3687     }
3688 
3689     vtd_reset_caches(s);
3690 
3691     /* Define registers with default values and bit semantics */
3692     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3693     vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3694     vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3695     vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3696     vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3697     vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3698     vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
3699     vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3700     vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3701 
3702     /* Advanced Fault Logging not supported */
3703     vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3704     vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3705     vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3706     vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3707 
3708     /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3709      * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3710      */
3711     vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3712 
3713     /* Treated as RO for implementations that PLMR and PHMR fields reported
3714      * as Clear in the CAP_REG.
3715      * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3716      */
3717     vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3718 
3719     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3720     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3721     vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
3722     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3723     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3724     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3725     vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
3726     /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */
3727     vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3728 
3729     /* IOTLB registers */
3730     vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0);
3731     vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3732     vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3733 
3734     /* Fault Recording Registers, 128-bit */
3735     vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3736     vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3737 
3738     /*
3739      * Interrupt remapping registers.
3740      */
3741     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3742 }
3743 
3744 /* Should not reset address_spaces when reset because devices will still use
3745  * the address space they got at first (won't ask the bus again).
3746  */
3747 static void vtd_reset(DeviceState *dev)
3748 {
3749     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3750 
3751     vtd_init(s);
3752     vtd_address_space_refresh_all(s);
3753 }
3754 
3755 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3756 {
3757     IntelIOMMUState *s = opaque;
3758     VTDAddressSpace *vtd_as;
3759 
3760     assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3761 
3762     vtd_as = vtd_find_add_as(s, bus, devfn);
3763     return &vtd_as->as;
3764 }
3765 
3766 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3767 {
3768     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3769 
3770     if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
3771         error_setg(errp, "eim=on cannot be selected without intremap=on");
3772         return false;
3773     }
3774 
3775     if (s->intr_eim == ON_OFF_AUTO_AUTO) {
3776         s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
3777                       && x86_iommu_ir_supported(x86_iommu) ?
3778                                               ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
3779     }
3780     if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3781         if (!kvm_irqchip_in_kernel()) {
3782             error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3783             return false;
3784         }
3785         if (!kvm_enable_x2apic()) {
3786             error_setg(errp, "eim=on requires support on the KVM side"
3787                              "(X2APIC_API, first shipped in v4.7)");
3788             return false;
3789         }
3790     }
3791 
3792     /* Currently only address widths supported are 39 and 48 bits */
3793     if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3794         (s->aw_bits != VTD_HOST_AW_48BIT)) {
3795         error_setg(errp, "Supported values for aw-bits are: %d, %d",
3796                    VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3797         return false;
3798     }
3799 
3800     if (s->scalable_mode && !s->dma_drain) {
3801         error_setg(errp, "Need to set dma_drain for scalable mode");
3802         return false;
3803     }
3804 
3805     return true;
3806 }
3807 
3808 static int vtd_machine_done_notify_one(Object *child, void *unused)
3809 {
3810     IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
3811 
3812     /*
3813      * We hard-coded here because vfio-pci is the only special case
3814      * here.  Let's be more elegant in the future when we can, but so
3815      * far there seems to be no better way.
3816      */
3817     if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
3818         vtd_panic_require_caching_mode();
3819     }
3820 
3821     return 0;
3822 }
3823 
3824 static void vtd_machine_done_hook(Notifier *notifier, void *unused)
3825 {
3826     object_child_foreach_recursive(object_get_root(),
3827                                    vtd_machine_done_notify_one, NULL);
3828 }
3829 
3830 static Notifier vtd_machine_done_notify = {
3831     .notify = vtd_machine_done_hook,
3832 };
3833 
3834 static void vtd_realize(DeviceState *dev, Error **errp)
3835 {
3836     MachineState *ms = MACHINE(qdev_get_machine());
3837     PCMachineState *pcms = PC_MACHINE(ms);
3838     X86MachineState *x86ms = X86_MACHINE(ms);
3839     PCIBus *bus = pcms->bus;
3840     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3841 
3842     if (!vtd_decide_config(s, errp)) {
3843         return;
3844     }
3845 
3846     QLIST_INIT(&s->vtd_as_with_notifiers);
3847     qemu_mutex_init(&s->iommu_lock);
3848     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3849     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3850                           "intel_iommu", DMAR_REG_SIZE);
3851 
3852     /* Create the shared memory regions by all devices */
3853     memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
3854                        UINT64_MAX);
3855     memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
3856                           s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
3857     memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
3858                              "vtd-sys-alias", get_system_memory(), 0,
3859                              memory_region_size(get_system_memory()));
3860     memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
3861                                         &s->mr_sys_alias, 0);
3862     memory_region_add_subregion_overlap(&s->mr_nodmar,
3863                                         VTD_INTERRUPT_ADDR_FIRST,
3864                                         &s->mr_ir, 1);
3865 
3866     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3867     /* No corresponding destroy */
3868     s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3869                                      g_free, g_free);
3870     s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3871                                               g_free, g_free);
3872     vtd_init(s);
3873     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3874     pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3875     /* Pseudo address space under root PCI bus. */
3876     x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3877     qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
3878 }
3879 
3880 static void vtd_class_init(ObjectClass *klass, void *data)
3881 {
3882     DeviceClass *dc = DEVICE_CLASS(klass);
3883     X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
3884 
3885     dc->reset = vtd_reset;
3886     dc->vmsd = &vtd_vmstate;
3887     device_class_set_props(dc, vtd_properties);
3888     dc->hotpluggable = false;
3889     x86_class->realize = vtd_realize;
3890     x86_class->int_remap = vtd_int_remap;
3891     /* Supported by the pc-q35-* machine types */
3892     dc->user_creatable = true;
3893     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3894     dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
3895 }
3896 
3897 static const TypeInfo vtd_info = {
3898     .name          = TYPE_INTEL_IOMMU_DEVICE,
3899     .parent        = TYPE_X86_IOMMU_DEVICE,
3900     .instance_size = sizeof(IntelIOMMUState),
3901     .class_init    = vtd_class_init,
3902 };
3903 
3904 static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
3905                                                      void *data)
3906 {
3907     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3908 
3909     imrc->translate = vtd_iommu_translate;
3910     imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3911     imrc->replay = vtd_iommu_replay;
3912 }
3913 
3914 static const TypeInfo vtd_iommu_memory_region_info = {
3915     .parent = TYPE_IOMMU_MEMORY_REGION,
3916     .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3917     .class_init = vtd_iommu_memory_region_class_init,
3918 };
3919 
3920 static void vtd_register_types(void)
3921 {
3922     type_register_static(&vtd_info);
3923     type_register_static(&vtd_iommu_memory_region_info);
3924 }
3925 
3926 type_init(vtd_register_types)
3927