/* xref: /openbmc/qemu/hw/i386/intel_iommu.c (revision e068b57d) */
/*
 * QEMU emulation of an Intel IOMMU (VT-d)
 *   (DMA Remapping device)
 *
 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qapi/error.h"
#include "hw/sysbus.h"
#include "intel_iommu_internal.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_bus.h"
#include "hw/qdev-properties.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/x86-iommu.h"
#include "hw/pci-host/q35.h"
#include "sysemu/kvm.h"
#include "sysemu/dma.h"
#include "sysemu/sysemu.h"
#include "hw/i386/apic_internal.h"
#include "kvm/kvm_i386.h"
#include "migration/vmstate.h"
#include "trace.h"

/* context entry operations */
#define VTD_CE_GET_RID2PASID(ce) \
    ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
    ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)

/* pe operations */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
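/*
 * Convert a translation return code into a recorded fault: negative
 * return values encode VT-d fault reasons, and qualified faults are
 * suppressed when the fault-processing-disable (fpd) flag is set for
 * the faulting context.  On any fault this jumps to the caller's
 * "error" label.
 */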
#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
    if (ret_fr) {                                                             \
        ret_fr = -ret_fr;                                                     \
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
            trace_vtd_fault_disabled();                                       \
        } else {                                                              \
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
        }                                                                     \
        goto error;                                                           \
    }                                                                         \
}

static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);

static void vtd_panic_require_caching_mode(void)
{
    error_report("We need to set caching-mode=on for intel-iommu to enable "
                 "device assignment with IOMMU protection.");
    exit(1);
}

static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                            uint64_t wmask, uint64_t w1cmask)
{
    stq_le_p(&s->csr[addr], val);
    stq_le_p(&s->wmask[addr], wmask);
    stq_le_p(&s->w1cmask[addr], w1cmask);
}

static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
{
    stq_le_p(&s->womask[addr], mask);
}

static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
                            uint32_t wmask, uint32_t w1cmask)
{
    stl_le_p(&s->csr[addr], val);
    stl_le_p(&s->wmask[addr], wmask);
    stl_le_p(&s->w1cmask[addr], w1cmask);
}

static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
{
    stl_le_p(&s->womask[addr], mask);
}

/* "External" get/set operations */
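/*
 * Guest-visible register writes honor two per-register masks set up by
 * vtd_define_quad()/vtd_define_long(): bits outside wmask are read-only
 * and keep their old value, while bits in w1cmask are "write 1 to
 * clear".  For example, writing 1 to a w1cmask bit that is currently
 * set clears it, and writing 0 leaves it untouched.
 */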
static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
{
    uint64_t oldval = ldq_le_p(&s->csr[addr]);
    uint64_t wmask = ldq_le_p(&s->wmask[addr]);
    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
    stq_le_p(&s->csr[addr],
             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
}

static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
{
    uint32_t oldval = ldl_le_p(&s->csr[addr]);
    uint32_t wmask = ldl_le_p(&s->wmask[addr]);
    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
    stl_le_p(&s->csr[addr],
             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
}

static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
{
    uint64_t val = ldq_le_p(&s->csr[addr]);
    uint64_t womask = ldq_le_p(&s->womask[addr]);
    return val & ~womask;
}

static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
{
    uint32_t val = ldl_le_p(&s->csr[addr]);
    uint32_t womask = ldl_le_p(&s->womask[addr]);
    return val & ~womask;
}

/* "Internal" get/set operations */
static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
{
    return ldq_le_p(&s->csr[addr]);
}

static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
{
    return ldl_le_p(&s->csr[addr]);
}

static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
{
    stq_le_p(&s->csr[addr], val);
}

static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
                                        uint32_t clear, uint32_t mask)
{
    uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
    stl_le_p(&s->csr[addr], new_val);
    return new_val;
}

static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
                                        uint64_t clear, uint64_t mask)
{
    uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
    stq_le_p(&s->csr[addr], new_val);
    return new_val;
}

static inline void vtd_iommu_lock(IntelIOMMUState *s)
{
    qemu_mutex_lock(&s->iommu_lock);
}

static inline void vtd_iommu_unlock(IntelIOMMUState *s)
{
    qemu_mutex_unlock(&s->iommu_lock);
}

static void vtd_update_scalable_state(IntelIOMMUState *s)
{
    uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);

    if (s->scalable_mode) {
        s->root_scalable = val & VTD_RTADDR_SMT;
    }
}

/* Whether the address space needs to notify new mappings */
static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
{
    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
}

/* GHashTable functions */
static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
{
    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
}

static guint vtd_uint64_hash(gconstpointer v)
{
    return (guint)*(const uint64_t *)v;
}

static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
                                          gpointer user_data)
{
    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
    uint16_t domain_id = *(uint16_t *)user_data;
    return entry->domain_id == domain_id;
}

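/*
 * Assuming the usual 9 bits per second-level table level
 * (VTD_SL_LEVEL_BITS == 9), the shift works out to 12 for level 1
 * (4K pages), 21 for level 2 (2M pages) and 30 for level 3 (1G pages),
 * and the matching page mask clears exactly those low bits.
 */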
/* The shift of an addr for a certain level of paging structure */
static inline uint32_t vtd_slpt_level_shift(uint32_t level)
{
    assert(level != 0);
    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
}

static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
{
    return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
}

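/*
 * A page-selective invalidation must drop two kinds of cached entries:
 * ones at or below the invalidation granularity (matched by masking
 * the entry's gfn with info->mask) and large-page entries that contain
 * the invalidated address (matched by masking the address with the
 * entry's own page mask).
 */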
static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
                                        gpointer user_data)
{
    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
    VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
    uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
    uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
    return (entry->domain_id == info->domain_id) &&
            (((entry->gfn & info->mask) == gfn) ||
             (entry->gfn == gfn_tlb));
}

/* Reset all the gen of VTDAddressSpace to zero and set the gen of
 * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
 */
static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
{
    VTDAddressSpace *vtd_as;
    VTDBus *vtd_bus;
    GHashTableIter bus_it;
    uint32_t devfn_it;

    trace_vtd_context_cache_reset();

    g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);

    while (g_hash_table_iter_next(&bus_it, NULL, (void **)&vtd_bus)) {
        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (!vtd_as) {
                continue;
            }
            vtd_as->context_cache_entry.context_cache_gen = 0;
        }
    }
    s->context_cache_gen = 1;
}

/* Must be called with IOMMU lock held. */
static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
{
    assert(s->iotlb);
    g_hash_table_remove_all(s->iotlb);
}

static void vtd_reset_iotlb(IntelIOMMUState *s)
{
    vtd_iommu_lock(s);
    vtd_reset_iotlb_locked(s);
    vtd_iommu_unlock(s);
}

static void vtd_reset_caches(IntelIOMMUState *s)
{
    vtd_iommu_lock(s);
    vtd_reset_iotlb_locked(s);
    vtd_reset_context_cache_locked(s);
    vtd_iommu_unlock(s);
}

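/*
 * An IOTLB key packs the gfn, the source-id and the page-table level
 * into a single 64-bit hash key (the exact bit positions come from
 * VTD_IOTLB_SID_SHIFT and VTD_IOTLB_LVL_SHIFT), so one hash table can
 * cache translations of every page size for every device.
 */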
static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
                                  uint32_t level)
{
    return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
           ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
}

static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
{
    return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
}

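/*
 * A lookup probes each page size in turn (4K first, then the large
 * page levels), since the cached entry for an address may live at any
 * level depending on the page size that mapped it.
 */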
/* Must be called with IOMMU lock held */
static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
                                       hwaddr addr)
{
    VTDIOTLBEntry *entry = NULL;
    uint64_t key;
    int level;

    for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
        key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
                                source_id, level);
        entry = g_hash_table_lookup(s->iotlb, &key);
        if (entry) {
            goto out;
        }
    }

out:
    return entry;
}

/* Must be called with IOMMU lock held */
static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
                             uint16_t domain_id, hwaddr addr, uint64_t slpte,
                             uint8_t access_flags, uint32_t level)
{
    VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
    uint64_t *key = g_malloc(sizeof(*key));
    uint64_t gfn = vtd_get_iotlb_gfn(addr, level);

    trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
    if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
        trace_vtd_iotlb_reset("iotlb exceeds size limit");
        vtd_reset_iotlb_locked(s);
    }

    entry->gfn = gfn;
    entry->domain_id = domain_id;
    entry->slpte = slpte;
    entry->access_flags = access_flags;
    entry->mask = vtd_slpt_level_page_mask(level);
    *key = vtd_get_iotlb_key(gfn, source_id, level);
    g_hash_table_replace(s->iotlb, key, entry);
}

/* Given the reg addr of both the message data and address, generate an
 * interrupt via MSI.
 */
static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
                                   hwaddr mesg_data_reg)
{
    MSIMessage msi;

    assert(mesg_data_reg < DMAR_REG_SIZE);
    assert(mesg_addr_reg < DMAR_REG_SIZE);

    msi.address = vtd_get_long_raw(s, mesg_addr_reg);
    msi.data = vtd_get_long_raw(s, mesg_data_reg);

    trace_vtd_irq_generate(msi.address, msi.data);

    apic_get_class()->send_msi(&msi);
}

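/*
 * Interrupt delivery for both fault and invalidation-completion events
 * follows the same Interrupt Pending (IP) / Interrupt Mask (IM)
 * handshake: IP is set first, and the MSI only goes out (and IP is
 * cleared) when IM is clear; a masked event stays pending for software
 * to pick up later.
 */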
/* Generate a fault event to software via MSI if conditions are met.
 * Notice that the value of FSTS_REG passed in should be the one
 * before any update.
 */
static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
{
    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
        pre_fsts & VTD_FSTS_IQE) {
        error_report_once("There are previous interrupt conditions "
                          "to be serviced by software, fault event "
                          "is not generated");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
        error_report_once("Interrupt Mask set, irq is not generated");
    } else {
        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
    }
}

/* Check if the Fault (F) field of the Fault Recording Register referenced by
 * @index is Set.
 */
static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
}

/* Update the PPF field of the Fault Status Register.
 * Should be called whenever the F field of any fault recording
 * register changes.
 */
static void vtd_update_fsts_ppf(IntelIOMMUState *s)
{
    uint32_t i;
    uint32_t ppf_mask = 0;

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        if (vtd_is_frcd_set(s, i)) {
            ppf_mask = VTD_FSTS_PPF;
            break;
        }
    }
    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
    trace_vtd_fsts_ppf(!!ppf_mask);
}

static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
    vtd_update_fsts_ppf(s);
}

/* Must not update the F field now, it should be done later */
static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
                            uint16_t source_id, hwaddr addr,
                            VTDFaultReason fault, bool is_write)
{
    uint64_t hi = 0, lo;
    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);

    assert(index < DMAR_FRCD_REG_NR);

    lo = VTD_FRCD_FI(addr);
    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
    if (!is_write) {
        hi |= VTD_FRCD_T;
    }
    vtd_set_quad_raw(s, frcd_reg_addr, lo);
    vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);

    trace_vtd_frr_new(index, hi, lo);
}

/* Try to collapse multiple pending faults from the same requester */
static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
{
    uint32_t i;
    uint64_t frcd_reg;
    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        frcd_reg = vtd_get_quad_raw(s, addr);
        if ((frcd_reg & VTD_FRCD_F) &&
            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
            return true;
        }
        addr += 16; /* 128-bit for each */
    }
    return false;
}

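/*
 * The recording policy below mirrors the VT-d spec: nothing is
 * recorded while Primary Fault Overflow (PFO) is set, back-to-back
 * faults from the same requester are collapsed, and when the next
 * recording register is still in use PFO gets set instead.  Only a
 * transition of PPF from 0 to 1 raises the fault event interrupt.
 */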
/* Log and report a DMAR (address translation) fault to software */
static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
                                  hwaddr addr, VTDFaultReason fault,
                                  bool is_write)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    assert(fault < VTD_FR_MAX);

    if (fault == VTD_FR_RESERVED_ERR) {
        /* This is not a normal fault reason case. Drop it. */
        return;
    }

    trace_vtd_dmar_fault(source_id, fault, addr, is_write);

    if (fsts_reg & VTD_FSTS_PFO) {
        error_report_once("New fault is not recorded due to "
                          "Primary Fault Overflow");
        return;
    }

    if (vtd_try_collapse_fault(s, source_id)) {
        error_report_once("New fault is not recorded due to "
                          "compression of faults");
        return;
    }

    if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
        error_report_once("Next Fault Recording Reg is used, "
                          "new fault is not recorded, set PFO field");
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
        return;
    }

    vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);

    if (fsts_reg & VTD_FSTS_PPF) {
        error_report_once("There are pending faults already, "
                          "fault event is not generated");
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
    } else {
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
                                VTD_FSTS_FRI(s->next_frcd_reg));
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
        /* This case actually causes the PPF to be Set,
         * so generate the fault event (interrupt).
         */
        vtd_generate_fault_event(s, fsts_reg);
    }
}

/* Handle Invalidation Queue Errors, i.e. error conditions of the
 * queued invalidation interface.
 */
static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
    vtd_generate_fault_event(s, fsts_reg);
}

/* Set the IWC field and try to generate an invalidation completion interrupt */
static void vtd_generate_completion_event(IntelIOMMUState *s)
{
    if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
        trace_vtd_inv_desc_wait_irq("One pending, skip current");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
    vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
    if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
        trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
                                    "new event not generated");
        return;
    } else {
        /* Generate the interrupt event */
        trace_vtd_inv_desc_wait_irq("Generating complete event");
        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
    }
}

static inline bool vtd_root_entry_present(IntelIOMMUState *s,
                                          VTDRootEntry *re,
                                          uint8_t devfn)
{
    if (s->root_scalable && devfn > UINT8_MAX / 2) {
        return re->hi & VTD_ROOT_ENTRY_P;
    }

    return re->lo & VTD_ROOT_ENTRY_P;
}

static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
                              VTDRootEntry *re)
{
    dma_addr_t addr;

    addr = s->root + index * sizeof(*re);
    if (dma_memory_read(&address_space_memory, addr,
                        re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) {
        re->lo = 0;
        return -VTD_FR_ROOT_TABLE_INV;
    }
    re->lo = le64_to_cpu(re->lo);
    re->hi = le64_to_cpu(re->hi);
    return 0;
}

static inline bool vtd_ce_present(VTDContextEntry *context)
{
    return context->lo & VTD_CONTEXT_ENTRY_P;
}

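/*
 * In Scalable Mode the 128-bit root entry is split in two halves:
 * devices with devfn < 128 are looked up through the low 64 bits and
 * the others through the high 64 bits, each half pointing at its own
 * context table with the larger scalable-mode entries.
 */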
static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
                                           VTDRootEntry *re,
                                           uint8_t index,
                                           VTDContextEntry *ce)
{
    dma_addr_t addr, ce_size;

    /* we have checked that root entry is present */
    ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
              VTD_CTX_ENTRY_LEGACY_SIZE;

    if (s->root_scalable && index > UINT8_MAX / 2) {
        index = index & (~VTD_DEVFN_CHECK_MASK);
        addr = re->hi & VTD_ROOT_ENTRY_CTP;
    } else {
        addr = re->lo & VTD_ROOT_ENTRY_CTP;
    }

    addr = addr + index * ce_size;
    if (dma_memory_read(&address_space_memory, addr,
                        ce, ce_size, MEMTXATTRS_UNSPECIFIED)) {
        return -VTD_FR_CONTEXT_TABLE_INV;
    }

    ce->lo = le64_to_cpu(ce->lo);
    ce->hi = le64_to_cpu(ce->hi);
    if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
        ce->val[2] = le64_to_cpu(ce->val[2]);
        ce->val[3] = le64_to_cpu(ce->val[3]);
    }
    return 0;
}

static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}

static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
}

/* Whether the pte indicates the address of the page frame */
static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
{
    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
}

/* Get the content of a slpte located in @base_addr[@index] */
static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
{
    uint64_t slpte;

    assert(index < VTD_SL_PT_ENTRY_NR);

    if (dma_memory_read(&address_space_memory,
                        base_addr + index * sizeof(slpte),
                        &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) {
        slpte = (uint64_t)-1;
        return slpte;
    }
    slpte = le64_to_cpu(slpte);
    return slpte;
}

/* Given an iova and the level of paging structure, return the offset
 * for the current level.
 */
static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
{
    return (iova >> vtd_slpt_level_shift(level)) &
            ((1ULL << VTD_SL_LEVEL_BITS) - 1);
}

/* Check Capability Register to see if the @level of page-table is supported */
static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
{
    return VTD_CAP_SAGAW_MASK & s->cap &
           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
}

/* Return true if check passed, otherwise false */
static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
                                     VTDPASIDEntry *pe)
{
    switch (VTD_PE_GET_TYPE(pe)) {
    case VTD_SM_PASID_ENTRY_FLT:
    case VTD_SM_PASID_ENTRY_SLT:
    case VTD_SM_PASID_ENTRY_NESTED:
        break;
    case VTD_SM_PASID_ENTRY_PT:
        if (!x86_iommu->pt_supported) {
            return false;
        }
        break;
    default:
        /* Unknown type */
        return false;
    }
    return true;
}

static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
{
    return pdire->val & 1;
}

/**
 * The caller of this function should check the present bit if it
 * wants to use the pdir entry for anything beyond the fpd bit check.
 */
static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
                                         uint32_t pasid,
                                         VTDPASIDDirEntry *pdire)
{
    uint32_t index;
    dma_addr_t addr, entry_size;

    index = VTD_PASID_DIR_INDEX(pasid);
    entry_size = VTD_PASID_DIR_ENTRY_SIZE;
    addr = pasid_dir_base + index * entry_size;
    if (dma_memory_read(&address_space_memory, addr,
                        pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    return 0;
}

static inline bool vtd_pe_present(VTDPASIDEntry *pe)
{
    return pe->val[0] & VTD_PASID_ENTRY_P;
}

static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
                                          uint32_t pasid,
                                          dma_addr_t addr,
                                          VTDPASIDEntry *pe)
{
    uint32_t index;
    dma_addr_t entry_size;
    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);

    index = VTD_PASID_TABLE_INDEX(pasid);
    entry_size = VTD_PASID_ENTRY_SIZE;
    addr = addr + index * entry_size;
    if (dma_memory_read(&address_space_memory, addr,
                        pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    /* Do translation type check */
    if (!vtd_pe_type_check(x86_iommu, pe)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    return 0;
}

/**
 * The caller of this function should check the present bit if it
 * wants to use the pasid entry for anything beyond the fpd bit check.
 */
static int vtd_get_pe_from_pdire(IntelIOMMUState *s,
                                 uint32_t pasid,
                                 VTDPASIDDirEntry *pdire,
                                 VTDPASIDEntry *pe)
{
    dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;

    return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe);
}

/**
 * This function gets a pasid entry from a specified pasid table
 * (which includes the dir table and leaf tables) with a specified
 * pasid.  Sanity checks are done to ensure that only a present pasid
 * entry is returned to the caller.
 */
static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
                                       dma_addr_t pasid_dir_base,
                                       uint32_t pasid,
                                       VTDPASIDEntry *pe)
{
    int ret;
    VTDPASIDDirEntry pdire;

    ret = vtd_get_pdire_from_pdir_table(pasid_dir_base,
                                        pasid, &pdire);
    if (ret) {
        return ret;
    }

    if (!vtd_pdire_present(&pdire)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
    if (ret) {
        return ret;
    }

    if (!vtd_pe_present(pe)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    return 0;
}

static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
                                      VTDContextEntry *ce,
                                      VTDPASIDEntry *pe)
{
    uint32_t pasid;
    dma_addr_t pasid_dir_base;
    int ret = 0;

    pasid = VTD_CE_GET_RID2PASID(ce);
    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
    ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);

    return ret;
}

static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
                                VTDContextEntry *ce,
                                bool *pe_fpd_set)
{
    int ret;
    uint32_t pasid;
    dma_addr_t pasid_dir_base;
    VTDPASIDDirEntry pdire;
    VTDPASIDEntry pe;

    pasid = VTD_CE_GET_RID2PASID(ce);
    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);

    /*
     * No present bit check since fpd is meaningful even
     * if the present bit is clear.
     */
    ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire);
    if (ret) {
        return ret;
    }

    if (pdire.val & VTD_PASID_DIR_FPD) {
        *pe_fpd_set = true;
        return 0;
    }

    if (!vtd_pdire_present(&pdire)) {
        return -VTD_FR_PASID_TABLE_INV;
    }

    /*
     * No present bit check since fpd is meaningful even
     * if the present bit is clear.
     */
    ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe);
    if (ret) {
        return ret;
    }

    if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
        *pe_fpd_set = true;
    }

    return 0;
}

/* Get the page-table level that hardware should use for the second-level
 * page-table walk from the Address Width field of context-entry.
 */
static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
{
    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
}

static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
                                   VTDContextEntry *ce)
{
    VTDPASIDEntry pe;

    if (s->root_scalable) {
        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
        return VTD_PE_GET_LEVEL(&pe);
    }

    return vtd_ce_get_level(ce);
}

static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
{
    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
}

static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
                                  VTDContextEntry *ce)
{
    VTDPASIDEntry pe;

    if (s->root_scalable) {
        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
        return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
    }

    return vtd_ce_get_agaw(ce);
}

static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_TT;
}

/* Only for Legacy Mode. Return true if check passed, otherwise false */
static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
                                     VTDContextEntry *ce)
{
    switch (vtd_ce_get_type(ce)) {
    case VTD_CONTEXT_TT_MULTI_LEVEL:
        /* Always supported */
        break;
    case VTD_CONTEXT_TT_DEV_IOTLB:
        if (!x86_iommu->dt_supported) {
            error_report_once("%s: DT specified but not supported", __func__);
            return false;
        }
        break;
    case VTD_CONTEXT_TT_PASS_THROUGH:
        if (!x86_iommu->pt_supported) {
            error_report_once("%s: PT specified but not supported", __func__);
            return false;
        }
        break;
    default:
        /* Unknown type */
        error_report_once("%s: unknown ce type: %"PRIu32, __func__,
                          vtd_ce_get_type(ce));
        return false;
    }
    return true;
}

static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
                                      VTDContextEntry *ce, uint8_t aw)
{
    uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
    return 1ULL << MIN(ce_agaw, aw);
}

/* Return true if IOVA passes range check, otherwise false. */
static inline bool vtd_iova_range_check(IntelIOMMUState *s,
                                        uint64_t iova, VTDContextEntry *ce,
                                        uint8_t aw)
{
    /*
     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
     * in CAP_REG and AW in context-entry.
     */
    return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
}

static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
                                          VTDContextEntry *ce)
{
    VTDPASIDEntry pe;

    if (s->root_scalable) {
        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
    }

    return vtd_ce_get_slpt_base(ce);
}

/*
 * Rsvd field masks for spte:
 *     vtd_spte_rsvd 4k pages
 *     vtd_spte_rsvd_large large pages
 */
static uint64_t vtd_spte_rsvd[5];
static uint64_t vtd_spte_rsvd_large[5];

static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
    uint64_t rsvd_mask = vtd_spte_rsvd[level];

    if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
        (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
        /* large page */
        rsvd_mask = vtd_spte_rsvd_large[level];
    }

    return slpte & rsvd_mask;
}

/* Find the VTD address space associated with a given bus number */
static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
{
    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
    GHashTableIter iter;

    if (vtd_bus) {
        return vtd_bus;
    }

    /*
     * Iterate over the registered buses to find the one which
     * currently holds this bus number and update the bus_num
     * lookup table.
     */
    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
        if (pci_bus_num(vtd_bus->bus) == bus_num) {
            s->vtd_as_by_bus_num[bus_num] = vtd_bus;
            return vtd_bus;
        }
    }

    return NULL;
}

/* Given the @iova, get the relevant @slptep. @slpte_level will be the last
 * level of the translation, which can be used for deciding the size of a
 * large page.
 */
static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
                             uint64_t iova, bool is_write,
                             uint64_t *slptep, uint32_t *slpte_level,
                             bool *reads, bool *writes, uint8_t aw_bits)
{
    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
    uint32_t level = vtd_get_iova_level(s, ce);
    uint32_t offset;
    uint64_t slpte;
    uint64_t access_right_check;

    if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
        error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
                          __func__, iova);
        return -VTD_FR_ADDR_BEYOND_MGAW;
    }

    /* FIXME: what is the Atomics request here? */
    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;

    while (true) {
        offset = vtd_iova_level_offset(iova, level);
        slpte = vtd_get_slpte(addr, offset);

        if (slpte == (uint64_t)-1) {
            error_report_once("%s: detected read error on DMAR slpte "
                              "(iova=0x%" PRIx64 ")", __func__, iova);
            if (level == vtd_get_iova_level(s, ce)) {
                /* Invalid programming of context-entry */
                return -VTD_FR_CONTEXT_ENTRY_INV;
            } else {
                return -VTD_FR_PAGING_ENTRY_INV;
            }
        }
        *reads = (*reads) && (slpte & VTD_SL_R);
        *writes = (*writes) && (slpte & VTD_SL_W);
        if (!(slpte & access_right_check)) {
            error_report_once("%s: detected slpte permission error "
                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
                              "slpte=0x%" PRIx64 ", write=%d)", __func__,
                              iova, level, slpte, is_write);
            return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
        }
        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
            error_report_once("%s: detected slpte reserved non-zero "
                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32
                              ", slpte=0x%" PRIx64 ")", __func__, iova,
                              level, slpte);
            return -VTD_FR_PAGING_ENTRY_RSVD;
        }

        if (vtd_is_last_slpte(slpte, level)) {
            *slptep = slpte;
            *slpte_level = level;
            return 0;
        }
        addr = vtd_get_slpte_addr(slpte, aw_bits);
        level--;
    }
}

typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);

/**
 * Constant information used during page walking
 *
 * @hook_fn: hook func to be called on a detected page
 * @private: private data to be passed into hook func
 * @notify_unmap: whether we should notify invalid entries
 * @as: VT-d address space of the device
 * @aw: maximum address width
 * @domain_id: domain ID of the page walk
 */
typedef struct {
    VTDAddressSpace *as;
    vtd_page_walk_hook hook_fn;
    void *private;
    bool notify_unmap;
    uint8_t aw;
    uint16_t domain_id;
} vtd_page_walk_info;

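/*
 * The per-device iova_tree shadows what has already been notified to
 * the listeners (e.g. a vhost or vfio backend).  vtd_page_walk_one()
 * consults it so that identical MAP events are not replayed, and a
 * changed translation is turned into an UNMAP followed by a MAP,
 * since backends generally cannot modify a mapping in place.
 */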
static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info)
{
    VTDAddressSpace *as = info->as;
    vtd_page_walk_hook hook_fn = info->hook_fn;
    void *private = info->private;
    IOMMUTLBEntry *entry = &event->entry;
    DMAMap target = {
        .iova = entry->iova,
        .size = entry->addr_mask,
        .translated_addr = entry->translated_addr,
        .perm = entry->perm,
    };
    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);

    if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) {
        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
        return 0;
    }

    assert(hook_fn);

    /* Update local IOVA mapped ranges */
    if (event->type == IOMMU_NOTIFIER_MAP) {
        if (mapped) {
            /* If it's exactly the same translation, skip */
            if (!memcmp(mapped, &target, sizeof(target))) {
                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
                                                 entry->translated_addr);
                return 0;
            } else {
                /*
                 * Translation changed.  Normally this should not
                 * happen, but it can happen with buggy guest OSes.
                 * Note that there will be a small window where we
                 * don't have any mapping at all, but that's the best
                 * effort we can do.  The ideal way to emulate this is
                 * to atomically modify the PTE to follow what has
                 * changed, but we can't.  One example is that the
                 * vfio driver only has VFIO_IOMMU_[UN]MAP_DMA but no
                 * interface to modify a mapping (meanwhile it seems
                 * meaningless to even provide one).  Anyway, let's
                 * mark this as a TODO in case one day we'll have
                 * a better solution.
                 */
                IOMMUAccessFlags cache_perm = entry->perm;
                int ret;

                /* Emulate an UNMAP */
                event->type = IOMMU_NOTIFIER_UNMAP;
                entry->perm = IOMMU_NONE;
                trace_vtd_page_walk_one(info->domain_id,
                                        entry->iova,
                                        entry->translated_addr,
                                        entry->addr_mask,
                                        entry->perm);
                ret = hook_fn(event, private);
                if (ret) {
                    return ret;
                }
                /* Drop any existing mapping */
                iova_tree_remove(as->iova_tree, &target);
                /* Recover the correct type */
                event->type = IOMMU_NOTIFIER_MAP;
                entry->perm = cache_perm;
            }
        }
        iova_tree_insert(as->iova_tree, &target);
    } else {
        if (!mapped) {
            /* Skip since we didn't map this range at all */
            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
            return 0;
        }
        iova_tree_remove(as->iova_tree, &target);
    }

    trace_vtd_page_walk_one(info->domain_id, entry->iova,
                            entry->translated_addr, entry->addr_mask,
                            entry->perm);
    return hook_fn(event, private);
}

/**
 * vtd_page_walk_level - walk over specific level for IOVA range
 *
 * @addr: base GPA addr to start the walk
 * @start: IOVA range start address
 * @end: IOVA range end address (start <= addr < end)
 * @read: whether parent level has read permission
 * @write: whether parent level has write permission
 * @info: constant information for the page walk
 */
static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
                               uint64_t end, uint32_t level, bool read,
                               bool write, vtd_page_walk_info *info)
{
    bool read_cur, write_cur, entry_valid;
    uint32_t offset;
    uint64_t slpte;
    uint64_t subpage_size, subpage_mask;
    IOMMUTLBEvent event;
    uint64_t iova = start;
    uint64_t iova_next;
    int ret = 0;

    trace_vtd_page_walk_level(addr, level, start, end);

    subpage_size = 1ULL << vtd_slpt_level_shift(level);
    subpage_mask = vtd_slpt_level_page_mask(level);

    while (iova < end) {
        iova_next = (iova & subpage_mask) + subpage_size;

        offset = vtd_iova_level_offset(iova, level);
        slpte = vtd_get_slpte(addr, offset);

        if (slpte == (uint64_t)-1) {
            trace_vtd_page_walk_skip_read(iova, iova_next);
            goto next;
        }

        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
            trace_vtd_page_walk_skip_reserve(iova, iova_next);
            goto next;
        }

        /* Permissions are stacked with parents' */
        read_cur = read && (slpte & VTD_SL_R);
        write_cur = write && (slpte & VTD_SL_W);

        /*
         * As long as we have either read/write permission, this is a
         * valid entry. The rule works for both page entries and page
         * table entries.
         */
        entry_valid = read_cur | write_cur;

        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
            /*
             * This is a valid PDE (or even bigger than PDE).  We need
             * to walk one further level.
             */
            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
                                      iova, MIN(iova_next, end), level - 1,
                                      read_cur, write_cur, info);
        } else {
            /*
             * This means we have reached either:
             *
             * (1) the real page entry (either a 4K page, or a huge page)
             * (2) a whole range that is invalid
             *
             * In either case, we send an IOTLB notification down.
             */
            event.entry.target_as = &address_space_memory;
            event.entry.iova = iova & subpage_mask;
            event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
            event.entry.addr_mask = ~subpage_mask;
            /* NOTE: this is only meaningful if entry_valid == true */
            event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
            event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
                                            IOMMU_NOTIFIER_UNMAP;
            ret = vtd_page_walk_one(&event, info);
        }

        if (ret < 0) {
            return ret;
        }

next:
        iova = iova_next;
    }

    return 0;
}

/**
 * vtd_page_walk - walk specific IOVA range, and call the hook
 *
 * @s: intel iommu state
 * @ce: context entry to walk upon
 * @start: IOVA address to start the walk
 * @end: IOVA range end address (start <= addr < end)
 * @info: page walking information struct
 */
static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
                         uint64_t start, uint64_t end,
                         vtd_page_walk_info *info)
{
    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
    uint32_t level = vtd_get_iova_level(s, ce);

    if (!vtd_iova_range_check(s, start, ce, info->aw)) {
        return -VTD_FR_ADDR_BEYOND_MGAW;
    }

    if (!vtd_iova_range_check(s, end, ce, info->aw)) {
        /* Fix end so that it reaches the maximum */
        end = vtd_iova_limit(s, ce, info->aw);
    }

    return vtd_page_walk_level(addr, start, end, level, true, true, info);
}

static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
                                          VTDRootEntry *re)
{
    /* Legacy Mode reserved bits check */
    if (!s->root_scalable &&
        (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) {
        goto rsvd_err;
    }

    /* Scalable Mode reserved bits check */
    if (s->root_scalable &&
        ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
         (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) {
        goto rsvd_err;
    }

    return 0;

rsvd_err:
    error_report_once("%s: invalid root entry: hi=0x%"PRIx64
                      ", lo=0x%"PRIx64,
                      __func__, re->hi, re->lo);
    return -VTD_FR_ROOT_ENTRY_RSVD;
}

static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
                                                    VTDContextEntry *ce)
{
    if (!s->root_scalable &&
        (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
         ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
        error_report_once("%s: invalid context entry: hi=%"PRIx64
                          ", lo=%"PRIx64" (reserved nonzero)",
                          __func__, ce->hi, ce->lo);
        return -VTD_FR_CONTEXT_ENTRY_RSVD;
    }

    if (s->root_scalable &&
        (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
         ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
         ce->val[2] ||
         ce->val[3])) {
        error_report_once("%s: invalid context entry: val[3]=%"PRIx64
                          ", val[2]=%"PRIx64
                          ", val[1]=%"PRIx64
                          ", val[0]=%"PRIx64" (reserved nonzero)",
                          __func__, ce->val[3], ce->val[2],
                          ce->val[1], ce->val[0]);
        return -VTD_FR_CONTEXT_ENTRY_RSVD;
    }

    return 0;
}

static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
                                  VTDContextEntry *ce)
{
    VTDPASIDEntry pe;

    /*
     * Make sure that in Scalable Mode a present context entry has a
     * valid rid2pasid setting: both a valid rid2pasid field and a
     * correspondingly valid pasid entry.
     */
    return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
}

/* Map a device to its corresponding domain (context-entry) */
static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
                                    uint8_t devfn, VTDContextEntry *ce)
{
    VTDRootEntry re;
    int ret_fr;
    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);

    ret_fr = vtd_get_root_entry(s, bus_num, &re);
    if (ret_fr) {
        return ret_fr;
    }

    if (!vtd_root_entry_present(s, &re, devfn)) {
        /* Not error - it's okay we don't have root entry. */
        trace_vtd_re_not_present(bus_num);
        return -VTD_FR_ROOT_ENTRY_P;
    }

    ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
    if (ret_fr) {
        return ret_fr;
    }

    ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
    if (ret_fr) {
        return ret_fr;
    }

    if (!vtd_ce_present(ce)) {
        /* Not error - it's okay we don't have context entry. */
        trace_vtd_ce_not_present(bus_num, devfn);
        return -VTD_FR_CONTEXT_ENTRY_P;
    }

    ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
    if (ret_fr) {
        return ret_fr;
    }

    /* Check if the programming of context-entry is valid */
    if (!s->root_scalable &&
        !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
        error_report_once("%s: invalid context entry: hi=%"PRIx64
                          ", lo=%"PRIx64" (level %d not supported)",
                          __func__, ce->hi, ce->lo,
                          vtd_ce_get_level(ce));
        return -VTD_FR_CONTEXT_ENTRY_INV;
    }

    if (!s->root_scalable) {
        /* Do translation type check */
        if (!vtd_ce_type_check(x86_iommu, ce)) {
            /* Errors dumped in vtd_ce_type_check() */
            return -VTD_FR_CONTEXT_ENTRY_INV;
        }
    } else {
        /*
         * Check that the programming of context-entry.rid2pasid and
         * the corresponding pasid setting is valid, which spares the
         * later helper functions from checking the pasid entry
         * fetching result.
         */
        ret_fr = vtd_ce_rid2pasid_check(s, ce);
        if (ret_fr) {
            return ret_fr;
        }
    }

    return 0;
}

static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event,
                                     void *private)
{
    memory_region_notify_iommu(private, 0, *event);
    return 0;
}

static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
                                  VTDContextEntry *ce)
{
    VTDPASIDEntry pe;

    if (s->root_scalable) {
        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
        return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
    }

    return VTD_CONTEXT_ENTRY_DID(ce->hi);
}

static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
                                            VTDContextEntry *ce,
                                            hwaddr addr, hwaddr size)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    vtd_page_walk_info info = {
        .hook_fn = vtd_sync_shadow_page_hook,
        .private = (void *)&vtd_as->iommu,
        .notify_unmap = true,
        .aw = s->aw_bits,
        .as = vtd_as,
        .domain_id = vtd_get_domain_id(s, ce),
    };

    return vtd_page_walk(s, ce, addr, addr + size, &info);
}

static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
{
    int ret;
    VTDContextEntry ce;
    IOMMUNotifier *n;

    if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) {
        return 0;
    }

    ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
                                   pci_bus_num(vtd_as->bus),
                                   vtd_as->devfn, &ce);
    if (ret) {
        if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
            /*
             * It's a valid scenario to have a context entry that is
             * not present.  For example, when a device is removed
             * from an existing domain then the context entry will be
             * zeroed by the guest before it is put into another
             * domain.  When this happens, instead of synchronizing
             * the shadow pages we should invalidate all existing
             * mappings and notify the backends.
             */
            IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
                vtd_address_space_unmap(vtd_as, n);
            }
            ret = 0;
        }
        return ret;
    }

    return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
}

/*
 * Check if the specific device is configured to bypass address
 * translation for DMA requests. In Scalable Mode, whether the
 * 1st-level or the 2nd-level translation is bypassed depends on
 * the PGTT setting.
 */
static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
{
    IntelIOMMUState *s;
    VTDContextEntry ce;
    VTDPASIDEntry pe;
    int ret;

    assert(as);

    s = as->iommu_state;
    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
                                   as->devfn, &ce);
    if (ret) {
        /*
         * Possibly failed to parse the context entry for some reason
         * (e.g., during init, or any guest configuration errors on
         * context entries). We should assume PT not enabled for
         * safety.
         */
        return false;
    }

    if (s->root_scalable) {
        ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
        if (ret) {
            error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
                              __func__, ret);
            return false;
        }
        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
    }

    return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
}

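/*
 * Each device is fronted by two overlapping memory regions: the
 * translating "iommu" region and a "nodmar" region that bypasses
 * translation.  Switching between DMAR and pass-through just enables
 * one region and disables the other, and the memory API requires the
 * BQL for that.
 */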
1554 /* Return whether the device is using IOMMU translation. */
1555 static bool vtd_switch_address_space(VTDAddressSpace *as)
1556 {
1557     bool use_iommu;
1558     /* Whether we need to take the BQL on our own */
1559     bool take_bql = !qemu_mutex_iothread_locked();
1560 
1561     assert(as);
1562 
1563     use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);
1564 
1565     trace_vtd_switch_address_space(pci_bus_num(as->bus),
1566                                    VTD_PCI_SLOT(as->devfn),
1567                                    VTD_PCI_FUNC(as->devfn),
1568                                    use_iommu);
1569 
1570     /*
1571      * It's possible that we reach here without the BQL, e.g., when
1572      * called from vtd_pt_enable_fast_path(). However, the memory
1573      * APIs need it, so make sure we already hold it, or take it here.
1574      */
1575     if (take_bql) {
1576         qemu_mutex_lock_iothread();
1577     }
1578 
1579     /* Turn one region off first, then turn the other on */
1580     if (use_iommu) {
1581         memory_region_set_enabled(&as->nodmar, false);
1582         memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1583     } else {
1584         memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1585         memory_region_set_enabled(&as->nodmar, true);
1586     }
1587 
1588     if (take_bql) {
1589         qemu_mutex_unlock_iothread();
1590     }
1591 
1592     return use_iommu;
1593 }
1594 
1595 static void vtd_switch_address_space_all(IntelIOMMUState *s)
1596 {
1597     GHashTableIter iter;
1598     VTDBus *vtd_bus;
1599     int i;
1600 
1601     g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1602     while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1603         for (i = 0; i < PCI_DEVFN_MAX; i++) {
1604             if (!vtd_bus->dev_as[i]) {
1605                 continue;
1606             }
1607             vtd_switch_address_space(vtd_bus->dev_as[i]);
1608         }
1609     }
1610 }
1611 
1612 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
1613 {
1614     return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
1615 }
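
/*
 * For illustration, the source id follows the PCI requester-id
 * layout, bus[15:8] | slot[7:3] | func[2:0].  For instance, bus 0x01
 * with devfn 0x10 (slot 2, function 0) composes to source id 0x0110.
 */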
1616 
1617 static const bool vtd_qualified_faults[] = {
1618     [VTD_FR_RESERVED] = false,
1619     [VTD_FR_ROOT_ENTRY_P] = false,
1620     [VTD_FR_CONTEXT_ENTRY_P] = true,
1621     [VTD_FR_CONTEXT_ENTRY_INV] = true,
1622     [VTD_FR_ADDR_BEYOND_MGAW] = true,
1623     [VTD_FR_WRITE] = true,
1624     [VTD_FR_READ] = true,
1625     [VTD_FR_PAGING_ENTRY_INV] = true,
1626     [VTD_FR_ROOT_TABLE_INV] = false,
1627     [VTD_FR_CONTEXT_TABLE_INV] = false,
1628     [VTD_FR_ROOT_ENTRY_RSVD] = false,
1629     [VTD_FR_PAGING_ENTRY_RSVD] = true,
1630     [VTD_FR_CONTEXT_ENTRY_TT] = true,
1631     [VTD_FR_PASID_TABLE_INV] = false,
1632     [VTD_FR_RESERVED_ERR] = false,
1633     [VTD_FR_MAX] = false,
1634 };
1635 
1636 /* Check whether a fault condition is "qualified", i.e. reported to software
1637  * only if the FPD field in the context-entry used to process the faulting
1638  * request is 0.
1639  */
1640 static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1641 {
1642     return vtd_qualified_faults[fault];
1643 }
1644 
1645 static inline bool vtd_is_interrupt_addr(hwaddr addr)
1646 {
1647     return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1648 }
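
/*
 * For reference, VTD_INTERRUPT_ADDR_FIRST/LAST bound the architectural
 * x86 MSI window (0xFEE00000-0xFEEFFFFF); DMA writes landing there are
 * interrupt requests rather than memory accesses, hence the assert in
 * vtd_do_iommu_translate() below.
 */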
1649 
1650 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1651 {
1652     VTDBus *vtd_bus;
1653     VTDAddressSpace *vtd_as;
1654     bool success = false;
1655 
1656     vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1657     if (!vtd_bus) {
1658         goto out;
1659     }
1660 
1661     vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1662     if (!vtd_as) {
1663         goto out;
1664     }
1665 
1666     if (vtd_switch_address_space(vtd_as) == false) {
1667         /* We successfully switched off the IOMMU region. */
1668         success = true;
1669     }
1670 
1671 out:
1672     trace_vtd_pt_enable_fast_path(source_id, success);
1673 }
1674 
1675 /* Map a device to its context-entry, then walk the paging structures to
1676  * perform an IOMMU translation.
1677  *
1678  * Called from RCU critical section.
1679  *
1680  * @bus_num: The bus number
1681  * @devfn: The devfn, which is the combined device and function number
1682  * @is_write: The access is a write operation
1683  * @entry: IOMMUTLBEntry that contains the addr to be translated and result
1684  *
1685  * Returns true if translation is successful, otherwise false.
1686  */
1687 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
1688                                    uint8_t devfn, hwaddr addr, bool is_write,
1689                                    IOMMUTLBEntry *entry)
1690 {
1691     IntelIOMMUState *s = vtd_as->iommu_state;
1692     VTDContextEntry ce;
1693     uint8_t bus_num = pci_bus_num(bus);
1694     VTDContextCacheEntry *cc_entry;
1695     uint64_t slpte, page_mask;
1696     uint32_t level;
1697     uint16_t source_id = vtd_make_source_id(bus_num, devfn);
1698     int ret_fr;
1699     bool is_fpd_set = false;
1700     bool reads = true;
1701     bool writes = true;
1702     uint8_t access_flags;
1703     VTDIOTLBEntry *iotlb_entry;
1704 
1705     /*
1706      * We have a standalone memory region for interrupt addresses;
1707      * we should never receive translation requests in this region.
1708      */
1709     assert(!vtd_is_interrupt_addr(addr));
1710 
1711     vtd_iommu_lock(s);
1712 
1713     cc_entry = &vtd_as->context_cache_entry;
1714 
1715     /* Try to fetch the slpte from the IOTLB */
1716     iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
1717     if (iotlb_entry) {
1718         trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
1719                                  iotlb_entry->domain_id);
1720         slpte = iotlb_entry->slpte;
1721         access_flags = iotlb_entry->access_flags;
1722         page_mask = iotlb_entry->mask;
1723         goto out;
1724     }
1725 
1726     /* Try to fetch context-entry from cache first */
1727     if (cc_entry->context_cache_gen == s->context_cache_gen) {
1728         trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
1729                                cc_entry->context_entry.lo,
1730                                cc_entry->context_cache_gen);
1731         ce = cc_entry->context_entry;
1732         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1733         if (!is_fpd_set && s->root_scalable) {
1734             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1735             VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1736         }
1737     } else {
1738         ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
1739         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1740         if (!ret_fr && !is_fpd_set && s->root_scalable) {
1741             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1742         }
1743         VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1744         /* Update context-cache */
1745         trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
1746                                   cc_entry->context_cache_gen,
1747                                   s->context_cache_gen);
1748         cc_entry->context_entry = ce;
1749         cc_entry->context_cache_gen = s->context_cache_gen;
1750     }
1751 
1752     /*
1753      * We don't need to translate for pass-through context entries.
1754      * Also skip IOTLB caching for PT devices.
1755      */
1756     if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
1757         entry->iova = addr & VTD_PAGE_MASK_4K;
1758         entry->translated_addr = entry->iova;
1759         entry->addr_mask = ~VTD_PAGE_MASK_4K;
1760         entry->perm = IOMMU_RW;
1761         trace_vtd_translate_pt(source_id, entry->iova);
1762 
1763         /*
1764          * Reaching here means that caching-mode is not enabled and
1765          * this is the first passthrough translation for the device,
1766          * so enable the fast path for passthrough.
1767          *
1768          * When passthrough is disabled again for the device, we can
1769          * catch it via the context entry invalidation, and then the
1770          * IOMMU region can be swapped back in.
1771          */
1772         vtd_pt_enable_fast_path(s, source_id);
1773         vtd_iommu_unlock(s);
1774         return true;
1775     }
1776 
1777     ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
1778                                &reads, &writes, s->aw_bits);
1779     VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1780 
1781     page_mask = vtd_slpt_level_page_mask(level);
1782     access_flags = IOMMU_ACCESS_FLAG(reads, writes);
1783     vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
1784                      access_flags, level);
1785 out:
1786     vtd_iommu_unlock(s);
1787     entry->iova = addr & page_mask;
1788     entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
1789     entry->addr_mask = ~page_mask;
1790     entry->perm = access_flags;
1791     return true;
1792 
1793 error:
1794     vtd_iommu_unlock(s);
1795     entry->iova = 0;
1796     entry->translated_addr = 0;
1797     entry->addr_mask = 0;
1798     entry->perm = IOMMU_NONE;
1799     return false;
1800 }
1801 
1802 static void vtd_root_table_setup(IntelIOMMUState *s)
1803 {
1804     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1805     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1806 
1807     vtd_update_scalable_state(s);
1808 
1809     trace_vtd_reg_dmar_root(s->root, s->root_scalable);
1810 }
1811 
1812 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1813                                uint32_t index, uint32_t mask)
1814 {
1815     x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1816 }
1817 
1818 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1819 {
1820     uint64_t value = 0;
1821     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1822     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1823     s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1824     s->intr_eime = value & VTD_IRTA_EIME;
1825 
1826     /* Notify global invalidation */
1827     vtd_iec_notify_all(s, true, 0, 0);
1828 
1829     trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1830 }
1831 
1832 static void vtd_iommu_replay_all(IntelIOMMUState *s)
1833 {
1834     VTDAddressSpace *vtd_as;
1835 
1836     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1837         vtd_sync_shadow_page_table(vtd_as);
1838     }
1839 }
1840 
1841 static void vtd_context_global_invalidate(IntelIOMMUState *s)
1842 {
1843     trace_vtd_inv_desc_cc_global();
1844     /* Protects context cache */
1845     vtd_iommu_lock(s);
1846     s->context_cache_gen++;
1847     if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1848         vtd_reset_context_cache_locked(s);
1849     }
1850     vtd_iommu_unlock(s);
1851     vtd_address_space_refresh_all(s);
1852     /*
1853      * From VT-d spec 6.5.2.1, a global context entry invalidation
1854      * should be followed by an IOTLB global invalidation, so we
1855      * should be safe even without this. However, let's replay the
1856      * region as well to be safer, and revisit this when we need
1857      * finer tuning of the VT-d emulation code.
1858      */
1859     vtd_iommu_replay_all(s);
1860 }
1861 
1862 /* Do a context-cache device-selective invalidation.
1863  * @func_mask: FM field after shifting
1864  */
1865 static void vtd_context_device_invalidate(IntelIOMMUState *s,
1866                                           uint16_t source_id,
1867                                           uint16_t func_mask)
1868 {
1869     uint16_t mask;
1870     VTDBus *vtd_bus;
1871     VTDAddressSpace *vtd_as;
1872     uint8_t bus_n, devfn;
1873     uint16_t devfn_it;
1874 
1875     trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1876 
1877     switch (func_mask & 3) {
1878     case 0:
1879         mask = 0;   /* No bits in the SID field masked */
1880         break;
1881     case 1:
1882         mask = 4;   /* Mask bit 2 in the SID field */
1883         break;
1884     case 2:
1885         mask = 6;   /* Mask bits 2:1 in the SID field */
1886         break;
1887     case 3:
1888         mask = 7;   /* Mask bits 2:0 in the SID field */
1889         break;
1890     default:
1891         g_assert_not_reached();
1892     }
1893     mask = ~mask;
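
    /*
     * Worked example: with func_mask == 2, mask becomes ~6, so the
     * devfn comparison below ignores function bits 2:1.  A source id
     * naming slot 1 function 0 (devfn 0x08) then also hits functions
     * 2, 4 and 6 of that slot, which differ only in the masked bits.
     */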
1894 
1895     bus_n = VTD_SID_TO_BUS(source_id);
1896     vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1897     if (vtd_bus) {
1898         devfn = VTD_SID_TO_DEVFN(source_id);
1899         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1900             vtd_as = vtd_bus->dev_as[devfn_it];
1901             if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1902                 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1903                                              VTD_PCI_FUNC(devfn_it));
1904                 vtd_iommu_lock(s);
1905                 vtd_as->context_cache_entry.context_cache_gen = 0;
1906                 vtd_iommu_unlock(s);
1907                 /*
1908                  * Switch the address space when needed, in case
1909                  * the device's passthrough bit has been toggled.
1910                  */
1911                 vtd_switch_address_space(vtd_as);
1912                 /*
1913                  * The device may be moving out of (or into) a
1914                  * domain, so resync the shadow page table.
1915                  * This does no harm even if we have no such
1916                  * notifier registered - the IOMMU notification
1917                  * framework will skip MAP notifications in
1918                  * that case.
1919                  */
1920                 vtd_sync_shadow_page_table(vtd_as);
1921             }
1922         }
1923     }
1924 }
1925 
1926 /* Context-cache invalidation
1927  * Returns the Context Actual Invalidation Granularity.
1928  * @val: the content of the CCMD_REG
1929  */
1930 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1931 {
1932     uint64_t caig;
1933     uint64_t type = val & VTD_CCMD_CIRG_MASK;
1934 
1935     switch (type) {
1936     case VTD_CCMD_DOMAIN_INVL:
1937         /* Fall through */
1938     case VTD_CCMD_GLOBAL_INVL:
1939         caig = VTD_CCMD_GLOBAL_INVL_A;
1940         vtd_context_global_invalidate(s);
1941         break;
1942 
1943     case VTD_CCMD_DEVICE_INVL:
1944         caig = VTD_CCMD_DEVICE_INVL_A;
1945         vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1946         break;
1947 
1948     default:
1949         error_report_once("%s: invalid context: 0x%" PRIx64,
1950                           __func__, val);
1951         caig = 0;
1952     }
1953     return caig;
1954 }
1955 
1956 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1957 {
1958     trace_vtd_inv_desc_iotlb_global();
1959     vtd_reset_iotlb(s);
1960     vtd_iommu_replay_all(s);
1961 }
1962 
1963 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
1964 {
1965     VTDContextEntry ce;
1966     VTDAddressSpace *vtd_as;
1967 
1968     trace_vtd_inv_desc_iotlb_domain(domain_id);
1969 
1970     vtd_iommu_lock(s);
1971     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
1972                                 &domain_id);
1973     vtd_iommu_unlock(s);
1974 
1975     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1976         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1977                                       vtd_as->devfn, &ce) &&
1978             domain_id == vtd_get_domain_id(s, &ce)) {
1979             vtd_sync_shadow_page_table(vtd_as);
1980         }
1981     }
1982 }
1983 
1984 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
1985                                            uint16_t domain_id, hwaddr addr,
1986                                            uint8_t am)
1987 {
1988     VTDAddressSpace *vtd_as;
1989     VTDContextEntry ce;
1990     int ret;
1991     hwaddr size = (1 << am) * VTD_PAGE_SIZE;
1992 
1993     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
1994         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1995                                        vtd_as->devfn, &ce);
1996         if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
1997             if (vtd_as_has_map_notifier(vtd_as)) {
1998                 /*
1999                  * As long as we have MAP notifications registered in
2000                  * any of our IOMMU notifiers, we need to sync the
2001                  * shadow page table.
2002                  */
2003                 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
2004             } else {
2005                 /*
2006                  * For UNMAP-only notifiers, we don't need to walk the
2007                  * page tables.  We just deliver the PSI down to
2008                  * invalidate caches.
2009                  */
2010                 IOMMUTLBEvent event = {
2011                     .type = IOMMU_NOTIFIER_UNMAP,
2012                     .entry = {
2013                         .target_as = &address_space_memory,
2014                         .iova = addr,
2015                         .translated_addr = 0,
2016                         .addr_mask = size - 1,
2017                         .perm = IOMMU_NONE,
2018                     },
2019                 };
2020                 memory_region_notify_iommu(&vtd_as->iommu, 0, event);
2021             }
2022         }
2023     }
2024 }
2025 
2026 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
2027                                       hwaddr addr, uint8_t am)
2028 {
2029     VTDIOTLBPageInvInfo info;
2030 
2031     trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
2032 
2033     assert(am <= VTD_MAMV);
2034     info.domain_id = domain_id;
2035     info.addr = addr;
2036     info.mask = ~((1 << am) - 1);
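
    /*
     * Worked example: am is the address-mask order, so am == 2 makes
     * info.mask == ~3 and the hash-table walk below drops the four
     * consecutive 4K pages whose GFNs share all but the low two bits
     * with addr's GFN.
     */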
2037     vtd_iommu_lock(s);
2038     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
2039     vtd_iommu_unlock(s);
2040     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
2041 }
2042 
2043 /* Flush IOTLB
2044  * Returns the IOTLB Actual Invalidation Granularity.
2045  * @val: the content of the IOTLB_REG
2046  */
2047 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
2048 {
2049     uint64_t iaig;
2050     uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
2051     uint16_t domain_id;
2052     hwaddr addr;
2053     uint8_t am;
2054 
2055     switch (type) {
2056     case VTD_TLB_GLOBAL_FLUSH:
2057         iaig = VTD_TLB_GLOBAL_FLUSH_A;
2058         vtd_iotlb_global_invalidate(s);
2059         break;
2060 
2061     case VTD_TLB_DSI_FLUSH:
2062         domain_id = VTD_TLB_DID(val);
2063         iaig = VTD_TLB_DSI_FLUSH_A;
2064         vtd_iotlb_domain_invalidate(s, domain_id);
2065         break;
2066 
2067     case VTD_TLB_PSI_FLUSH:
2068         domain_id = VTD_TLB_DID(val);
2069         addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
2070         am = VTD_IVA_AM(addr);
2071         addr = VTD_IVA_ADDR(addr);
2072         if (am > VTD_MAMV) {
2073             error_report_once("%s: address mask overflow: 0x%" PRIx64,
2074                               __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
2075             iaig = 0;
2076             break;
2077         }
2078         iaig = VTD_TLB_PSI_FLUSH_A;
2079         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2080         break;
2081 
2082     default:
2083         error_report_once("%s: invalid granularity: 0x%" PRIx64,
2084                           __func__, val);
2085         iaig = 0;
2086     }
2087     return iaig;
2088 }
2089 
2090 static void vtd_fetch_inv_desc(IntelIOMMUState *s);
2091 
2092 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
2093 {
2094     return s->qi_enabled && (s->iq_tail == s->iq_head) &&
2095            (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
2096 }
2097 
2098 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
2099 {
2100     uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
2101 
2102     trace_vtd_inv_qi_enable(en);
2103 
2104     if (en) {
2105         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
2106         /* 2^(x+8) entries, halved when using 256-bit descriptors */
2107         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
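        /*
         * Worked example: with QS == 1 and legacy 128-bit descriptors
         * the queue holds 1 << 9 == 512 entries; in scalable mode
         * (iq_dw set, 256-bit descriptors) the same QS yields 256.
         */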
2108         s->qi_enabled = true;
2109         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
2110         /* Ok - report back to driver */
2111         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
2112 
2113         if (s->iq_tail != 0) {
2114             /*
2115              * This is a spec violation, but Windows guests are known to set
2116              * up Queued Invalidation this way, so we allow the write and
2117              * process Invalidation Descriptors right away.
2118              */
2119             trace_vtd_warn_invalid_qi_tail(s->iq_tail);
2120             if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2121                 vtd_fetch_inv_desc(s);
2122             }
2123         }
2124     } else {
2125         if (vtd_queued_inv_disable_check(s)) {
2126             /* disable Queued Invalidation */
2127             vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
2128             s->iq_head = 0;
2129             s->qi_enabled = false;
2130             /* Ok - report back to driver */
2131             vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
2132         } else {
2133             error_report_once("%s: detected improper state when disabling QI "
2134                               "(head=0x%x, tail=0x%x, last_type=%d)",
2135                               __func__,
2136                               s->iq_head, s->iq_tail, s->iq_last_desc_type);
2137         }
2138     }
2139 }
2140 
2141 /* Set Root Table Pointer */
2142 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
2143 {
2144     vtd_root_table_setup(s);
2145     /* Ok - report back to driver */
2146     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
2147     vtd_reset_caches(s);
2148     vtd_address_space_refresh_all(s);
2149 }
2150 
2151 /* Set Interrupt Remap Table Pointer */
2152 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
2153 {
2154     vtd_interrupt_remap_table_setup(s);
2155     /* Ok - report back to driver */
2156     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
2157 }
2158 
2159 /* Handle Translation Enable/Disable */
2160 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
2161 {
2162     if (s->dmar_enabled == en) {
2163         return;
2164     }
2165 
2166     trace_vtd_dmar_enable(en);
2167 
2168     if (en) {
2169         s->dmar_enabled = true;
2170         /* Ok - report back to driver */
2171         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
2172     } else {
2173         s->dmar_enabled = false;
2174 
2175         /* Clear the index of Fault Recording Register */
2176         s->next_frcd_reg = 0;
2177         /* Ok - report back to driver */
2178         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
2179     }
2180 
2181     vtd_reset_caches(s);
2182     vtd_address_space_refresh_all(s);
2183 }
2184 
2185 /* Handle Interrupt Remap Enable/Disable */
2186 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
2187 {
2188     trace_vtd_ir_enable(en);
2189 
2190     if (en) {
2191         s->intr_enabled = true;
2192         /* Ok - report back to driver */
2193         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
2194     } else {
2195         s->intr_enabled = false;
2196         /* Ok - report back to driver */
2197         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
2198     }
2199 }
2200 
2201 /* Handle write to Global Command Register */
2202 static void vtd_handle_gcmd_write(IntelIOMMUState *s)
2203 {
2204     uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
2205     uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
2206     uint32_t changed = status ^ val;
2207 
2208     trace_vtd_reg_write_gcmd(status, val);
2209     if (changed & VTD_GCMD_TE) {
2210         /* Translation enable/disable */
2211         vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
2212     }
2213     if (val & VTD_GCMD_SRTP) {
2214         /* Set/update the root-table pointer */
2215         vtd_handle_gcmd_srtp(s);
2216     }
2217     if (changed & VTD_GCMD_QIE) {
2218         /* Queued Invalidation Enable */
2219         vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
2220     }
2221     if (val & VTD_GCMD_SIRTP) {
2222         /* Set/update the interrupt remapping root-table pointer */
2223         vtd_handle_gcmd_sirtp(s);
2224     }
2225     if (changed & VTD_GCMD_IRE) {
2226         /* Interrupt remap enable/disable */
2227         vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
2228     }
2229 }
2230 
2231 /* Handle write to Context Command Register */
2232 static void vtd_handle_ccmd_write(IntelIOMMUState *s)
2233 {
2234     uint64_t ret;
2235     uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
2236 
2237     /* Context-cache invalidation request */
2238     if (val & VTD_CCMD_ICC) {
2239         if (s->qi_enabled) {
2240             error_report_once("Queued Invalidation enabled, "
2241                               "should not use register-based invalidation");
2242             return;
2243         }
2244         ret = vtd_context_cache_invalidate(s, val);
2245         /* Invalidation completed; clear ICC and report the actual granularity */
2246         vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
2247         ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
2248                                       ret);
2249     }
2250 }
2251 
2252 /* Handle write to IOTLB Invalidation Register */
2253 static void vtd_handle_iotlb_write(IntelIOMMUState *s)
2254 {
2255     uint64_t ret;
2256     uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
2257 
2258     /* IOTLB invalidation request */
2259     if (val & VTD_TLB_IVT) {
2260         if (s->qi_enabled) {
2261             error_report_once("Queued Invalidation enabled, "
2262                               "should not use register-based invalidation");
2263             return;
2264         }
2265         ret = vtd_iotlb_flush(s, val);
2266         /* Invalidation completed; clear IVT and report the actual granularity */
2267         vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
2268         ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
2269                                       VTD_TLB_FLUSH_GRANU_MASK_A, ret);
2270     }
2271 }
2272 
2273 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
2274 static bool vtd_get_inv_desc(IntelIOMMUState *s,
2275                              VTDInvDesc *inv_desc)
2276 {
2277     dma_addr_t base_addr = s->iq;
2278     uint32_t offset = s->iq_head;
2279     uint32_t dw = s->iq_dw ? 32 : 16;
2280     dma_addr_t addr = base_addr + offset * dw;
2281 
2282     if (dma_memory_read(&address_space_memory, addr,
2283                         inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) {
2284         error_report_once("Read INV DESC failed.");
2285         return false;
2286     }
2287     inv_desc->lo = le64_to_cpu(inv_desc->lo);
2288     inv_desc->hi = le64_to_cpu(inv_desc->hi);
2289     if (dw == 32) {
2290         inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
2291         inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
2292     }
2293     return true;
2294 }
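
/*
 * Note on widths: in scalable mode (iq_dw set) each descriptor is 32
 * bytes and all four 64-bit words are converted from little endian;
 * legacy descriptors are 16 bytes, so only lo and hi are read and
 * converted.
 */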
2295 
2296 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2297 {
2298     if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
2299         (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
2300         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2301                           " (reserved nonzero)", __func__, inv_desc->hi,
2302                           inv_desc->lo);
2303         return false;
2304     }
2305     if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
2306         /* Status Write */
2307         uint32_t status_data = (uint32_t)(inv_desc->lo >>
2308                                VTD_INV_DESC_WAIT_DATA_SHIFT);
2309 
2310         assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
2311 
2312         /* FIXME: need to be masked with HAW? */
2313         dma_addr_t status_addr = inv_desc->hi;
2314         trace_vtd_inv_desc_wait_sw(status_addr, status_data);
2315         status_data = cpu_to_le32(status_data);
2316         if (dma_memory_write(&address_space_memory, status_addr,
2317                              &status_data, sizeof(status_data),
2318                              MEMTXATTRS_UNSPECIFIED)) {
2319             trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
2320             return false;
2321         }
2322     } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
2323         /* Interrupt flag */
2324         vtd_generate_completion_event(s);
2325     } else {
2326         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2327                           " (unknown type)", __func__, inv_desc->hi,
2328                           inv_desc->lo);
2329         return false;
2330     }
2331     return true;
2332 }
2333 
2334 static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
2335                                            VTDInvDesc *inv_desc)
2336 {
2337     uint16_t sid, fmask;
2338 
2339     if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
2340         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2341                           " (reserved nonzero)", __func__, inv_desc->hi,
2342                           inv_desc->lo);
2343         return false;
2344     }
2345     switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
2346     case VTD_INV_DESC_CC_DOMAIN:
2347         trace_vtd_inv_desc_cc_domain(
2348             (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
2349         /* Fall through */
2350     case VTD_INV_DESC_CC_GLOBAL:
2351         vtd_context_global_invalidate(s);
2352         break;
2353 
2354     case VTD_INV_DESC_CC_DEVICE:
2355         sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
2356         fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
2357         vtd_context_device_invalidate(s, sid, fmask);
2358         break;
2359 
2360     default:
2361         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2362                           " (invalid type)", __func__, inv_desc->hi,
2363                           inv_desc->lo);
2364         return false;
2365     }
2366     return true;
2367 }
2368 
2369 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2370 {
2371     uint16_t domain_id;
2372     uint8_t am;
2373     hwaddr addr;
2374 
2375     if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
2376         (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
2377         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2378                           ", lo=0x%"PRIx64" (reserved bits nonzero)",
2379                           __func__, inv_desc->hi, inv_desc->lo);
2380         return false;
2381     }
2382 
2383     switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
2384     case VTD_INV_DESC_IOTLB_GLOBAL:
2385         vtd_iotlb_global_invalidate(s);
2386         break;
2387 
2388     case VTD_INV_DESC_IOTLB_DOMAIN:
2389         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2390         vtd_iotlb_domain_invalidate(s, domain_id);
2391         break;
2392 
2393     case VTD_INV_DESC_IOTLB_PAGE:
2394         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2395         addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
2396         am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
2397         if (am > VTD_MAMV) {
2398             error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2399                               ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)",
2400                               __func__, inv_desc->hi, inv_desc->lo,
2401                               am, (unsigned)VTD_MAMV);
2402             return false;
2403         }
2404         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2405         break;
2406 
2407     default:
2408         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2409                           ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
2410                           __func__, inv_desc->hi, inv_desc->lo,
2411                           inv_desc->lo & VTD_INV_DESC_IOTLB_G);
2412         return false;
2413     }
2414     return true;
2415 }
2416 
2417 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
2418                                      VTDInvDesc *inv_desc)
2419 {
2420     trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
2421                            inv_desc->iec.index,
2422                            inv_desc->iec.index_mask);
2423 
2424     vtd_iec_notify_all(s, !inv_desc->iec.granularity,
2425                        inv_desc->iec.index,
2426                        inv_desc->iec.index_mask);
2427     return true;
2428 }
2429 
2430 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
2431                                           VTDInvDesc *inv_desc)
2432 {
2433     VTDAddressSpace *vtd_dev_as;
2434     IOMMUTLBEvent event;
2435     struct VTDBus *vtd_bus;
2436     hwaddr addr;
2437     uint64_t sz;
2438     uint16_t sid;
2439     uint8_t devfn;
2440     bool size;
2441     uint8_t bus_num;
2442 
2443     addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
2444     sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
2445     devfn = sid & 0xff;
2446     bus_num = sid >> 8;
2447     size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
2448 
2449     if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
2450         (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
2451         error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
2452                           ", lo=%"PRIx64" (reserved nonzero)", __func__,
2453                           inv_desc->hi, inv_desc->lo);
2454         return false;
2455     }
2456 
2457     vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2458     if (!vtd_bus) {
2459         goto done;
2460     }
2461 
2462     vtd_dev_as = vtd_bus->dev_as[devfn];
2463     if (!vtd_dev_as) {
2464         goto done;
2465     }
2466 
2467     /* According to ATS spec table 2.4:
2468      * S = 0, bits 15:12 = xxxx     range size: 4K
2469      * S = 1, bits 15:12 = xxx0     range size: 8K
2470      * S = 1, bits 15:12 = xx01     range size: 16K
2471      * S = 1, bits 15:12 = x011     range size: 32K
2472      * S = 1, bits 15:12 = 0111     range size: 64K
2473      * ...
2474      */
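
    /*
     * Worked example: for a 16K invalidation the device sets S = 1
     * and encodes addr bits 13:12 as 01b, so cto64(addr >>
     * VTD_PAGE_SHIFT) counts one trailing 1 and sz becomes
     * (4K * 2) << 1 == 16K; addr is then aligned down to that size.
     */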
2475     if (size) {
2476         sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2477         addr &= ~(sz - 1);
2478     } else {
2479         sz = VTD_PAGE_SIZE;
2480     }
2481 
2482     event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
2483     event.entry.target_as = &vtd_dev_as->as;
2484     event.entry.addr_mask = sz - 1;
2485     event.entry.iova = addr;
2486     event.entry.perm = IOMMU_NONE;
2487     event.entry.translated_addr = 0;
2488     memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
2489 
2490 done:
2491     return true;
2492 }
2493 
2494 static bool vtd_process_inv_desc(IntelIOMMUState *s)
2495 {
2496     VTDInvDesc inv_desc;
2497     uint8_t desc_type;
2498 
2499     trace_vtd_inv_qi_head(s->iq_head);
2500     if (!vtd_get_inv_desc(s, &inv_desc)) {
2501         s->iq_last_desc_type = VTD_INV_DESC_NONE;
2502         return false;
2503     }
2504 
2505     desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2506     /* FIXME: should this be updated before or after processing? */
2507     s->iq_last_desc_type = desc_type;
2508 
2509     switch (desc_type) {
2510     case VTD_INV_DESC_CC:
2511         trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2512         if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2513             return false;
2514         }
2515         break;
2516 
2517     case VTD_INV_DESC_IOTLB:
2518         trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2519         if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2520             return false;
2521         }
2522         break;
2523 
2524     /*
2525      * TODO: the bodies of the two cases below will be implemented in a future
2526      * series. To keep guests (whose iommu drivers integrate the scalable mode
2527      * support patch set) working, returning true is enough so far.
2528      */
2529     case VTD_INV_DESC_PC:
2530         break;
2531 
2532     case VTD_INV_DESC_PIOTLB:
2533         break;
2534 
2535     case VTD_INV_DESC_WAIT:
2536         trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2537         if (!vtd_process_wait_desc(s, &inv_desc)) {
2538             return false;
2539         }
2540         break;
2541 
2542     case VTD_INV_DESC_IEC:
2543         trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2544         if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2545             return false;
2546         }
2547         break;
2548 
2549     case VTD_INV_DESC_DEVICE:
2550         trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2551         if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2552             return false;
2553         }
2554         break;
2555 
2556     default:
2557         error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
2558                           " (unknown type)", __func__, inv_desc.hi,
2559                           inv_desc.lo);
2560         return false;
2561     }
2562     s->iq_head++;
2563     if (s->iq_head == s->iq_size) {
2564         s->iq_head = 0;
2565     }
2566     return true;
2567 }
2568 
2569 /* Try to fetch and process more Invalidation Descriptors */
2570 static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2571 {
2572     int qi_shift;
2573 
2574     /* Refer to 10.4.23 of VT-d spec 3.0 */
2575     qi_shift = s->iq_dw ? VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4;
2576 
2577     trace_vtd_inv_qi_fetch();
2578 
2579     if (s->iq_tail >= s->iq_size) {
2580         /* Detects an invalid Tail pointer */
2581         error_report_once("%s: detected invalid QI tail "
2582                           "(tail=0x%x, size=0x%x)",
2583                           __func__, s->iq_tail, s->iq_size);
2584         vtd_handle_inv_queue_error(s);
2585         return;
2586     }
2587     while (s->iq_head != s->iq_tail) {
2588         if (!vtd_process_inv_desc(s)) {
2589             /* Invalidation Queue Errors */
2590             vtd_handle_inv_queue_error(s);
2591             break;
2592         }
2593         /* Must update the IQH_REG in time */
2594         vtd_set_quad_raw(s, DMAR_IQH_REG,
2595                          (((uint64_t)(s->iq_head)) << qi_shift) &
2596                          VTD_IQH_QH_MASK);
2597     }
2598 }
2599 
2600 /* Handle write to Invalidation Queue Tail Register */
2601 static void vtd_handle_iqt_write(IntelIOMMUState *s)
2602 {
2603     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2604 
2605     if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
2606         error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
2607                           __func__, val);
2608         return;
2609     }
2610     s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
2611     trace_vtd_inv_qi_tail(s->iq_tail);
2612 
2613     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2614         /* Process Invalidation Queue here */
2615         vtd_fetch_inv_desc(s);
2616     }
2617 }
2618 
2619 static void vtd_handle_fsts_write(IntelIOMMUState *s)
2620 {
2621     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2622     uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2623     uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2624 
2625     if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2626         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2627         trace_vtd_fsts_clear_ip();
2628     }
2629     /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2630      * Descriptors if there are any when Queued Invalidation is enabled?
2631      */
2632 }
2633 
2634 static void vtd_handle_fectl_write(IntelIOMMUState *s)
2635 {
2636     uint32_t fectl_reg;
2637     /* FIXME: when software clears the IM field, check the IP field. But do we
2638      * need to compare the old value and the new value to conclude that
2639      * software clears the IM field? Or just check if the IM field is zero?
2640      */
2641     fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2642 
2643     trace_vtd_reg_write_fectl(fectl_reg);
2644 
2645     if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2646         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2647         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2648     }
2649 }
2650 
2651 static void vtd_handle_ics_write(IntelIOMMUState *s)
2652 {
2653     uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2654     uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2655 
2656     if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2657         trace_vtd_reg_ics_clear_ip();
2658         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2659     }
2660 }
2661 
2662 static void vtd_handle_iectl_write(IntelIOMMUState *s)
2663 {
2664     uint32_t iectl_reg;
2665     /* FIXME: when software clears the IM field, check the IP field. But do we
2666      * need to compare the old value and the new value to conclude that
2667      * software clears the IM field? Or just check if the IM field is zero?
2668      */
2669     iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2670 
2671     trace_vtd_reg_write_iectl(iectl_reg);
2672 
2673     if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2674         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2675         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2676     }
2677 }
2678 
2679 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2680 {
2681     IntelIOMMUState *s = opaque;
2682     uint64_t val;
2683 
2684     trace_vtd_reg_read(addr, size);
2685 
2686     if (addr + size > DMAR_REG_SIZE) {
2687         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2688                           " size=0x%x", __func__, addr, size);
2689         return (uint64_t)-1;
2690     }
2691 
2692     switch (addr) {
2693     /* Root Table Address Register, 64-bit */
2694     case DMAR_RTADDR_REG:
2695         val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
2696         if (size == 4) {
2697             val = val & ((1ULL << 32) - 1);
2698         }
2699         break;
2700 
2701     case DMAR_RTADDR_REG_HI:
2702         assert(size == 4);
2703         val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32;
2704         break;
2705 
2706     /* Invalidation Queue Address Register, 64-bit */
2707     case DMAR_IQA_REG:
2708         val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2709         if (size == 4) {
2710             val = val & ((1ULL << 32) - 1);
2711         }
2712         break;
2713 
2714     case DMAR_IQA_REG_HI:
2715         assert(size == 4);
2716         val = s->iq >> 32;
2717         break;
2718 
2719     default:
2720         if (size == 4) {
2721             val = vtd_get_long(s, addr);
2722         } else {
2723             val = vtd_get_quad(s, addr);
2724         }
2725     }
2726 
2727     return val;
2728 }
2729 
2730 static void vtd_mem_write(void *opaque, hwaddr addr,
2731                           uint64_t val, unsigned size)
2732 {
2733     IntelIOMMUState *s = opaque;
2734 
2735     trace_vtd_reg_write(addr, size, val);
2736 
2737     if (addr + size > DMAR_REG_SIZE) {
2738         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2739                           " size=0x%x", __func__, addr, size);
2740         return;
2741     }
2742 
2743     switch (addr) {
2744     /* Global Command Register, 32-bit */
2745     case DMAR_GCMD_REG:
2746         vtd_set_long(s, addr, val);
2747         vtd_handle_gcmd_write(s);
2748         break;
2749 
2750     /* Context Command Register, 64-bit */
2751     case DMAR_CCMD_REG:
2752         if (size == 4) {
2753             vtd_set_long(s, addr, val);
2754         } else {
2755             vtd_set_quad(s, addr, val);
2756             vtd_handle_ccmd_write(s);
2757         }
2758         break;
2759 
2760     case DMAR_CCMD_REG_HI:
2761         assert(size == 4);
2762         vtd_set_long(s, addr, val);
2763         vtd_handle_ccmd_write(s);
2764         break;
2765 
2766     /* IOTLB Invalidation Register, 64-bit */
2767     case DMAR_IOTLB_REG:
2768         if (size == 4) {
2769             vtd_set_long(s, addr, val);
2770         } else {
2771             vtd_set_quad(s, addr, val);
2772             vtd_handle_iotlb_write(s);
2773         }
2774         break;
2775 
2776     case DMAR_IOTLB_REG_HI:
2777         assert(size == 4);
2778         vtd_set_long(s, addr, val);
2779         vtd_handle_iotlb_write(s);
2780         break;
2781 
2782     /* Invalidate Address Register, 64-bit */
2783     case DMAR_IVA_REG:
2784         if (size == 4) {
2785             vtd_set_long(s, addr, val);
2786         } else {
2787             vtd_set_quad(s, addr, val);
2788         }
2789         break;
2790 
2791     case DMAR_IVA_REG_HI:
2792         assert(size == 4);
2793         vtd_set_long(s, addr, val);
2794         break;
2795 
2796     /* Fault Status Register, 32-bit */
2797     case DMAR_FSTS_REG:
2798         assert(size == 4);
2799         vtd_set_long(s, addr, val);
2800         vtd_handle_fsts_write(s);
2801         break;
2802 
2803     /* Fault Event Control Register, 32-bit */
2804     case DMAR_FECTL_REG:
2805         assert(size == 4);
2806         vtd_set_long(s, addr, val);
2807         vtd_handle_fectl_write(s);
2808         break;
2809 
2810     /* Fault Event Data Register, 32-bit */
2811     case DMAR_FEDATA_REG:
2812         assert(size == 4);
2813         vtd_set_long(s, addr, val);
2814         break;
2815 
2816     /* Fault Event Address Register, 32-bit */
2817     case DMAR_FEADDR_REG:
2818         if (size == 4) {
2819             vtd_set_long(s, addr, val);
2820         } else {
2821             /*
2822              * While the register is 32-bit only, some guests (Xen...) write to
2823              * it with 64-bit accesses.
2824              */
2825             vtd_set_quad(s, addr, val);
2826         }
2827         break;
2828 
2829     /* Fault Event Upper Address Register, 32-bit */
2830     case DMAR_FEUADDR_REG:
2831         assert(size == 4);
2832         vtd_set_long(s, addr, val);
2833         break;
2834 
2835     /* Protected Memory Enable Register, 32-bit */
2836     case DMAR_PMEN_REG:
2837         assert(size == 4);
2838         vtd_set_long(s, addr, val);
2839         break;
2840 
2841     /* Root Table Address Register, 64-bit */
2842     case DMAR_RTADDR_REG:
2843         if (size == 4) {
2844             vtd_set_long(s, addr, val);
2845         } else {
2846             vtd_set_quad(s, addr, val);
2847         }
2848         break;
2849 
2850     case DMAR_RTADDR_REG_HI:
2851         assert(size == 4);
2852         vtd_set_long(s, addr, val);
2853         break;
2854 
2855     /* Invalidation Queue Tail Register, 64-bit */
2856     case DMAR_IQT_REG:
2857         if (size == 4) {
2858             vtd_set_long(s, addr, val);
2859         } else {
2860             vtd_set_quad(s, addr, val);
2861         }
2862         vtd_handle_iqt_write(s);
2863         break;
2864 
2865     case DMAR_IQT_REG_HI:
2866         assert(size == 4);
2867         vtd_set_long(s, addr, val);
2868         /* Bits 63:19 of IQT_REG are RsvdZ; do nothing here */
2869         break;
2870 
2871     /* Invalidation Queue Address Register, 64-bit */
2872     case DMAR_IQA_REG:
2873         if (size == 4) {
2874             vtd_set_long(s, addr, val);
2875         } else {
2876             vtd_set_quad(s, addr, val);
2877         }
2878         if (s->ecap & VTD_ECAP_SMTS &&
2879             val & VTD_IQA_DW_MASK) {
2880             s->iq_dw = true;
2881         } else {
2882             s->iq_dw = false;
2883         }
2884         break;
2885 
2886     case DMAR_IQA_REG_HI:
2887         assert(size == 4);
2888         vtd_set_long(s, addr, val);
2889         break;
2890 
2891     /* Invalidation Completion Status Register, 32-bit */
2892     case DMAR_ICS_REG:
2893         assert(size == 4);
2894         vtd_set_long(s, addr, val);
2895         vtd_handle_ics_write(s);
2896         break;
2897 
2898     /* Invalidation Event Control Register, 32-bit */
2899     case DMAR_IECTL_REG:
2900         assert(size == 4);
2901         vtd_set_long(s, addr, val);
2902         vtd_handle_iectl_write(s);
2903         break;
2904 
2905     /* Invalidation Event Data Register, 32-bit */
2906     case DMAR_IEDATA_REG:
2907         assert(size == 4);
2908         vtd_set_long(s, addr, val);
2909         break;
2910 
2911     /* Invalidation Event Address Register, 32-bit */
2912     case DMAR_IEADDR_REG:
2913         assert(size == 4);
2914         vtd_set_long(s, addr, val);
2915         break;
2916 
2917     /* Invalidation Event Upper Address Register, 32-bit */
2918     case DMAR_IEUADDR_REG:
2919         assert(size == 4);
2920         vtd_set_long(s, addr, val);
2921         break;
2922 
2923     /* Fault Recording Registers, 128-bit */
2924     case DMAR_FRCD_REG_0_0:
2925         if (size == 4) {
2926             vtd_set_long(s, addr, val);
2927         } else {
2928             vtd_set_quad(s, addr, val);
2929         }
2930         break;
2931 
2932     case DMAR_FRCD_REG_0_1:
2933         assert(size == 4);
2934         vtd_set_long(s, addr, val);
2935         break;
2936 
2937     case DMAR_FRCD_REG_0_2:
2938         if (size == 4) {
2939             vtd_set_long(s, addr, val);
2940         } else {
2941             vtd_set_quad(s, addr, val);
2942             /* May clear bit 127 (Fault), update PPF */
2943             vtd_update_fsts_ppf(s);
2944         }
2945         break;
2946 
2947     case DMAR_FRCD_REG_0_3:
2948         assert(size == 4);
2949         vtd_set_long(s, addr, val);
2950         /* May clear bit 127 (Fault), update PPF */
2951         vtd_update_fsts_ppf(s);
2952         break;
2953 
2954     case DMAR_IRTA_REG:
2955         if (size == 4) {
2956             vtd_set_long(s, addr, val);
2957         } else {
2958             vtd_set_quad(s, addr, val);
2959         }
2960         break;
2961 
2962     case DMAR_IRTA_REG_HI:
2963         assert(size == 4);
2964         vtd_set_long(s, addr, val);
2965         break;
2966 
2967     default:
2968         if (size == 4) {
2969             vtd_set_long(s, addr, val);
2970         } else {
2971             vtd_set_quad(s, addr, val);
2972         }
2973     }
2974 }
2975 
2976 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
2977                                          IOMMUAccessFlags flag, int iommu_idx)
2978 {
2979     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2980     IntelIOMMUState *s = vtd_as->iommu_state;
2981     IOMMUTLBEntry iotlb = {
2982         /* We'll fill in the rest later. */
2983         .target_as = &address_space_memory,
2984     };
2985     bool success;
2986 
2987     if (likely(s->dmar_enabled)) {
2988         success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
2989                                          addr, flag & IOMMU_WO, &iotlb);
2990     } else {
2991         /* DMAR disabled, passthrough, use 4k pages */
2992         iotlb.iova = addr & VTD_PAGE_MASK_4K;
2993         iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
2994         iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
2995         iotlb.perm = IOMMU_RW;
2996         success = true;
2997     }
2998 
2999     if (likely(success)) {
3000         trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
3001                                  VTD_PCI_SLOT(vtd_as->devfn),
3002                                  VTD_PCI_FUNC(vtd_as->devfn),
3003                                  iotlb.iova, iotlb.translated_addr,
3004                                  iotlb.addr_mask);
3005     } else {
3006         error_report_once("%s: detected translation failure "
3007                           "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
3008                           __func__, pci_bus_num(vtd_as->bus),
3009                           VTD_PCI_SLOT(vtd_as->devfn),
3010                           VTD_PCI_FUNC(vtd_as->devfn),
3011                           addr);
3012     }
3013 
3014     return iotlb;
3015 }
3016 
3017 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
3018                                          IOMMUNotifierFlag old,
3019                                          IOMMUNotifierFlag new,
3020                                          Error **errp)
3021 {
3022     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
3023     IntelIOMMUState *s = vtd_as->iommu_state;
3024 
3025     /* Update per-address-space notifier flags */
3026     vtd_as->notifier_flags = new;
3027 
3028     if (old == IOMMU_NOTIFIER_NONE) {
3029         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
3030     } else if (new == IOMMU_NOTIFIER_NONE) {
3031         QLIST_REMOVE(vtd_as, next);
3032     }
3033     return 0;
3034 }
3035 
3036 static int vtd_post_load(void *opaque, int version_id)
3037 {
3038     IntelIOMMUState *iommu = opaque;
3039 
3040     /*
3041      * Memory regions are dynamically turned on/off depending on
3042      * context entry configurations from the guest. After migration,
3043      * we need to make sure the memory regions are still correct.
3044      */
3045     vtd_switch_address_space_all(iommu);
3046 
3047     /*
3048      * We don't need to migrate root_scalable because we can
3049      * simply recalculate it after loading completes.  We could
3050      * actually do the same for root, dmar_enabled, etc.;
3051      * however, since those are already migrated, we keep them
3052      * for migration compatibility.
3053      */
3054     vtd_update_scalable_state(iommu);
3055 
3056     return 0;
3057 }
3058 
3059 static const VMStateDescription vtd_vmstate = {
3060     .name = "iommu-intel",
3061     .version_id = 1,
3062     .minimum_version_id = 1,
3063     .priority = MIG_PRI_IOMMU,
3064     .post_load = vtd_post_load,
3065     .fields = (VMStateField[]) {
3066         VMSTATE_UINT64(root, IntelIOMMUState),
3067         VMSTATE_UINT64(intr_root, IntelIOMMUState),
3068         VMSTATE_UINT64(iq, IntelIOMMUState),
3069         VMSTATE_UINT32(intr_size, IntelIOMMUState),
3070         VMSTATE_UINT16(iq_head, IntelIOMMUState),
3071         VMSTATE_UINT16(iq_tail, IntelIOMMUState),
3072         VMSTATE_UINT16(iq_size, IntelIOMMUState),
3073         VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
3074         VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
3075         VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
3076         VMSTATE_UNUSED(1),      /* bool root_extended is obsolete by VT-d */
3077         VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
3078         VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
3079         VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
3080         VMSTATE_BOOL(intr_eime, IntelIOMMUState),
3081         VMSTATE_END_OF_LIST()
3082     }
3083 };
3084 
3085 static const MemoryRegionOps vtd_mem_ops = {
3086     .read = vtd_mem_read,
3087     .write = vtd_mem_write,
3088     .endianness = DEVICE_LITTLE_ENDIAN,
3089     .impl = {
3090         .min_access_size = 4,
3091         .max_access_size = 8,
3092     },
3093     .valid = {
3094         .min_access_size = 4,
3095         .max_access_size = 8,
3096     },
3097 };
3098 
3099 static Property vtd_properties[] = {
3100     DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
3101     DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
3102                             ON_OFF_AUTO_AUTO),
3103     DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
3104     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
3105                       VTD_HOST_ADDRESS_WIDTH),
3106     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
3107     DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
3108     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
3109     DEFINE_PROP_END_OF_LIST(),
3110 };
3111 
3112 /* Read the IRTE at the specified index */
3113 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
3114                         VTD_IR_TableEntry *entry, uint16_t sid)
3115 {
3116     static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
3117         {0xffff, 0xfffb, 0xfff9, 0xfff8};
3118     dma_addr_t addr = 0x00;
3119     uint16_t mask, source_id;
3120     uint8_t bus, bus_max, bus_min;
3121 
3122     if (index >= iommu->intr_size) {
3123         error_report_once("%s: index too large: ind=0x%x",
3124                           __func__, index);
3125         return -VTD_FR_IR_INDEX_OVER;
3126     }
3127 
3128     addr = iommu->intr_root + index * sizeof(*entry);
3129     if (dma_memory_read(&address_space_memory, addr,
3130                         entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) {
3131         error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
3132                           __func__, index, addr);
3133         return -VTD_FR_IR_ROOT_INVAL;
3134     }
3135 
3136     trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
3137                           le64_to_cpu(entry->data[0]));
3138 
3139     if (!entry->irte.present) {
3140         error_report_once("%s: detected non-present IRTE "
3141                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3142                           __func__, index, le64_to_cpu(entry->data[1]),
3143                           le64_to_cpu(entry->data[0]));
3144         return -VTD_FR_IR_ENTRY_P;
3145     }
3146 
3147     if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
3148         entry->irte.__reserved_2) {
3149         error_report_once("%s: detected non-zero reserved IRTE "
3150                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3151                           __func__, index, le64_to_cpu(entry->data[1]),
3152                           le64_to_cpu(entry->data[0]));
3153         return -VTD_FR_IR_IRTE_RSVD;
3154     }
3155 
3156     if (sid != X86_IOMMU_SID_INVALID) {
3157         /* Validate IRTE SID */
3158         source_id = le32_to_cpu(entry->irte.source_id);
3159         switch (entry->irte.sid_vtype) {
3160         case VTD_SVT_NONE:
3161             break;
3162 
3163         case VTD_SVT_ALL:
3164             mask = vtd_svt_mask[entry->irte.sid_q];
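            /*
             * Example: with sid_q == 3 (mask 0xfff8) the function
             * bits are ignored, so source-id 02:1f.0 would match
             * requesters 02:1f.0 through 02:1f.7.
             */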
3165             if ((source_id & mask) != (sid & mask)) {
3166                 error_report_once("%s: invalid IRTE SID "
3167                                   "(index=%u, sid=%u, source_id=%u)",
3168                                   __func__, index, sid, source_id);
3169                 return -VTD_FR_IR_SID_ERR;
3170             }
3171             break;
3172 
3173         case VTD_SVT_BUS:
3174             bus_max = source_id >> 8;
3175             bus_min = source_id & 0xff;
3176             bus = sid >> 8;
3177             if (bus > bus_max || bus < bus_min) {
3178                 error_report_once("%s: invalid SVT_BUS "
3179                                   "(index=%u, bus=%u, min=%u, max=%u)",
3180                                   __func__, index, bus, bus_min, bus_max);
3181                 return -VTD_FR_IR_SID_ERR;
3182             }
3183             break;
3184 
3185         default:
3186             error_report_once("%s: detected invalid IRTE SVT "
3187                               "(index=%u, type=%d)", __func__,
3188                               index, entry->irte.sid_vtype);
            /* Treat this as a verification failure. */
3190             return -VTD_FR_IR_SID_ERR;
3191         }
3192     }
3193 
3194     return 0;
3195 }
3196 
3197 /* Fetch IRQ information of specific IR index */
3198 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
3199                              X86IOMMUIrq *irq, uint16_t sid)
3200 {
3201     VTD_IR_TableEntry irte = {};
3202     int ret = 0;
3203 
3204     ret = vtd_irte_get(iommu, index, &irte, sid);
3205     if (ret) {
3206         return ret;
3207     }
3208 
3209     irq->trigger_mode = irte.irte.trigger_mode;
3210     irq->vector = irte.irte.vector;
3211     irq->delivery_mode = irte.irte.delivery_mode;
3212     irq->dest = le32_to_cpu(irte.irte.dest_id);
3213     if (!iommu->intr_eime) {
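        /*
         * Without extended interrupt mode (EIM) only an 8-bit xAPIC
         * destination ID is available; it is carried in bits 15:8 of
         * the IRTE destination field, so extract it here.
         */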
3214 #define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
3215 #define  VTD_IR_APIC_DEST_SHIFT        (8)
3216         irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
3217             VTD_IR_APIC_DEST_SHIFT;
3218     }
3219     irq->dest_mode = irte.irte.dest_mode;
3220     irq->redir_hint = irte.irte.redir_hint;
3221 
3222     trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
3223                        irq->delivery_mode, irq->dest, irq->dest_mode);
3224 
3225     return 0;
3226 }
3227 
3228 /* Interrupt remapping for MSI/MSI-X entry */
3229 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
3230                                    MSIMessage *origin,
3231                                    MSIMessage *translated,
3232                                    uint16_t sid)
3233 {
3234     int ret = 0;
3235     VTD_IR_MSIAddress addr;
3236     uint16_t index;
3237     X86IOMMUIrq irq = {};
3238 
3239     assert(origin && translated);
3240 
3241     trace_vtd_ir_remap_msi_req(origin->address, origin->data);
3242 
3243     if (!iommu || !iommu->intr_enabled) {
3244         memcpy(translated, origin, sizeof(*origin));
3245         goto out;
3246     }
3247 
3248     if (origin->address & VTD_MSI_ADDR_HI_MASK) {
3249         error_report_once("%s: MSI address high 32 bits non-zero detected: "
3250                           "address=0x%" PRIx64, __func__, origin->address);
3251         return -VTD_FR_IR_REQ_RSVD;
3252     }
3253 
3254     addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
3255     if (addr.addr.__head != 0xfee) {
3256         error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32,
3257                           __func__, addr.data);
3258         return -VTD_FR_IR_REQ_RSVD;
3259     }
3260 
    /* Compatibility format interrupt: pass it through untranslated. */
3262     if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
3263         memcpy(translated, origin, sizeof(*origin));
3264         goto out;
3265     }
3266 
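    /*
     * The 16-bit interrupt index is split across the MSI address:
     * the low 15 bits come from the handle field while bit 15 is
     * carried in a separate address bit, hence the shift-and-or below
     * (see the VT-d spec on the remappable interrupt request format).
     */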
3267     index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
3268 
3269 #define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
3270 #define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
3271 
3272     if (addr.addr.sub_valid) {
3273         /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
3274         index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
3275     }
3276 
3277     ret = vtd_remap_irq_get(iommu, index, &irq, sid);
3278     if (ret) {
3279         return ret;
3280     }
3281 
3282     if (addr.addr.sub_valid) {
3283         trace_vtd_ir_remap_type("MSI");
3284         if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
3285             error_report_once("%s: invalid IR MSI "
3286                               "(sid=%u, address=0x%" PRIx64
3287                               ", data=0x%" PRIx32 ")",
3288                               __func__, sid, origin->address, origin->data);
3289             return -VTD_FR_IR_REQ_RSVD;
3290         }
3291     } else {
3292         uint8_t vector = origin->data & 0xff;
3293         uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
3294 
3295         trace_vtd_ir_remap_type("IOAPIC");
        /* The IOAPIC entry vector should match the IRTE vector
         * (see VT-d spec 5.1.5.1). */
3298         if (vector != irq.vector) {
3299             trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
3300         }
3301 
3302         /* The Trigger Mode field must match the Trigger Mode in the IRTE.
3303          * (see vt-d spec 5.1.5.1). */
3304         if (trigger_mode != irq.trigger_mode) {
3305             trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
3306                                       irq.trigger_mode);
3307         }
3308     }
3309 
    /*
     * Keep the last two bits of the MSI address, since the guest OS
     * may have modified them; preserving them does no harm.
     */
3314     irq.msi_addr_last_bits = addr.addr.__not_care;
3315 
3316     /* Translate X86IOMMUIrq to MSI message */
3317     x86_iommu_irq_to_msi_message(&irq, translated);
3318 
3319 out:
3320     trace_vtd_ir_remap_msi(origin->address, origin->data,
3321                            translated->address, translated->data);
3322     return 0;
3323 }
3324 
3325 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
3326                          MSIMessage *dst, uint16_t sid)
3327 {
3328     return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
3329                                    src, dst, sid);
3330 }
3331 
3332 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
3333                                    uint64_t *data, unsigned size,
3334                                    MemTxAttrs attrs)
3335 {
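    /*
     * Reads from the interrupt remapping window have no defined
     * semantics here; report success and leave the data untouched.
     */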
3336     return MEMTX_OK;
3337 }
3338 
3339 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
3340                                     uint64_t value, unsigned size,
3341                                     MemTxAttrs attrs)
3342 {
3343     int ret = 0;
3344     MSIMessage from = {}, to = {};
3345     uint16_t sid = X86_IOMMU_SID_INVALID;
3346 
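    /*
     * "addr" is an offset within the interrupt address window, so add
     * the window base (VTD_INTERRUPT_ADDR_FIRST) back to reconstruct
     * the MSI address the device originally wrote to.
     */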
3347     from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
3348     from.data = (uint32_t) value;
3349 
3350     if (!attrs.unspecified) {
3351         /* We have explicit Source ID */
3352         sid = attrs.requester_id;
3353     }
3354 
3355     ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
3356     if (ret) {
3357         /* TODO: report error */
3358         /* Drop this interrupt */
3359         return MEMTX_ERROR;
3360     }
3361 
3362     apic_get_class()->send_msi(&to);
3363 
3364     return MEMTX_OK;
3365 }
3366 
3367 static const MemoryRegionOps vtd_mem_ir_ops = {
3368     .read_with_attrs = vtd_mem_ir_read,
3369     .write_with_attrs = vtd_mem_ir_write,
3370     .endianness = DEVICE_LITTLE_ENDIAN,
3371     .impl = {
3372         .min_access_size = 4,
3373         .max_access_size = 4,
3374     },
3375     .valid = {
3376         .min_access_size = 4,
3377         .max_access_size = 4,
3378     },
3379 };
3380 
3381 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
3382 {
3383     uintptr_t key = (uintptr_t)bus;
3384     VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
3385     VTDAddressSpace *vtd_dev_as;
3386     char name[128];
3387 
3388     if (!vtd_bus) {
3389         uintptr_t *new_key = g_malloc(sizeof(*new_key));
3390         *new_key = (uintptr_t)bus;
3391         /* No corresponding free() */
        vtd_bus = g_malloc0(sizeof(VTDBus) +
                            sizeof(VTDAddressSpace *) * PCI_DEVFN_MAX);
3394         vtd_bus->bus = bus;
3395         g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
3396     }
3397 
3398     vtd_dev_as = vtd_bus->dev_as[devfn];
3399 
3400     if (!vtd_dev_as) {
3401         snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
3402                  PCI_FUNC(devfn));
3403         vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
3404 
3405         vtd_dev_as->bus = bus;
3406         vtd_dev_as->devfn = (uint8_t)devfn;
3407         vtd_dev_as->iommu_state = s;
3408         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
3409         vtd_dev_as->iova_tree = iova_tree_new();
3410 
3411         memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
3412         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
3413 
        /*
         * Build the DMAR-disabled container with aliases to the
         * shared MRs.  Aliasing to a shared memory region helps the
         * memory API detect identical FlatViews, so devices can share
         * the same FlatView when DMAR is disabled (either by not
         * providing "intel_iommu=on" or with "iommu=pt").  This
         * greatly reduces the total number of FlatViews in the
         * system, and hence the VM runs faster.
         */
3423         memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
3424                                  "vtd-nodmar", &s->mr_nodmar, 0,
3425                                  memory_region_size(&s->mr_nodmar));
3426 
        /*
         * Build the per-device DMAR-enabled container.
         *
         * TODO: we currently have a per-device IOMMU memory region
         * only because we have per-device IOMMU notifiers.  If one
         * day we can abstract the IOMMU notifiers out of the memory
         * regions, we can share the same memory region here as well,
         * just as we did above with the nodmar region.
         */
3437         strcat(name, "-dmar");
3438         memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
3439                                  TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
3440                                  name, UINT64_MAX);
3441         memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
3442                                  &s->mr_ir, 0, memory_region_size(&s->mr_ir));
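        /*
         * The interrupt window is overlaid on the IOMMU region with a
         * higher priority (1), so that MSI writes to 0xfeexxxxx reach
         * the interrupt remapping handler instead of being treated as
         * DMA translations.
         */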
3443         memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
3444                                             VTD_INTERRUPT_ADDR_FIRST,
3445                                             &vtd_dev_as->iommu_ir, 1);
3446 
        /*
         * Hook both containers under the root container; we switch
         * between DMAR and noDMAR by enabling/disabling the
         * corresponding sub-container.
         */
3452         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3453                                             MEMORY_REGION(&vtd_dev_as->iommu),
3454                                             0);
3455         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3456                                             &vtd_dev_as->nodmar, 0);
3457 
3458         vtd_switch_address_space(vtd_dev_as);
3459     }
3460     return vtd_dev_as;
3461 }
3462 
3463 /* Unmap the whole range in the notifier's scope. */
3464 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
3465 {
3466     hwaddr size, remain;
3467     hwaddr start = n->start;
3468     hwaddr end = n->end;
3469     IntelIOMMUState *s = as->iommu_state;
3470     DMAMap map;
3471 
    /*
     * Note: all the code in this function assumes that the IOVA
     * width is no more than VTD_MGAW bits (which the VT-d spec
     * guarantees); otherwise we would need to consider 64-bit
     * overflow.
     */
3477 
3478     if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
        /*
         * No need to unmap anything beyond the address space that
         * VT-d supports.
         */
3483         end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
3484     }
3485 
3486     assert(start <= end);
3487     size = remain = end - start + 1;
3488 
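    /*
     * Notify in the largest naturally-aligned power-of-2 chunks that
     * fit, since an IOMMUTLBEvent describes a power-of-2 region.  For
     * example, [0x1000, 0x5fff] is emitted as 0x1000 bytes at 0x1000,
     * 0x2000 bytes at 0x2000, and 0x2000 bytes at 0x4000.
     */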
3489     while (remain >= VTD_PAGE_SIZE) {
3490         IOMMUTLBEvent event;
3491         uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits);
3492         uint64_t size = mask + 1;
3493 
3494         assert(size);
3495 
3496         event.type = IOMMU_NOTIFIER_UNMAP;
3497         event.entry.iova = start;
3498         event.entry.addr_mask = mask;
3499         event.entry.target_as = &address_space_memory;
3500         event.entry.perm = IOMMU_NONE;
3501         /* This field is meaningless for unmap */
3502         event.entry.translated_addr = 0;
3503 
3504         memory_region_notify_iommu_one(n, &event);
3505 
3506         start += size;
3507         remain -= size;
3508     }
3509 
3510     assert(!remain);
3511 
3512     trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3513                              VTD_PCI_SLOT(as->devfn),
3514                              VTD_PCI_FUNC(as->devfn),
3515                              n->start, size);
3516 
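    /* Forget the shadow mappings tracked for this range. */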
3517     map.iova = n->start;
3518     map.size = size;
3519     iova_tree_remove(as->iova_tree, &map);
3520 }
3521 
3522 static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3523 {
3524     VTDAddressSpace *vtd_as;
3525     IOMMUNotifier *n;
3526 
3527     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3528         IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3529             vtd_address_space_unmap(vtd_as, n);
3530         }
3531     }
3532 }
3533 
3534 static void vtd_address_space_refresh_all(IntelIOMMUState *s)
3535 {
3536     vtd_address_space_unmap_all(s);
3537     vtd_switch_address_space_all(s);
3538 }
3539 
3540 static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
3541 {
3542     memory_region_notify_iommu_one(private, event);
3543     return 0;
3544 }
3545 
3546 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3547 {
3548     VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3549     IntelIOMMUState *s = vtd_as->iommu_state;
3550     uint8_t bus_n = pci_bus_num(vtd_as->bus);
3551     VTDContextEntry ce;
3552 
    /*
     * The replay can be triggered by either an invalidation or a
     * newly created entry.  Either way, we release the existing
     * mappings first (for UNMAP-only notifiers this amounts to
     * flushing the caches).
     */
3558     vtd_address_space_unmap(vtd_as, n);
3559 
3560     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3561         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
3562                                   "legacy mode",
3563                                   bus_n, PCI_SLOT(vtd_as->devfn),
3564                                   PCI_FUNC(vtd_as->devfn),
3565                                   vtd_get_domain_id(s, &ce),
3566                                   ce.hi, ce.lo);
3567         if (vtd_as_has_map_notifier(vtd_as)) {
        if (vtd_as_has_map_notifier(vtd_as)) {
            /* This is required only for MAP-typed notifiers */
3569             vtd_page_walk_info info = {
3570                 .hook_fn = vtd_replay_hook,
3571                 .private = (void *)n,
3572                 .notify_unmap = false,
3573                 .aw = s->aw_bits,
3574                 .as = vtd_as,
3575                 .domain_id = vtd_get_domain_id(s, &ce),
3576             };
3577 
3578             vtd_page_walk(s, &ce, 0, ~0ULL, &info);
3579         }
3580     } else {
3581         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3582                                     PCI_FUNC(vtd_as->devfn));
3583     }
3586 }
3587 
/* Do the initialization. It is also called on reset, so take care
 * when adding new initialization code.
 */
3591 static void vtd_init(IntelIOMMUState *s)
3592 {
3593     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3594 
3595     memset(s->csr, 0, DMAR_REG_SIZE);
3596     memset(s->wmask, 0, DMAR_REG_SIZE);
3597     memset(s->w1cmask, 0, DMAR_REG_SIZE);
3598     memset(s->womask, 0, DMAR_REG_SIZE);
3599 
3600     s->root = 0;
3601     s->root_scalable = false;
3602     s->dmar_enabled = false;
3603     s->intr_enabled = false;
3604     s->iq_head = 0;
3605     s->iq_tail = 0;
3606     s->iq = 0;
3607     s->iq_size = 0;
3608     s->qi_enabled = false;
3609     s->iq_last_desc_type = VTD_INV_DESC_NONE;
3610     s->iq_dw = false;
3611     s->next_frcd_reg = 0;
3612     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3613              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3614              VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
3615     if (s->dma_drain) {
3616         s->cap |= VTD_CAP_DRAIN;
3617     }
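    /*
     * VTD_CAP_SAGAW_39bit above advertises 3-level page tables;
     * additionally advertise 4-level support when the address width
     * allows it.
     */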
3618     if (s->aw_bits == VTD_HOST_AW_48BIT) {
3619         s->cap |= VTD_CAP_SAGAW_48bit;
3620     }
3621     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3622 
    /*
     * Reserved-field masks for second-level page table entries
     * (SPTEs), indexed by page table level
     */
3626     vtd_spte_rsvd[0] = ~0ULL;
3627     vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
3628                                                   x86_iommu->dt_supported);
3629     vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3630     vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3631     vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3632 
3633     vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
3634                                                          x86_iommu->dt_supported);
3635     vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
3636                                                          x86_iommu->dt_supported);
3637 
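    /*
     * In scalable mode the SNP (snoop) bit is defined in second-level
     * PTEs, so drop it from the reserved-bit masks.
     */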
3638     if (s->scalable_mode) {
3639         vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
3640         vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
3641         vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
3642     }
3643 
3644     if (x86_iommu_ir_supported(x86_iommu)) {
3645         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3646         if (s->intr_eim == ON_OFF_AUTO_ON) {
3647             s->ecap |= VTD_ECAP_EIM;
3648         }
3649         assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3650     }
3651 
3652     if (x86_iommu->dt_supported) {
3653         s->ecap |= VTD_ECAP_DT;
3654     }
3655 
3656     if (x86_iommu->pt_supported) {
3657         s->ecap |= VTD_ECAP_PT;
3658     }
3659 
3660     if (s->caching_mode) {
3661         s->cap |= VTD_CAP_CM;
3662     }
3663 
3664     /* TODO: read cap/ecap from host to decide which cap to be exposed. */
3665     if (s->scalable_mode) {
3666         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
3667     }
3668 
3669     vtd_reset_caches(s);
3670 
3671     /* Define registers with default values and bit semantics */
3672     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3673     vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3674     vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3675     vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3676     vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3677     vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3678     vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
3679     vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3680     vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3681 
3682     /* Advanced Fault Logging not supported */
3683     vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3684     vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3685     vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3686     vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3687 
3688     /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3689      * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3690      */
3691     vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3692 
    /* Treated as RO for implementations that report the PLMR and PHMR
     * fields as Clear in the CAP_REG.
3695      * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3696      */
3697     vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3698 
3699     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3700     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3701     vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
3702     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3703     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3704     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3705     vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
3707     vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3708 
3709     /* IOTLB registers */
    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
3711     vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3712     vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3713 
3714     /* Fault Recording Registers, 128-bit */
3715     vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3716     vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3717 
3718     /*
3719      * Interrupt remapping registers.
3720      */
3721     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3722 }
3723 
/* Should not reset the address spaces on reset, because devices will
 * still use the address space they obtained initially (they won't ask
 * the bus again).
 */
3727 static void vtd_reset(DeviceState *dev)
3728 {
3729     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3730 
3731     vtd_init(s);
3732     vtd_address_space_refresh_all(s);
3733 }
3734 
3735 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3736 {
3737     IntelIOMMUState *s = opaque;
3738     VTDAddressSpace *vtd_as;
3739 
3740     assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3741 
3742     vtd_as = vtd_find_add_as(s, bus, devfn);
3743     return &vtd_as->as;
3744 }
3745 
3746 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3747 {
3748     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3749 
3750     if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
3751         error_setg(errp, "eim=on cannot be selected without intremap=on");
3752         return false;
3753     }
3754 
    if (s->intr_eim == ON_OFF_AUTO_AUTO) {
        s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) &&
                      x86_iommu_ir_supported(x86_iommu) ?
                      ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }
3760     if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3761         if (!kvm_irqchip_in_kernel()) {
3762             error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3763             return false;
3764         }
3765         if (!kvm_enable_x2apic()) {
            error_setg(errp, "eim=on requires support on the KVM side "
                             "(X2APIC_API, first shipped in v4.7)");
3768             return false;
3769         }
3770     }
3771 
3772     /* Currently only address widths supported are 39 and 48 bits */
3773     if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3774         (s->aw_bits != VTD_HOST_AW_48BIT)) {
3775         error_setg(errp, "Supported values for aw-bits are: %d, %d",
3776                    VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3777         return false;
3778     }
3779 
3780     if (s->scalable_mode && !s->dma_drain) {
        error_setg(errp, "Need to set dma-drain=on for scalable mode");
3782         return false;
3783     }
3784 
3785     return true;
3786 }
3787 
3788 static int vtd_machine_done_notify_one(Object *child, void *unused)
3789 {
3790     IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
3791 
    /*
     * We hard-code the check because vfio-pci is the only special
     * case we know of.  Let's be more elegant in the future when we
     * can, but so far there seems to be no better way.
     */
3797     if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
3798         vtd_panic_require_caching_mode();
3799     }
3800 
3801     return 0;
3802 }
3803 
3804 static void vtd_machine_done_hook(Notifier *notifier, void *unused)
3805 {
3806     object_child_foreach_recursive(object_get_root(),
3807                                    vtd_machine_done_notify_one, NULL);
3808 }
3809 
3810 static Notifier vtd_machine_done_notify = {
3811     .notify = vtd_machine_done_hook,
3812 };
3813 
3814 static void vtd_realize(DeviceState *dev, Error **errp)
3815 {
3816     MachineState *ms = MACHINE(qdev_get_machine());
3817     PCMachineState *pcms = PC_MACHINE(ms);
3818     X86MachineState *x86ms = X86_MACHINE(ms);
3819     PCIBus *bus = pcms->bus;
3820     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3821 
3822     if (!vtd_decide_config(s, errp)) {
3823         return;
3824     }
3825 
3826     QLIST_INIT(&s->vtd_as_with_notifiers);
3827     qemu_mutex_init(&s->iommu_lock);
3828     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3829     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3830                           "intel_iommu", DMAR_REG_SIZE);
3831 
    /* Create the memory regions shared by all devices */
3833     memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
3834                        UINT64_MAX);
3835     memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
3836                           s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
3837     memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
3838                              "vtd-sys-alias", get_system_memory(), 0,
3839                              memory_region_size(get_system_memory()));
3840     memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
3841                                         &s->mr_sys_alias, 0);
3842     memory_region_add_subregion_overlap(&s->mr_nodmar,
3843                                         VTD_INTERRUPT_ADDR_FIRST,
3844                                         &s->mr_ir, 1);
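    /*
     * Within the nodmar container the system-memory alias sits at
     * priority 0 and the interrupt window at priority 1, so MSI
     * writes are still intercepted when DMAR is disabled.
     */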
3845 
3846     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3847     /* No corresponding destroy */
3848     s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3849                                      g_free, g_free);
    s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash,
                                                vtd_uint64_equal,
                                                g_free, g_free);
3852     vtd_init(s);
3853     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3854     pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3855     /* Pseudo address space under root PCI bus. */
3856     x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3857     qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
3858 }
3859 
3860 static void vtd_class_init(ObjectClass *klass, void *data)
3861 {
3862     DeviceClass *dc = DEVICE_CLASS(klass);
3863     X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
3864 
3865     dc->reset = vtd_reset;
3866     dc->vmsd = &vtd_vmstate;
3867     device_class_set_props(dc, vtd_properties);
3868     dc->hotpluggable = false;
3869     x86_class->realize = vtd_realize;
3870     x86_class->int_remap = vtd_int_remap;
3871     /* Supported by the pc-q35-* machine types */
3872     dc->user_creatable = true;
3873     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3874     dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
3875 }
3876 
3877 static const TypeInfo vtd_info = {
3878     .name          = TYPE_INTEL_IOMMU_DEVICE,
3879     .parent        = TYPE_X86_IOMMU_DEVICE,
3880     .instance_size = sizeof(IntelIOMMUState),
3881     .class_init    = vtd_class_init,
3882 };
3883 
static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
                                               void *data)
3886 {
3887     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3888 
3889     imrc->translate = vtd_iommu_translate;
3890     imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3891     imrc->replay = vtd_iommu_replay;
3892 }
3893 
3894 static const TypeInfo vtd_iommu_memory_region_info = {
3895     .parent = TYPE_IOMMU_MEMORY_REGION,
3896     .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3897     .class_init = vtd_iommu_memory_region_class_init,
3898 };
3899 
3900 static void vtd_register_types(void)
3901 {
3902     type_register_static(&vtd_info);
3903     type_register_static(&vtd_iommu_memory_region_info);
3904 }
3905 
3906 type_init(vtd_register_types)
3907