xref: /openbmc/qemu/hw/riscv/riscv-iommu.c (revision 7d87775f)
1 /*
2  * QEMU emulation of an RISC-V IOMMU
3  *
4  * Copyright (C) 2021-2023, Rivos Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qom/object.h"
21 #include "hw/pci/pci_bus.h"
22 #include "hw/pci/pci_device.h"
23 #include "hw/qdev-properties.h"
24 #include "hw/riscv/riscv_hart.h"
25 #include "migration/vmstate.h"
26 #include "qapi/error.h"
27 #include "qemu/timer.h"
28 
29 #include "cpu_bits.h"
30 #include "riscv-iommu.h"
31 #include "riscv-iommu-bits.h"
32 #include "trace.h"
33 
/*
 * Cache size limits.
 * NOTE(review): the users of these two constants are outside this chunk;
 * presumably they bound the number of cached device contexts and cached
 * translation (IOT) entries respectively - confirm at the use sites.
 */
#define LIMIT_CACHE_CTX               (1U << 7)
#define LIMIT_CACHE_IOT               (1U << 20)

/*
 * Physical page number conversions.
 * PPN_PHYS: page number  -> byte address (shift left by TARGET_PAGE_BITS).
 * PPN_DOWN: byte address -> page number  (shift right by TARGET_PAGE_BITS).
 * The result has the type of the (promoted) argument, so callers must pass
 * a value wide enough to hold the shifted result.
 */
#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)

/* Forward declarations of the structures defined below. */
typedef struct RISCVIOMMUContext RISCVIOMMUContext;
typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
43 
/*
 * Device assigned I/O address space.
 *
 * Bundles the IOVA memory region and address space of one attached device
 * together with the requester id and a back pointer to the owning IOMMU.
 * Instances are chained through 'list'.
 * NOTE(review): the list head and the code creating these objects are
 * outside this chunk.
 */
struct RISCVIOMMUSpace {
    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
    AddressSpace iova_as;       /* IOVA address space for attached device */
    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
    uint32_t devid;             /* Requester identifier, AKA device_id */
    bool notifier;              /* IOMMU unmap notifier enabled */
    QLIST_ENTRY(RISCVIOMMUSpace) list;
};
53 
/*
 * Device translation context state.
 *
 * Holds the translation-relevant fields for one (devid, process_id) pair.
 * The functions in this file read tc/ta/satp/gatp/msiptp with get_field()
 * using the RISCV_IOMMU_DC_* and RISCV_IOMMU_ATP_* masks, i.e. the members
 * keep the raw register/descriptor encoding rather than decoded values.
 */
struct RISCVIOMMUContext {
    uint64_t devid:24;          /* Requester Id, AKA device_id */
    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
    uint64_t tc;                /* Translation Control */
    uint64_t ta;                /* Translation Attributes */
    uint64_t satp;              /* S-Stage address translation and protection */
    uint64_t gatp;              /* G-Stage address translation and protection */
    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
    uint64_t msiptp;            /* MSI redirection page table pointer */
};
66 
/*
 * Address translation cache entry.
 *
 * All members are bit-fields; addresses are stored as page numbers, not as
 * byte addresses.
 * NOTE(review): the cache that stores these entries is outside this chunk.
 */
struct RISCVIOMMUEntry {
    uint64_t iova:44;           /* IOVA Page Number */
    uint64_t pscid:20;          /* Process Soft-Context identifier */
    uint64_t phys:44;           /* Physical Page Number */
    uint64_t gscid:16;          /* Guest Soft-Context identifier */
    uint64_t perm:2;            /* IOMMU_RW flags */
};
75 
/*
 * IOMMU index for transactions without process_id specified.
 * NOTE(review): the value 0 coincides with process_id == 0; confirm at the
 * use sites (outside this chunk) how the two are told apart.
 */
#define RISCV_IOMMU_NOPROCID 0
78 
79 static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
80 {
81     switch (vec_type) {
82     case RISCV_IOMMU_INTR_CQ:
83         return icvec & RISCV_IOMMU_ICVEC_CIV;
84     case RISCV_IOMMU_INTR_FQ:
85         return (icvec & RISCV_IOMMU_ICVEC_FIV) >> 4;
86     case RISCV_IOMMU_INTR_PM:
87         return (icvec & RISCV_IOMMU_ICVEC_PMIV) >> 8;
88     case RISCV_IOMMU_INTR_PQ:
89         return (icvec & RISCV_IOMMU_ICVEC_PIV) >> 12;
90     default:
91         g_assert_not_reached();
92     }
93 }
94 
95 static void riscv_iommu_notify(RISCVIOMMUState *s, int vec_type)
96 {
97     const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
98     uint32_t ipsr, icvec, vector;
99 
100     if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
101         return;
102     }
103 
104     icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
105     ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec_type), 0);
106 
107     if (!(ipsr & (1 << vec_type))) {
108         vector = riscv_iommu_get_icvec_vector(icvec, vec_type);
109         s->notify(s, vector);
110         trace_riscv_iommu_notify_int_vector(vec_type, vector);
111     }
112 }
113 
114 static void riscv_iommu_fault(RISCVIOMMUState *s,
115                               struct riscv_iommu_fq_record *ev)
116 {
117     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
118     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
119     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
120     uint32_t next = (tail + 1) & s->fq_mask;
121     uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
122 
123     trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
124                           PCI_FUNC(devid), ev->hdr, ev->iotval);
125 
126     if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
127         !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
128         return;
129     }
130 
131     if (head == next) {
132         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
133                               RISCV_IOMMU_FQCSR_FQOF, 0);
134     } else {
135         dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
136         if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
137                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
138             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
139                                   RISCV_IOMMU_FQCSR_FQMF, 0);
140         } else {
141             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
142         }
143     }
144 
145     if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
146         riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
147     }
148 }
149 
150 static void riscv_iommu_pri(RISCVIOMMUState *s,
151     struct riscv_iommu_pq_record *pr)
152 {
153     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
154     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
155     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
156     uint32_t next = (tail + 1) & s->pq_mask;
157     uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
158 
159     trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
160                           PCI_FUNC(devid), pr->payload);
161 
162     if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
163         !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
164         return;
165     }
166 
167     if (head == next) {
168         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
169                               RISCV_IOMMU_PQCSR_PQOF, 0);
170     } else {
171         dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
172         if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
173                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
174             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
175                                   RISCV_IOMMU_PQCSR_PQMF, 0);
176         } else {
177             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
178         }
179     }
180 
181     if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
182         riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
183     }
184 }
185 
/*
 * Parallel bit extract.
 *
 * Keeps only those bits of 'val' whose position holds a one in the mask
 * 'ext' and packs them, in unchanged order, into the least-significant
 * bits of the result. All remaining high bits of the result are zero.
 *
 * Example:
 *
 * val = a b c d e f g h
 * ext = 1 0 1 0 0 1 1 0
 * ret = 0 0 0 0 a c f g
 *
 * The operation comes from the riscv-iommu 1.0 spec, section 2.3.3
 * "Process to translate addresses of MSIs", and corresponds to the x86
 * PEXT (Parallel bits extract) bit manipulation instruction.
 */
static uint64_t riscv_iommu_pext_u64(uint64_t val, uint64_t ext)
{
    uint64_t packed = 0;
    unsigned out_pos = 0;

    /* Visit the set bits of the mask from lowest to highest. */
    for (; ext != 0; ext &= ext - 1) {
        const uint64_t lowest = ext & (~ext + 1);

        if (val & lowest) {
            packed |= (uint64_t)1 << out_pos;
        }
        out_pos++;
    }

    return packed;
}
222 
223 /* Check if GPA matches MSI/MRIF pattern. */
224 static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
225     dma_addr_t gpa)
226 {
227     if (!s->enable_msi) {
228         return false;
229     }
230 
231     if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
232         RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
233         return false; /* Invalid MSI/MRIF mode */
234     }
235 
236     if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
237         return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
238     }
239 
240     return true;
241 }
242 
/*
 * RISCV IOMMU Address Translation Lookup - Page Table Walk
 *
 * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
 * Both implementation can be merged into single helper function in future.
 * Keeping them separate for now, as error reporting and flow specifics are
 * sufficiently different for separate implementation.
 *
 * Inputs taken from @iotlb: 'iova' (address to translate) and 'perm'
 * (requested access, tested against IOMMU_RO / IOMMU_WO).
 * Outputs written to @iotlb on success: 'translated_addr', 'perm',
 * 'addr_mask' and, for trapped MSI writes, 'target_as'.
 *
 * @s        : IOMMU Device State
 * @ctx      : Translation context for device id and process address space id.
 * @iotlb    : translation data: physical address and access mode.
 * @return   : success (0) or fault cause code (RISCV_IOMMU_FQ_CAUSE_*).
 */
static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
    IOMMUTLBEntry *iotlb)
{
    dma_addr_t addr, base;
    uint64_t satp, gatp, pte;
    bool en_s, en_g;
    /* Per-stage walk parameters and progress, indexed by S_STAGE/G_STAGE. */
    struct {
        unsigned char step;      /* number of levels already walked */
        unsigned char levels;    /* total number of page table levels */
        unsigned char ptidxbits; /* index bits consumed per level */
        unsigned char ptesize;   /* size of one PTE in bytes */
    } sc[2];
    /* Translation stage phase */
    enum {
        S_STAGE = 0,
        G_STAGE = 1,
    } pass;
    MemTxResult ret;

    /* At this point satp/gatp hold only the MODE field of each register. */
    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);

    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;

    /*
     * Early check for MSI address match when IOVA == GPA.
     * Note that the (!en_s) condition means that the MSI
     * page table may only be used when guest pages are
     * mapped using the g-stage page table, whether single-
     * or two-stage paging is enabled. It's unavoidable though,
     * because the spec mandates that we do a first-stage
     * translation before we check the MSI page table, which
     * means we can't do an early MSI check unless we have
     * strictly !en_s.
     */
    if (!en_s && (iotlb->perm & IOMMU_WO) &&
        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
        iotlb->target_as = &s->trap_as;
        iotlb->translated_addr = iotlb->iova;
        iotlb->addr_mask = ~TARGET_PAGE_MASK;
        return 0;
    }

    /* Exit early for pass-through mode. */
    if (!(en_s || en_g)) {
        iotlb->translated_addr = iotlb->iova;
        iotlb->addr_mask = ~TARGET_PAGE_MASK;
        /* Allow R/W in pass-through mode */
        iotlb->perm = IOMMU_RW;
        return 0;
    }

    /*
     * S/G translation parameters.
     * The same IOHGATP mode constants are used as case labels for both
     * stages; the capability bit checked differs per stage (plain SvNN for
     * the S-stage, SvNNx4 for the G-stage).
     */
    for (pass = 0; pass < 2; pass++) {
        uint32_t sv_mode;

        sc[pass].step = 0;
        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
            /* 32bit mode for GXL/SXL == 1 */
            switch (pass ? gatp : satp) {
            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
                sc[pass].levels    = 0;
                sc[pass].ptidxbits = 0;
                sc[pass].ptesize   = 0;
                break;
            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
                if (!(s->cap & sv_mode)) {
                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
                }
                sc[pass].levels    = 2;
                sc[pass].ptidxbits = 10;
                sc[pass].ptesize   = 4;
                break;
            default:
                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
            }
        } else {
            /* 64bit mode for GXL/SXL == 0 */
            switch (pass ? gatp : satp) {
            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
                sc[pass].levels    = 0;
                sc[pass].ptidxbits = 0;
                sc[pass].ptesize   = 0;
                break;
            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
                if (!(s->cap & sv_mode)) {
                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
                }
                sc[pass].levels    = 3;
                sc[pass].ptidxbits = 9;
                sc[pass].ptesize   = 8;
                break;
            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
                if (!(s->cap & sv_mode)) {
                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
                }
                sc[pass].levels    = 4;
                sc[pass].ptidxbits = 9;
                sc[pass].ptesize   = 8;
                break;
            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
                if (!(s->cap & sv_mode)) {
                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
                }
                sc[pass].levels    = 5;
                sc[pass].ptidxbits = 9;
                sc[pass].ptesize   = 8;
                break;
            default:
                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
            }
        }
    };

    /*
     * S/G stages translation tables root pointers.
     * From here on satp/gatp are reused to hold the root table addresses.
     * With both stages enabled the walk starts in the G-stage, translating
     * the (guest physical) address of the S-stage root table itself.
     */
    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
    addr = (en_s && en_g) ? satp : iotlb->iova;
    base = en_g ? gatp : satp;
    pass = en_g ? G_STAGE : S_STAGE;

    do {
        /* The G-stage root table is indexed with two extra bits (x4). */
        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
        const unsigned va_bits = widened + sc[pass].ptidxbits;
        /* Number of address bits below the index of the current level. */
        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
                                 (sc[pass].levels - 1 - sc[pass].step);
        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
        /* Stage specific GADE/SADE bit; when set the A/D checks are skipped */
        const bool ade =
            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);

        /* Address range check before first level lookup */
        if (!sc[pass].step) {
            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
            if ((addr & va_mask) != addr) {
                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
            }
        }

        /* Read page table entry */
        if (sc[pass].ptesize == 4) {
            uint32_t pte32 = 0;
            ret = ldl_le_dma(s->target_as, pte_addr, &pte32,
                             MEMTXATTRS_UNSPECIFIED);
            pte = pte32;
        } else {
            ret = ldq_le_dma(s->target_as, pte_addr, &pte,
                             MEMTXATTRS_UNSPECIFIED);
        }
        if (ret != MEMTX_OK) {
            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
        }

        sc[pass].step++;
        hwaddr ppn = pte >> PTE_PPN_SHIFT;

        /* Every 'break' below leaves the loop and reports a page fault. */
        if (!(pte & PTE_V)) {
            break;                /* Invalid PTE */
        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
            break;                /* Reserved leaf PTE flags: PTE_W */
        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
            break;                /* Misaligned PPN */
        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
            break;                /* Read access check failed */
        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
            break;                /* Write access check failed */
        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
            break;                /* Access bit not set */
        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
            break;                /* Dirty bit not set */
        } else {
            /* Leaf PTE, translation completed. */
            sc[pass].step = sc[pass].levels;
            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
            /*
             * Update address mask based on smallest translation granularity.
             * NOTE(review): this narrows whatever addr_mask already holds;
             * assumes the caller initialised it - confirm at the call site.
             */
            iotlb->addr_mask &= (1ULL << va_skip) - 1;
            /* Continue with S-Stage translation? */
            if (pass && sc[0].step != sc[0].levels) {
                /*
                 * 'base' now holds the translated address of an S-stage
                 * table; resume the S-stage walk of the original IOVA.
                 */
                pass = S_STAGE;
                addr = iotlb->iova;
                continue;
            }
            /* Translation phase completed (GPA or SPA) */
            iotlb->translated_addr = base;
            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
                                                         : IOMMU_RO;

            /* Check MSI GPA address match */
            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
                riscv_iommu_msi_check(s, ctx, base)) {
                /* Trap MSI writes and return GPA address. */
                iotlb->target_as = &s->trap_as;
                iotlb->addr_mask = ~TARGET_PAGE_MASK;
                return 0;
            }

            /* Continue with G-Stage translation? */
            if (!pass && en_g) {
                /* Translate the final GPA through the G-stage from its root */
                pass = G_STAGE;
                addr = base;
                base = gatp;
                sc[pass].step = 0;
                continue;
            }

            return 0;
        }

        /* Only reached for inner (non-leaf) PTEs. */
        if (sc[pass].step == sc[pass].levels) {
            break; /* Can't find leaf PTE */
        }

        /* Continue with G-Stage translation? */
        if (!pass && en_g) {
            /*
             * The next-level S-stage table address is a GPA: translate it
             * through the G-stage before reading from it.
             */
            pass = G_STAGE;
            addr = base;
            base = gatp;
            sc[pass].step = 0;
        }
    } while (1);

    /* Fault cause depends on the requested access and the failing stage. */
    return (iotlb->perm & IOMMU_WO) ?
                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
}
494 
495 static void riscv_iommu_report_fault(RISCVIOMMUState *s,
496                                      RISCVIOMMUContext *ctx,
497                                      uint32_t fault_type, uint32_t cause,
498                                      bool pv,
499                                      uint64_t iotval, uint64_t iotval2)
500 {
501     struct riscv_iommu_fq_record ev = { 0 };
502 
503     if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
504         switch (cause) {
505         case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
506         case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
507         case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
508         case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
509         case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
510         case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
511         case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
512             break;
513         default:
514             /* DTF prevents reporting a fault for this given cause */
515             return;
516         }
517     }
518 
519     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
520     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
521     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
522     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
523 
524     if (pv) {
525         ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
526     }
527 
528     ev.iotval = iotval;
529     ev.iotval2 = iotval2;
530 
531     riscv_iommu_fault(s, &ev);
532 }
533 
/*
 * Redirect MSI write for given GPA.
 *
 * Looks up the MSI page table entry selected by @gpa and then either
 * forwards the write to the address in the PTE (basic mode) or updates the
 * memory-resident interrupt file and, if enabled, sends the notification
 * MSI (MRIF mode). Every failure is reported through
 * riscv_iommu_report_fault() with transaction type
 * RISCV_IOMMU_FQ_TTYPE_UADDR_WR.
 *
 * @s      : IOMMU device state.
 * @ctx    : translation context supplying msiptp and msi_addr_mask.
 * @gpa    : guest physical address targeted by the MSI write.
 * @data   : value written by the device.
 * @size   : size of the write in bytes.
 * @attrs  : memory transaction attributes forwarded to the DMA accesses.
 * @return : MEMTX_OK on success, otherwise the failing MemTxResult
 *           (MEMTX_ACCESS_ERROR for errors detected locally).
 */
static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
    unsigned size, MemTxAttrs attrs)
{
    MemTxResult res;
    dma_addr_t addr;
    uint64_t intn;      /* interrupt file number, later reused as MRIF word */
    uint32_t n190;      /* notification interrupt identity (MRIF mode) */
    uint64_t pte[2];    /* the two 64-bit words of the MSI PTE */
    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
    int cause;

    /* Interrupt File Number */
    intn = riscv_iommu_pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
    if (intn >= 256) {
        /* Interrupt file number out of range */
        res = MEMTX_ACCESS_ERROR;
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
        goto err;
    }

    /* fetch MSI PTE */
    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
    addr = addr | (intn * sizeof(pte));
    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
            MEMTXATTRS_UNSPECIFIED);
    if (res != MEMTX_OK) {
        if (res == MEMTX_DECODE_ERROR) {
            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
        } else {
            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
        }
        goto err;
    }

    /* The PTE is stored little-endian in memory. */
    le64_to_cpus(&pte[0]);
    le64_to_cpus(&pte[1]);

    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
        /*
         * The spec mentions that: "If msipte.C == 1, then further
         * processing to interpret the PTE is implementation
         * defined.". We'll abort with cause = 262 for this
         * case too.
         */
        res = MEMTX_ACCESS_ERROR;
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
        goto err;
    }

    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
    case RISCV_IOMMU_MSI_PTE_M_BASIC:
        /* MSI Pass-through mode */
        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));

        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
                              gpa, addr);

        /*
         * NOTE(review): the first 'size' bytes of the host-order 'data'
         * are written as-is; confirm the expected byte order of 'data'
         * against the caller (outside this chunk).
         */
        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
        if (res != MEMTX_OK) {
            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
            goto err;
        }

        return MEMTX_OK;
    case RISCV_IOMMU_MSI_PTE_M_MRIF:
        /* MRIF mode, continue. */
        break;
    default:
        res = MEMTX_ACCESS_ERROR;
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
        goto err;
    }

    /*
     * Report an error for interrupt identities exceeding the maximum allowed
     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
     */
    if ((data > 2047) || (gpa & 3)) {
        res = MEMTX_ACCESS_ERROR;
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
        goto err;
    }

    /* MSI MRIF mode, non atomic pending bit update */

    /*
     * MRIF pending bit address: bits 10:6 of the interrupt identity select
     * a 16-byte pair of 64-bit words (the second word is read as the
     * enable bits further below); bits 5:0 select the bit within a word.
     */
    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
    addr = addr | ((data & 0x7c0) >> 3);

    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
                          gpa, addr);

    /*
     * MRIF pending bit mask.
     * NOTE(review): unlike the PTE above, the MRIF words are read and
     * written without byte-order conversion - confirm behaviour on
     * big-endian hosts.
     */
    data = 1ULL << (data & 0x03f);
    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
    if (res != MEMTX_OK) {
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
        goto err;
    }

    intn = intn | data;
    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
    if (res != MEMTX_OK) {
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
        goto err;
    }

    /* Get MRIF enable bits */
    addr = addr + sizeof(intn);
    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
    if (res != MEMTX_OK) {
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
        goto err;
    }

    if (!(intn & data)) {
        /* notification disabled, MRIF update completed. */
        return MEMTX_OK;
    }

    /* Send notification message */
    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
    /* Notification identity: NID low bits plus NID_MSB placed at bit 10. */
    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);

    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
    if (res != MEMTX_OK) {
        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
        goto err;
    }

    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);

    return MEMTX_OK;

err:
    /* Common failure exit: 'res' and 'cause' were set at the goto site. */
    riscv_iommu_report_fault(s, ctx, fault_type, cause,
                             !!ctx->process_id, 0, 0);
    return res;
}
679 
680 /*
681  * Check device context configuration as described by the
682  * riscv-iommu spec section "Device-context configuration
683  * checks".
684  */
685 static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
686                                             RISCVIOMMUContext *ctx)
687 {
688     uint32_t fsc_mode, msi_mode;
689     uint64_t gatp;
690 
691     if (!(s->cap & RISCV_IOMMU_CAP_ATS) &&
692         (ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS ||
693          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI ||
694          ctx->tc & RISCV_IOMMU_DC_TC_PRPR)) {
695         return false;
696     }
697 
698     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS) &&
699         (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA ||
700          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI)) {
701         return false;
702     }
703 
704     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
705         ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
706         return false;
707     }
708 
709     if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
710         ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
711         return false;
712     }
713 
714     if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
715         msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
716 
717         if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
718             msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
719             return false;
720         }
721     }
722 
723     gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
724     if (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA &&
725         gatp == RISCV_IOMMU_DC_IOHGATP_MODE_BARE) {
726         return false;
727     }
728 
729     fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
730 
731     if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
732         switch (fsc_mode) {
733         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
734             if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
735                 return false;
736             }
737             break;
738         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
739             if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
740                 return false;
741             }
742             break;
743         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
744             if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
745                 return false;
746             }
747             break;
748         }
749     } else {
750         /* DC.tc.PDTV is 0 */
751         if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
752             return false;
753         }
754 
755         if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
756             if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
757                 !(s->cap & RISCV_IOMMU_CAP_SV32)) {
758                 return false;
759             }
760         } else {
761             switch (fsc_mode) {
762             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
763                 if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
764                     return false;
765                 }
766                 break;
767             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
768                 if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
769                     return false;
770                 }
771             break;
772             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
773                 if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
774                     return false;
775                 }
776                 break;
777             }
778         }
779     }
780 
781     /*
782      * CAP_END is always zero (only one endianess). FCTL_BE is
783      * always zero (little-endian accesses). Thus TC_SBE must
784      * always be LE, i.e. zero.
785      */
786     if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
787         return false;
788     }
789 
790     return true;
791 }
792 
793 /*
794  * Validate process context (PC) according to section
795  * "Process-context configuration checks".
796  */
797 static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
798                                              RISCVIOMMUContext *ctx)
799 {
800     uint32_t mode;
801 
802     if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
803         return false;
804     }
805 
806     if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
807         return false;
808     }
809 
810     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
811     switch (mode) {
812     case RISCV_IOMMU_DC_FSC_MODE_BARE:
813     /* sv39 and sv32 modes have the same value (8) */
814     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
815     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
816     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
817         break;
818     default:
819         return false;
820     }
821 
822     if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
823         if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 &&
824             !(s->cap & RISCV_IOMMU_CAP_SV32)) {
825                 return false;
826         }
827     } else {
828         switch (mode) {
829         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
830             if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
831                 return false;
832             }
833             break;
834         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
835             if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
836                 return false;
837             }
838             break;
839         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
840             if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
841                 return false;
842             }
843             break;
844         }
845     }
846 
847     return true;
848 }
849 
850 /*
851  * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
852  *
853  * @s         : IOMMU Device State
854  * @ctx       : Device Translation Context with devid and process_id set.
855  * @return    : success or fault code.
856  */
857 static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
858 {
859     const uint64_t ddtp = s->ddtp;
860     unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
861     dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
862     struct riscv_iommu_dc dc;
863     /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
864     const int dc_fmt = !s->enable_msi;
865     const size_t dc_len = sizeof(dc) >> dc_fmt;
866     int depth;
867     uint64_t de;
868 
869     switch (mode) {
870     case RISCV_IOMMU_DDTP_MODE_OFF:
871         return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
872 
873     case RISCV_IOMMU_DDTP_MODE_BARE:
874         /* mock up pass-through translation context */
875         ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
876             RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
877         ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
878             RISCV_IOMMU_DC_FSC_MODE_BARE);
879 
880         ctx->tc = RISCV_IOMMU_DC_TC_V;
881         if (s->enable_ats) {
882             ctx->tc |= RISCV_IOMMU_DC_TC_EN_ATS;
883         }
884 
885         ctx->ta = 0;
886         ctx->msiptp = 0;
887         return 0;
888 
889     case RISCV_IOMMU_DDTP_MODE_1LVL:
890         depth = 0;
891         break;
892 
893     case RISCV_IOMMU_DDTP_MODE_2LVL:
894         depth = 1;
895         break;
896 
897     case RISCV_IOMMU_DDTP_MODE_3LVL:
898         depth = 2;
899         break;
900 
901     default:
902         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
903     }
904 
905     /*
906      * Check supported device id width (in bits).
907      * See IOMMU Specification, Chapter 6. Software guidelines.
908      * - if extended device-context format is used:
909      *   1LVL: 6, 2LVL: 15, 3LVL: 24
910      * - if base device-context format is used:
911      *   1LVL: 7, 2LVL: 16, 3LVL: 24
912      */
913     if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
914         return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
915     }
916 
917     /* Device directory tree walk */
918     for (; depth-- > 0; ) {
919         /*
920          * Select device id index bits based on device directory tree level
921          * and device context format.
922          * See IOMMU Specification, Chapter 2. Data Structures.
923          * - if extended device-context format is used:
924          *   device index: [23:15][14:6][5:0]
925          * - if base device-context format is used:
926          *   device index: [23:16][15:7][6:0]
927          */
928         const int split = depth * 9 + 6 + dc_fmt;
929         addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
930         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
931                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
932             return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
933         }
934         le64_to_cpus(&de);
935         if (!(de & RISCV_IOMMU_DDTE_VALID)) {
936             /* invalid directory entry */
937             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
938         }
939         if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
940             /* reserved bits set */
941             return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
942         }
943         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
944     }
945 
946     /* index into device context entry page */
947     addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
948 
949     memset(&dc, 0, sizeof(dc));
950     if (dma_memory_read(s->target_as, addr, &dc, dc_len,
951                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
952         return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
953     }
954 
955     /* Set translation context. */
956     ctx->tc = le64_to_cpu(dc.tc);
957     ctx->gatp = le64_to_cpu(dc.iohgatp);
958     ctx->satp = le64_to_cpu(dc.fsc);
959     ctx->ta = le64_to_cpu(dc.ta);
960     ctx->msiptp = le64_to_cpu(dc.msiptp);
961     ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
962     ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
963 
964     if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
965         return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
966     }
967 
968     if (!riscv_iommu_validate_device_ctx(s, ctx)) {
969         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
970     }
971 
972     /* FSC field checks */
973     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
974     addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
975 
976     if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
977         if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
978             /* PID is disabled */
979             return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
980         }
981         if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
982             /* Invalid translation mode */
983             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
984         }
985         return 0;
986     }
987 
988     if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
989         if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
990             /* No default process_id enabled, set BARE mode */
991             ctx->satp = 0ULL;
992             return 0;
993         } else {
994             /* Use default process_id #0 */
995             ctx->process_id = 0;
996         }
997     }
998 
999     if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
1000         /* No S-Stage translation, done. */
1001         return 0;
1002     }
1003 
1004     /* FSC.TC.PDTV enabled */
1005     if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
1006         /* Invalid PDTP.MODE */
1007         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
1008     }
1009 
1010     for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
1011         /*
1012          * Select process id index bits based on process directory tree
1013          * level. See IOMMU Specification, 2.2. Process-Directory-Table.
1014          */
1015         const int split = depth * 9 + 8;
1016         addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
1017         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
1018                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1019             return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1020         }
1021         le64_to_cpus(&de);
1022         if (!(de & RISCV_IOMMU_PC_TA_V)) {
1023             return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1024         }
1025         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
1026     }
1027 
1028     /* Leaf entry in PDT */
1029     addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
1030     if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
1031                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1032         return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1033     }
1034 
1035     /* Use FSC and TA from process directory entry. */
1036     ctx->ta = le64_to_cpu(dc.ta);
1037     ctx->satp = le64_to_cpu(dc.fsc);
1038 
1039     if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
1040         return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1041     }
1042 
1043     if (!riscv_iommu_validate_process_ctx(s, ctx)) {
1044         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
1045     }
1046 
1047     return 0;
1048 }
1049 
1050 /* Translation Context cache support */
1051 static gboolean riscv_iommu_ctx_equal(gconstpointer v1, gconstpointer v2)
1052 {
1053     RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
1054     RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
1055     return c1->devid == c2->devid &&
1056            c1->process_id == c2->process_id;
1057 }
1058 
1059 static guint riscv_iommu_ctx_hash(gconstpointer v)
1060 {
1061     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
1062     /*
1063      * Generate simple hash of (process_id, devid)
1064      * assuming 24-bit wide devid.
1065      */
1066     return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
1067 }
1068 
1069 static void riscv_iommu_ctx_inval_devid_procid(gpointer key, gpointer value,
1070                                                gpointer data)
1071 {
1072     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1073     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1074     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1075         ctx->devid == arg->devid &&
1076         ctx->process_id == arg->process_id) {
1077         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1078     }
1079 }
1080 
1081 static void riscv_iommu_ctx_inval_devid(gpointer key, gpointer value,
1082                                         gpointer data)
1083 {
1084     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1085     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1086     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1087         ctx->devid == arg->devid) {
1088         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1089     }
1090 }
1091 
1092 static void riscv_iommu_ctx_inval_all(gpointer key, gpointer value,
1093                                       gpointer data)
1094 {
1095     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1096     if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
1097         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1098     }
1099 }
1100 
1101 static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
1102                                   uint32_t devid, uint32_t process_id)
1103 {
1104     GHashTable *ctx_cache;
1105     RISCVIOMMUContext key = {
1106         .devid = devid,
1107         .process_id = process_id,
1108     };
1109     ctx_cache = g_hash_table_ref(s->ctx_cache);
1110     g_hash_table_foreach(ctx_cache, func, &key);
1111     g_hash_table_unref(ctx_cache);
1112 }
1113 
1114 /* Find or allocate translation context for a given {device_id, process_id} */
1115 static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
1116                                           unsigned devid, unsigned process_id,
1117                                           void **ref)
1118 {
1119     GHashTable *ctx_cache;
1120     RISCVIOMMUContext *ctx;
1121     RISCVIOMMUContext key = {
1122         .devid = devid,
1123         .process_id = process_id,
1124     };
1125 
1126     ctx_cache = g_hash_table_ref(s->ctx_cache);
1127     ctx = g_hash_table_lookup(ctx_cache, &key);
1128 
1129     if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
1130         *ref = ctx_cache;
1131         return ctx;
1132     }
1133 
1134     ctx = g_new0(RISCVIOMMUContext, 1);
1135     ctx->devid = devid;
1136     ctx->process_id = process_id;
1137 
1138     int fault = riscv_iommu_ctx_fetch(s, ctx);
1139     if (!fault) {
1140         if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
1141             g_hash_table_unref(ctx_cache);
1142             ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
1143                                               riscv_iommu_ctx_equal,
1144                                               g_free, NULL);
1145             g_hash_table_ref(ctx_cache);
1146             g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
1147         }
1148         g_hash_table_add(ctx_cache, ctx);
1149         *ref = ctx_cache;
1150         return ctx;
1151     }
1152 
1153     g_hash_table_unref(ctx_cache);
1154     *ref = NULL;
1155 
1156     riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
1157                              fault, !!process_id, 0, 0);
1158 
1159     g_free(ctx);
1160     return NULL;
1161 }
1162 
1163 static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
1164 {
1165     if (ref) {
1166         g_hash_table_unref((GHashTable *)ref);
1167     }
1168 }
1169 
1170 /* Find or allocate address space for a given device */
1171 static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
1172 {
1173     RISCVIOMMUSpace *as;
1174 
1175     /* FIXME: PCIe bus remapping for attached endpoints. */
1176     devid |= s->bus << 8;
1177 
1178     QLIST_FOREACH(as, &s->spaces, list) {
1179         if (as->devid == devid) {
1180             break;
1181         }
1182     }
1183 
1184     if (as == NULL) {
1185         char name[64];
1186         as = g_new0(RISCVIOMMUSpace, 1);
1187 
1188         as->iommu = s;
1189         as->devid = devid;
1190 
1191         snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
1192             PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1193 
1194         /* IOVA address space, untranslated addresses */
1195         memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
1196             TYPE_RISCV_IOMMU_MEMORY_REGION,
1197             OBJECT(as), "riscv_iommu", UINT64_MAX);
1198         address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
1199 
1200         QLIST_INSERT_HEAD(&s->spaces, as, list);
1201 
1202         trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
1203                 PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1204     }
1205     return &as->iova_as;
1206 }
1207 
1208 /* Translation Object cache support */
1209 static gboolean riscv_iommu_iot_equal(gconstpointer v1, gconstpointer v2)
1210 {
1211     RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
1212     RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
1213     return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
1214            t1->iova == t2->iova;
1215 }
1216 
1217 static guint riscv_iommu_iot_hash(gconstpointer v)
1218 {
1219     RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
1220     return (guint)t->iova;
1221 }
1222 
1223 /* GV: 1 PSCV: 1 AV: 1 */
1224 static void riscv_iommu_iot_inval_pscid_iova(gpointer key, gpointer value,
1225                                              gpointer data)
1226 {
1227     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1228     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1229     if (iot->gscid == arg->gscid &&
1230         iot->pscid == arg->pscid &&
1231         iot->iova == arg->iova) {
1232         iot->perm = IOMMU_NONE;
1233     }
1234 }
1235 
1236 /* GV: 1 PSCV: 1 AV: 0 */
1237 static void riscv_iommu_iot_inval_pscid(gpointer key, gpointer value,
1238                                         gpointer data)
1239 {
1240     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1241     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1242     if (iot->gscid == arg->gscid &&
1243         iot->pscid == arg->pscid) {
1244         iot->perm = IOMMU_NONE;
1245     }
1246 }
1247 
1248 /* GV: 1 GVMA: 1 */
1249 static void riscv_iommu_iot_inval_gscid_gpa(gpointer key, gpointer value,
1250                                             gpointer data)
1251 {
1252     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1253     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1254     if (iot->gscid == arg->gscid) {
1255         /* simplified cache, no GPA matching */
1256         iot->perm = IOMMU_NONE;
1257     }
1258 }
1259 
1260 /* GV: 1 GVMA: 0 */
1261 static void riscv_iommu_iot_inval_gscid(gpointer key, gpointer value,
1262                                         gpointer data)
1263 {
1264     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1265     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1266     if (iot->gscid == arg->gscid) {
1267         iot->perm = IOMMU_NONE;
1268     }
1269 }
1270 
1271 /* GV: 0 */
1272 static void riscv_iommu_iot_inval_all(gpointer key, gpointer value,
1273                                       gpointer data)
1274 {
1275     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1276     iot->perm = IOMMU_NONE;
1277 }
1278 
1279 /* caller should keep ref-count for iot_cache object */
1280 static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
1281     GHashTable *iot_cache, hwaddr iova)
1282 {
1283     RISCVIOMMUEntry key = {
1284         .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
1285         .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
1286         .iova  = PPN_DOWN(iova),
1287     };
1288     return g_hash_table_lookup(iot_cache, &key);
1289 }
1290 
1291 /* caller should keep ref-count for iot_cache object */
1292 static void riscv_iommu_iot_update(RISCVIOMMUState *s,
1293     GHashTable *iot_cache, RISCVIOMMUEntry *iot)
1294 {
1295     if (!s->iot_limit) {
1296         return;
1297     }
1298 
1299     if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
1300         iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
1301                                           riscv_iommu_iot_equal,
1302                                           g_free, NULL);
1303         g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
1304     }
1305     g_hash_table_add(iot_cache, iot);
1306 }
1307 
1308 static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
1309     uint32_t gscid, uint32_t pscid, hwaddr iova)
1310 {
1311     GHashTable *iot_cache;
1312     RISCVIOMMUEntry key = {
1313         .gscid = gscid,
1314         .pscid = pscid,
1315         .iova  = PPN_DOWN(iova),
1316     };
1317 
1318     iot_cache = g_hash_table_ref(s->iot_cache);
1319     g_hash_table_foreach(iot_cache, func, &key);
1320     g_hash_table_unref(iot_cache);
1321 }
1322 
1323 static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
1324     IOMMUTLBEntry *iotlb, bool enable_cache)
1325 {
1326     RISCVIOMMUEntry *iot;
1327     IOMMUAccessFlags perm;
1328     bool enable_pid;
1329     bool enable_pri;
1330     GHashTable *iot_cache;
1331     int fault;
1332 
1333     iot_cache = g_hash_table_ref(s->iot_cache);
1334     /*
1335      * TC[32] is reserved for custom extensions, used here to temporarily
1336      * enable automatic page-request generation for ATS queries.
1337      */
1338     enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
1339     enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
1340 
1341     /* Check for ATS request. */
1342     if (iotlb->perm == IOMMU_NONE) {
1343         /* Check if ATS is disabled. */
1344         if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
1345             enable_pri = false;
1346             fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
1347             goto done;
1348         }
1349     }
1350 
1351     iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
1352     perm = iot ? iot->perm : IOMMU_NONE;
1353     if (perm != IOMMU_NONE) {
1354         iotlb->translated_addr = PPN_PHYS(iot->phys);
1355         iotlb->addr_mask = ~TARGET_PAGE_MASK;
1356         iotlb->perm = perm;
1357         fault = 0;
1358         goto done;
1359     }
1360 
1361     /* Translate using device directory / page table information. */
1362     fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
1363 
1364     if (!fault && iotlb->target_as == &s->trap_as) {
1365         /* Do not cache trapped MSI translations */
1366         goto done;
1367     }
1368 
1369     /*
1370      * We made an implementation choice to not cache identity-mapped
1371      * translations, as allowed by the specification, to avoid
1372      * translation cache evictions for other devices sharing the
1373      * IOMMU hardware model.
1374      */
1375     if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
1376         iot = g_new0(RISCVIOMMUEntry, 1);
1377         iot->iova = PPN_DOWN(iotlb->iova);
1378         iot->phys = PPN_DOWN(iotlb->translated_addr);
1379         iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
1380         iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
1381         iot->perm = iotlb->perm;
1382         riscv_iommu_iot_update(s, iot_cache, iot);
1383     }
1384 
1385 done:
1386     g_hash_table_unref(iot_cache);
1387 
1388     if (enable_pri && fault) {
1389         struct riscv_iommu_pq_record pr = {0};
1390         if (enable_pid) {
1391             pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
1392                                RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
1393         }
1394         pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
1395         pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
1396                      RISCV_IOMMU_PREQ_PAYLOAD_M;
1397         riscv_iommu_pri(s, &pr);
1398         return fault;
1399     }
1400 
1401     if (fault) {
1402         unsigned ttype = RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ;
1403 
1404         if (iotlb->perm & IOMMU_RW) {
1405             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
1406         } else if (iotlb->perm & IOMMU_RO) {
1407             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
1408         }
1409 
1410         riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
1411                                  iotlb->iova, iotlb->translated_addr);
1412         return fault;
1413     }
1414 
1415     return 0;
1416 }
1417 
1418 /* IOMMU Command Interface */
1419 static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
1420     uint64_t addr, uint32_t data)
1421 {
1422     /*
1423      * ATS processing in this implementation of the IOMMU is synchronous,
1424      * no need to wait for completions here.
1425      */
1426     if (!notify) {
1427         return MEMTX_OK;
1428     }
1429 
1430     return dma_memory_write(s->target_as, addr, &data, sizeof(data),
1431         MEMTXATTRS_UNSPECIFIED);
1432 }
1433 
1434 static void riscv_iommu_ats(RISCVIOMMUState *s,
1435     struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
1436     IOMMUAccessFlags perm,
1437     void (*trace_fn)(const char *id))
1438 {
1439     RISCVIOMMUSpace *as = NULL;
1440     IOMMUNotifier *n;
1441     IOMMUTLBEvent event;
1442     uint32_t pid;
1443     uint32_t devid;
1444     const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
1445 
1446     if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
1447         /* Use device segment and requester id */
1448         devid = get_field(cmd->dword0,
1449             RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
1450     } else {
1451         devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
1452     }
1453 
1454     pid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
1455 
1456     QLIST_FOREACH(as, &s->spaces, list) {
1457         if (as->devid == devid) {
1458             break;
1459         }
1460     }
1461 
1462     if (!as || !as->notifier) {
1463         return;
1464     }
1465 
1466     event.type = flag;
1467     event.entry.perm = perm;
1468     event.entry.target_as = s->target_as;
1469 
1470     IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
1471         if (!pv || n->iommu_idx == pid) {
1472             event.entry.iova = n->start;
1473             event.entry.addr_mask = n->end - n->start;
1474             trace_fn(as->iova_mr.parent_obj.name);
1475             memory_region_notify_iommu_one(n, &event);
1476         }
1477     }
1478 }
1479 
1480 static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
1481     struct riscv_iommu_command *cmd)
1482 {
1483     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
1484                            trace_riscv_iommu_ats_inval);
1485 }
1486 
1487 static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
1488     struct riscv_iommu_command *cmd)
1489 {
1490     unsigned resp_code = get_field(cmd->dword1,
1491                                    RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
1492 
1493     /* Using the access flag to carry response code information */
1494     IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
1495     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
1496                            trace_riscv_iommu_ats_prgr);
1497 }
1498 
1499 static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
1500 {
1501     uint64_t old_ddtp = s->ddtp;
1502     uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
1503     unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
1504     unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
1505     bool ok = false;
1506 
1507     /*
1508      * Check for allowed DDTP.MODE transitions:
1509      * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
1510      * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
1511      */
1512     if (new_mode == old_mode ||
1513         new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1514         new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
1515         ok = true;
1516     } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
1517                new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
1518                new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
1519         ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1520              old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
1521     }
1522 
1523     if (ok) {
1524         /* clear reserved and busy bits, report back sanitized version */
1525         new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
1526                              RISCV_IOMMU_DDTP_MODE, new_mode);
1527     } else {
1528         new_ddtp = old_ddtp;
1529     }
1530     s->ddtp = new_ddtp;
1531 
1532     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
1533 }
1534 
/*
 * Command function and opcode field.
 * The function value is placed above the low 7 bits holding the opcode.
 */
#define RISCV_IOMMU_CMD(func, op) ((op) | ((func) << 7))
1537 
1538 static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
1539 {
1540     struct riscv_iommu_command cmd;
1541     MemTxResult res;
1542     dma_addr_t addr;
1543     uint32_t tail, head, ctrl;
1544     uint64_t cmd_opcode;
1545     GHFunc func;
1546 
1547     ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1548     tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
1549     head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
1550 
1551     /* Check for pending error or queue processing disabled */
1552     if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
1553         !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
1554         return;
1555     }
1556 
1557     while (tail != head) {
1558         addr = s->cq_addr  + head * sizeof(cmd);
1559         res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
1560                               MEMTXATTRS_UNSPECIFIED);
1561 
1562         if (res != MEMTX_OK) {
1563             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1564                                   RISCV_IOMMU_CQCSR_CQMF, 0);
1565             goto fault;
1566         }
1567 
1568         trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
1569 
1570         cmd_opcode = get_field(cmd.dword0,
1571                                RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
1572 
1573         switch (cmd_opcode) {
1574         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
1575                              RISCV_IOMMU_CMD_IOFENCE_OPCODE):
1576             res = riscv_iommu_iofence(s,
1577                 cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1 << 2,
1578                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
1579 
1580             if (res != MEMTX_OK) {
1581                 riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1582                                       RISCV_IOMMU_CQCSR_CQMF, 0);
1583                 goto fault;
1584             }
1585             break;
1586 
1587         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
1588                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1589             if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
1590                 /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
1591                 goto cmd_ill;
1592             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1593                 /* invalidate all cache mappings */
1594                 func = riscv_iommu_iot_inval_all;
1595             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1596                 /* invalidate cache matching GSCID */
1597                 func = riscv_iommu_iot_inval_gscid;
1598             } else {
1599                 /* invalidate cache matching GSCID and ADDR (GPA) */
1600                 func = riscv_iommu_iot_inval_gscid_gpa;
1601             }
1602             riscv_iommu_iot_inval(s, func,
1603                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
1604                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1605             break;
1606 
1607         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
1608                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1609             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1610                 /* invalidate all cache mappings, simplified model */
1611                 func = riscv_iommu_iot_inval_all;
1612             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
1613                 /* invalidate cache matching GSCID, simplified model */
1614                 func = riscv_iommu_iot_inval_gscid;
1615             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1616                 /* invalidate cache matching GSCID and PSCID */
1617                 func = riscv_iommu_iot_inval_pscid;
1618             } else {
1619                 /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
1620                 func = riscv_iommu_iot_inval_pscid_iova;
1621             }
1622             riscv_iommu_iot_inval(s, func,
1623                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
1624                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
1625                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1626             break;
1627 
1628         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
1629                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1630             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1631                 /* invalidate all device context cache mappings */
1632                 func = riscv_iommu_ctx_inval_all;
1633             } else {
1634                 /* invalidate all device context matching DID */
1635                 func = riscv_iommu_ctx_inval_devid;
1636             }
1637             riscv_iommu_ctx_inval(s, func,
1638                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
1639             break;
1640 
1641         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
1642                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1643             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1644                 /* illegal command arguments IODIR_PDT & DV == 0 */
1645                 goto cmd_ill;
1646             } else {
1647                 func = riscv_iommu_ctx_inval_devid_procid;
1648             }
1649             riscv_iommu_ctx_inval(s, func,
1650                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
1651                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
1652             break;
1653 
1654         /* ATS commands */
1655         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
1656                              RISCV_IOMMU_CMD_ATS_OPCODE):
1657             if (!s->enable_ats) {
1658                 goto cmd_ill;
1659             }
1660 
1661             riscv_iommu_ats_inval(s, &cmd);
1662             break;
1663 
1664         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
1665                              RISCV_IOMMU_CMD_ATS_OPCODE):
1666             if (!s->enable_ats) {
1667                 goto cmd_ill;
1668             }
1669 
1670             riscv_iommu_ats_prgr(s, &cmd);
1671             break;
1672 
1673         default:
1674         cmd_ill:
1675             /* Invalid instruction, do not advance instruction index. */
1676             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1677                 RISCV_IOMMU_CQCSR_CMD_ILL, 0);
1678             goto fault;
1679         }
1680 
1681         /* Advance and update head pointer after command completes. */
1682         head = (head + 1) & s->cq_mask;
1683         riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
1684     }
1685     return;
1686 
1687 fault:
1688     if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
1689         riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
1690     }
1691 }
1692 
1693 static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
1694 {
1695     uint64_t base;
1696     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1697     uint32_t ctrl_clr;
1698     bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
1699     bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
1700 
1701     if (enable && !active) {
1702         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
1703         s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
1704         s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
1705         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
1706         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
1707         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
1708         ctrl_set = RISCV_IOMMU_CQCSR_CQON;
1709         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
1710                    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
1711                    RISCV_IOMMU_CQCSR_FENCE_W_IP;
1712     } else if (!enable && active) {
1713         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
1714         ctrl_set = 0;
1715         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
1716     } else {
1717         ctrl_set = 0;
1718         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
1719     }
1720 
1721     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
1722 }
1723 
1724 static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
1725 {
1726     uint64_t base;
1727     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1728     uint32_t ctrl_clr;
1729     bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
1730     bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
1731 
1732     if (enable && !active) {
1733         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
1734         s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
1735         s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
1736         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
1737         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
1738         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
1739         ctrl_set = RISCV_IOMMU_FQCSR_FQON;
1740         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
1741             RISCV_IOMMU_FQCSR_FQOF;
1742     } else if (!enable && active) {
1743         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
1744         ctrl_set = 0;
1745         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
1746     } else {
1747         ctrl_set = 0;
1748         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
1749     }
1750 
1751     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
1752 }
1753 
1754 static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
1755 {
1756     uint64_t base;
1757     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1758     uint32_t ctrl_clr;
1759     bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
1760     bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
1761 
1762     if (enable && !active) {
1763         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
1764         s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
1765         s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
1766         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
1767         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
1768         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
1769         ctrl_set = RISCV_IOMMU_PQCSR_PQON;
1770         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
1771             RISCV_IOMMU_PQCSR_PQOF;
1772     } else if (!enable && active) {
1773         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
1774         ctrl_set = 0;
1775         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
1776     } else {
1777         ctrl_set = 0;
1778         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
1779     }
1780 
1781     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
1782 }
1783 
1784 static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
1785 {
1786     uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
1787     uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
1788     unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
1789     unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
1790     RISCVIOMMUContext *ctx;
1791     void *ref;
1792 
1793     if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
1794         return;
1795     }
1796 
1797     ctx = riscv_iommu_ctx(s, devid, pid, &ref);
1798     if (ctx == NULL) {
1799         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
1800                                  RISCV_IOMMU_TR_RESPONSE_FAULT |
1801                                  (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
1802     } else {
1803         IOMMUTLBEntry iotlb = {
1804             .iova = iova,
1805             .perm = ctrl & RISCV_IOMMU_TR_REQ_CTL_NW ? IOMMU_RO : IOMMU_RW,
1806             .addr_mask = ~0,
1807             .target_as = NULL,
1808         };
1809         int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
1810         if (fault) {
1811             iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
1812         } else {
1813             iova = iotlb.translated_addr & ~iotlb.addr_mask;
1814             iova >>= TARGET_PAGE_BITS;
1815             iova &= RISCV_IOMMU_TR_RESPONSE_PPN;
1816 
1817             /* We do not support superpages (> 4kbs) for now */
1818             iova &= ~RISCV_IOMMU_TR_RESPONSE_S;
1819         }
1820         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
1821     }
1822 
1823     riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
1824         RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
1825     riscv_iommu_ctx_put(s, ref);
1826 }
1827 
/* Signature of the handlers run after an actionable MMIO register write. */
typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
1829 
1830 static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
1831 {
1832     uint64_t icvec = 0;
1833 
1834     icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
1835                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
1836 
1837     icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
1838                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
1839 
1840     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
1841                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
1842 
1843     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
1844                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
1845 
1846     trace_riscv_iommu_icvec_write(data, icvec);
1847 
1848     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
1849 }
1850 
1851 static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
1852 {
1853     uint32_t cqcsr, fqcsr, pqcsr;
1854     uint32_t ipsr_set = 0;
1855     uint32_t ipsr_clr = 0;
1856 
1857     if (data & RISCV_IOMMU_IPSR_CIP) {
1858         cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1859 
1860         if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
1861             (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
1862              cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
1863              cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
1864              cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
1865             ipsr_set |= RISCV_IOMMU_IPSR_CIP;
1866         } else {
1867             ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1868         }
1869     } else {
1870         ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1871     }
1872 
1873     if (data & RISCV_IOMMU_IPSR_FIP) {
1874         fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1875 
1876         if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
1877             (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
1878              fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
1879             ipsr_set |= RISCV_IOMMU_IPSR_FIP;
1880         } else {
1881             ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1882         }
1883     } else {
1884         ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1885     }
1886 
1887     if (data & RISCV_IOMMU_IPSR_PIP) {
1888         pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1889 
1890         if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
1891             (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
1892              pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
1893             ipsr_set |= RISCV_IOMMU_IPSR_PIP;
1894         } else {
1895             ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1896         }
1897     } else {
1898         ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1899     }
1900 
1901     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
1902 }
1903 
1904 /*
1905  * Write the resulting value of 'data' for the reg specified
1906  * by 'reg_addr', after considering read-only/read-write/write-clear
1907  * bits, in the pointer 'dest'.
1908  *
1909  * The result is written in little-endian.
1910  */
1911 static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
1912                                       void *dest, hwaddr reg_addr,
1913                                       int size, uint64_t data)
1914 {
1915     uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
1916     uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
1917     uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
1918 
1919     stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
1920 }
1921 
1922 static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
1923                                           uint64_t data, unsigned size,
1924                                           MemTxAttrs attrs)
1925 {
1926     riscv_iommu_process_fn *process_fn = NULL;
1927     RISCVIOMMUState *s = opaque;
1928     uint32_t regb = addr & ~3;
1929     uint32_t busy = 0;
1930     uint64_t val = 0;
1931 
1932     if ((addr & (size - 1)) != 0) {
1933         /* Unsupported MMIO alignment or access size */
1934         return MEMTX_ERROR;
1935     }
1936 
1937     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
1938         /* Unsupported MMIO access location. */
1939         return MEMTX_ACCESS_ERROR;
1940     }
1941 
1942     /* Track actionable MMIO write. */
1943     switch (regb) {
1944     case RISCV_IOMMU_REG_DDTP:
1945     case RISCV_IOMMU_REG_DDTP + 4:
1946         process_fn = riscv_iommu_process_ddtp;
1947         regb = RISCV_IOMMU_REG_DDTP;
1948         busy = RISCV_IOMMU_DDTP_BUSY;
1949         break;
1950 
1951     case RISCV_IOMMU_REG_CQT:
1952         process_fn = riscv_iommu_process_cq_tail;
1953         break;
1954 
1955     case RISCV_IOMMU_REG_CQCSR:
1956         process_fn = riscv_iommu_process_cq_control;
1957         busy = RISCV_IOMMU_CQCSR_BUSY;
1958         break;
1959 
1960     case RISCV_IOMMU_REG_FQCSR:
1961         process_fn = riscv_iommu_process_fq_control;
1962         busy = RISCV_IOMMU_FQCSR_BUSY;
1963         break;
1964 
1965     case RISCV_IOMMU_REG_PQCSR:
1966         process_fn = riscv_iommu_process_pq_control;
1967         busy = RISCV_IOMMU_PQCSR_BUSY;
1968         break;
1969 
1970     case RISCV_IOMMU_REG_ICVEC:
1971     case RISCV_IOMMU_REG_IPSR:
1972         /*
1973          * ICVEC and IPSR have special read/write procedures. We'll
1974          * call their respective helpers and exit.
1975          */
1976         riscv_iommu_write_reg_val(s, &val, addr, size, data);
1977 
1978         /*
1979          * 'val' is stored as LE. Switch to host endianess
1980          * before using it.
1981          */
1982         val = le64_to_cpu(val);
1983 
1984         if (regb == RISCV_IOMMU_REG_ICVEC) {
1985             riscv_iommu_update_icvec(s, val);
1986         } else {
1987             riscv_iommu_update_ipsr(s, val);
1988         }
1989 
1990         return MEMTX_OK;
1991 
1992     case RISCV_IOMMU_REG_TR_REQ_CTL:
1993         process_fn = riscv_iommu_process_dbg;
1994         regb = RISCV_IOMMU_REG_TR_REQ_CTL;
1995         busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
1996         break;
1997 
1998     default:
1999         break;
2000     }
2001 
2002     /*
2003      * Registers update might be not synchronized with core logic.
2004      * If system software updates register when relevant BUSY bit
2005      * is set IOMMU behavior of additional writes to the register
2006      * is UNSPECIFIED.
2007      */
2008     riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
2009 
2010     /* Busy flag update, MSB 4-byte register. */
2011     if (busy) {
2012         uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
2013         stl_le_p(&s->regs_rw[regb], rw | busy);
2014     }
2015 
2016     if (process_fn) {
2017         process_fn(s);
2018     }
2019 
2020     return MEMTX_OK;
2021 }
2022 
2023 static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
2024     uint64_t *data, unsigned size, MemTxAttrs attrs)
2025 {
2026     RISCVIOMMUState *s = opaque;
2027     uint64_t val = -1;
2028     uint8_t *ptr;
2029 
2030     if ((addr & (size - 1)) != 0) {
2031         /* Unsupported MMIO alignment. */
2032         return MEMTX_ERROR;
2033     }
2034 
2035     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
2036         return MEMTX_ACCESS_ERROR;
2037     }
2038 
2039     ptr = &s->regs_rw[addr];
2040     val = ldn_le_p(ptr, size);
2041 
2042     *data = val;
2043 
2044     return MEMTX_OK;
2045 }
2046 
2047 static const MemoryRegionOps riscv_iommu_mmio_ops = {
2048     .read_with_attrs = riscv_iommu_mmio_read,
2049     .write_with_attrs = riscv_iommu_mmio_write,
2050     .endianness = DEVICE_NATIVE_ENDIAN,
2051     .impl = {
2052         .min_access_size = 4,
2053         .max_access_size = 8,
2054         .unaligned = false,
2055     },
2056     .valid = {
2057         .min_access_size = 4,
2058         .max_access_size = 8,
2059     }
2060 };
2061 
2062 /*
2063  * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
2064  * memory region as untranslated address, for additional MSI/MRIF interception
2065  * by IOMMU interrupt remapping implementation.
2066  * Note: Device emulation code generating an MSI is expected to provide a valid
2067  * memory transaction attributes with requested_id set.
2068  */
2069 static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
2070     uint64_t data, unsigned size, MemTxAttrs attrs)
2071 {
2072     RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
2073     RISCVIOMMUContext *ctx;
2074     MemTxResult res;
2075     void *ref;
2076     uint32_t devid = attrs.requester_id;
2077 
2078     if (attrs.unspecified) {
2079         return MEMTX_ACCESS_ERROR;
2080     }
2081 
2082     /* FIXME: PCIe bus remapping for attached endpoints. */
2083     devid |= s->bus << 8;
2084 
2085     ctx = riscv_iommu_ctx(s, devid, 0, &ref);
2086     if (ctx == NULL) {
2087         res = MEMTX_ACCESS_ERROR;
2088     } else {
2089         res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
2090     }
2091     riscv_iommu_ctx_put(s, ref);
2092     return res;
2093 }
2094 
2095 static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
2096     uint64_t *data, unsigned size, MemTxAttrs attrs)
2097 {
2098     return MEMTX_ACCESS_ERROR;
2099 }
2100 
2101 static const MemoryRegionOps riscv_iommu_trap_ops = {
2102     .read_with_attrs = riscv_iommu_trap_read,
2103     .write_with_attrs = riscv_iommu_trap_write,
2104     .endianness = DEVICE_LITTLE_ENDIAN,
2105     .impl = {
2106         .min_access_size = 4,
2107         .max_access_size = 8,
2108         .unaligned = true,
2109     },
2110     .valid = {
2111         .min_access_size = 4,
2112         .max_access_size = 8,
2113     }
2114 };
2115 
2116 static void riscv_iommu_realize(DeviceState *dev, Error **errp)
2117 {
2118     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2119 
2120     s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
2121     if (s->enable_msi) {
2122         s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
2123     }
2124     if (s->enable_ats) {
2125         s->cap |= RISCV_IOMMU_CAP_ATS;
2126     }
2127     if (s->enable_s_stage) {
2128         s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
2129                   RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
2130     }
2131     if (s->enable_g_stage) {
2132         s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
2133                   RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
2134     }
2135     /* Enable translation debug interface */
2136     s->cap |= RISCV_IOMMU_CAP_DBG;
2137 
2138     /* Report QEMU target physical address space limits */
2139     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
2140                        TARGET_PHYS_ADDR_SPACE_BITS);
2141 
2142     /* TODO: method to report supported PID bits */
2143     s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
2144     s->cap |= RISCV_IOMMU_CAP_PD8;
2145 
2146     /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
2147     s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
2148                         RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
2149 
2150     /* register storage */
2151     s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2152     s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2153     s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2154 
2155      /* Mark all registers read-only */
2156     memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
2157 
2158     /*
2159      * Register complete MMIO space, including MSI/PBA registers.
2160      * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
2161      * managed directly by the PCIDevice implementation.
2162      */
2163     memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
2164         "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
2165 
2166     /* Set power-on register state */
2167     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
2168     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
2169     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
2170              ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
2171     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
2172         ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
2173     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
2174         ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
2175     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
2176         ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
2177     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
2178         ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
2179     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
2180         RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
2181     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
2182         RISCV_IOMMU_CQCSR_BUSY);
2183     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
2184         RISCV_IOMMU_FQCSR_FQOF);
2185     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
2186         RISCV_IOMMU_FQCSR_BUSY);
2187     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
2188         RISCV_IOMMU_PQCSR_PQOF);
2189     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
2190         RISCV_IOMMU_PQCSR_BUSY);
2191     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
2192     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
2193     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
2194     /* If debug registers enabled. */
2195     if (s->cap & RISCV_IOMMU_CAP_DBG) {
2196         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
2197         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
2198             RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
2199     }
2200 
2201     /* Memory region for downstream access, if specified. */
2202     if (s->target_mr) {
2203         s->target_as = g_new0(AddressSpace, 1);
2204         address_space_init(s->target_as, s->target_mr,
2205             "riscv-iommu-downstream");
2206     } else {
2207         /* Fallback to global system memory. */
2208         s->target_as = &address_space_memory;
2209     }
2210 
2211     /* Memory region for untranslated MRIF/MSI writes */
2212     memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
2213             "riscv-iommu-trap", ~0ULL);
2214     address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
2215 
2216     /* Device translation context cache */
2217     s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
2218                                          riscv_iommu_ctx_equal,
2219                                          g_free, NULL);
2220 
2221     s->iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
2222                                          riscv_iommu_iot_equal,
2223                                          g_free, NULL);
2224 
2225     s->iommus.le_next = NULL;
2226     s->iommus.le_prev = NULL;
2227     QLIST_INIT(&s->spaces);
2228 }
2229 
2230 static void riscv_iommu_unrealize(DeviceState *dev)
2231 {
2232     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2233 
2234     g_hash_table_unref(s->iot_cache);
2235     g_hash_table_unref(s->ctx_cache);
2236 }
2237 
/*
 * qdev properties of the IOMMU core device. The boolean feature switches
 * all default to TRUE; they are consumed by riscv_iommu_realize() when the
 * capability register and reset state are built.
 */
static Property riscv_iommu_properties[] = {
    /* Value reported in the version field of the capability register */
    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
        RISCV_IOMMU_SPEC_DOT_VER),
    /* Bus number OR-ed (shifted by 8) into requester ids of trapped writes */
    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
    DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
        LIMIT_CACHE_IOT),
    /* Advertise MSI_FLAT / MSI_MRIF capabilities */
    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
    /* Advertise ATS capability and accept ATS commands */
    DEFINE_PROP_BOOL("ats", RISCVIOMMUState, enable_ats, TRUE),
    /* Out-of-reset DDTP mode: OFF when set, BARE otherwise */
    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
    /* Advertise Sv32/Sv39/Sv48/Sv57 capabilities */
    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
    /* Advertise Sv32x4/Sv39x4/Sv48x4/Sv57x4 capabilities */
    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
    /* Optional memory region used for downstream accesses */
    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
        TYPE_MEMORY_REGION, MemoryRegion *),
    DEFINE_PROP_END_OF_LIST(),
};
2253 
2254 static void riscv_iommu_class_init(ObjectClass *klass, void* data)
2255 {
2256     DeviceClass *dc = DEVICE_CLASS(klass);
2257 
2258     /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
2259     dc->user_creatable = false;
2260     dc->realize = riscv_iommu_realize;
2261     dc->unrealize = riscv_iommu_unrealize;
2262     device_class_set_props(dc, riscv_iommu_properties);
2263 }
2264 
2265 static const TypeInfo riscv_iommu_info = {
2266     .name = TYPE_RISCV_IOMMU,
2267     .parent = TYPE_DEVICE,
2268     .instance_size = sizeof(RISCVIOMMUState),
2269     .class_init = riscv_iommu_class_init,
2270 };
2271 
2272 static const char *IOMMU_FLAG_STR[] = {
2273     "NA",
2274     "RO",
2275     "WR",
2276     "RW",
2277 };
2278 
2279 /* RISC-V IOMMU Memory Region - Address Translation Space */
2280 static IOMMUTLBEntry riscv_iommu_memory_region_translate(
2281     IOMMUMemoryRegion *iommu_mr, hwaddr addr,
2282     IOMMUAccessFlags flag, int iommu_idx)
2283 {
2284     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2285     RISCVIOMMUContext *ctx;
2286     void *ref;
2287     IOMMUTLBEntry iotlb = {
2288         .iova = addr,
2289         .target_as = as->iommu->target_as,
2290         .addr_mask = ~0ULL,
2291         .perm = flag,
2292     };
2293 
2294     ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
2295     if (ctx == NULL) {
2296         /* Translation disabled or invalid. */
2297         iotlb.addr_mask = 0;
2298         iotlb.perm = IOMMU_NONE;
2299     } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
2300         /* Translation disabled or fault reported. */
2301         iotlb.addr_mask = 0;
2302         iotlb.perm = IOMMU_NONE;
2303     }
2304 
2305     /* Trace all dma translations with original access flags. */
2306     trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
2307                           PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
2308                           IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
2309                           iotlb.translated_addr);
2310 
2311     riscv_iommu_ctx_put(as->iommu, ref);
2312 
2313     return iotlb;
2314 }
2315 
2316 static int riscv_iommu_memory_region_notify(
2317     IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
2318     IOMMUNotifierFlag new, Error **errp)
2319 {
2320     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2321 
2322     if (old == IOMMU_NOTIFIER_NONE) {
2323         as->notifier = true;
2324         trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
2325     } else if (new == IOMMU_NOTIFIER_NONE) {
2326         as->notifier = false;
2327         trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
2328     }
2329 
2330     return 0;
2331 }
2332 
2333 static inline bool pci_is_iommu(PCIDevice *pdev)
2334 {
2335     return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
2336 }
2337 
2338 static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
2339 {
2340     RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
2341     PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
2342     AddressSpace *as = NULL;
2343 
2344     if (pdev && pci_is_iommu(pdev)) {
2345         return s->target_as;
2346     }
2347 
2348     /* Find first registered IOMMU device */
2349     while (s->iommus.le_prev) {
2350         s = *(s->iommus.le_prev);
2351     }
2352 
2353     /* Find first matching IOMMU */
2354     while (s != NULL && as == NULL) {
2355         as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
2356         s = s->iommus.le_next;
2357     }
2358 
2359     return as ? as : &address_space_memory;
2360 }
2361 
/* PCI bus IOMMU hooks installed by riscv_iommu_pci_setup_iommu(). */
static const PCIIOMMUOps riscv_iommu_ops = {
    .get_address_space = riscv_iommu_find_as,
};
2365 
2366 void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
2367         Error **errp)
2368 {
2369     if (bus->iommu_ops &&
2370         bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
2371         /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
2372         RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
2373         QLIST_INSERT_AFTER(last, iommu, iommus);
2374     } else if (!bus->iommu_ops && !bus->iommu_opaque) {
2375         pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
2376     } else {
2377         error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
2378             pci_bus_num(bus));
2379     }
2380 }
2381 
2382 static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
2383     MemTxAttrs attrs)
2384 {
2385     return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
2386 }
2387 
2388 static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
2389 {
2390     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2391     return 1 << as->iommu->pid_bits;
2392 }
2393 
2394 static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
2395 {
2396     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
2397 
2398     imrc->translate = riscv_iommu_memory_region_translate;
2399     imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
2400     imrc->attrs_to_index = riscv_iommu_memory_region_index;
2401     imrc->num_indexes = riscv_iommu_memory_region_index_len;
2402 }
2403 
2404 static const TypeInfo riscv_iommu_memory_region_info = {
2405     .parent = TYPE_IOMMU_MEMORY_REGION,
2406     .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
2407     .class_init = riscv_iommu_memory_region_init,
2408 };
2409 
/* Register the IOMMU memory region type and the IOMMU core device type. */
static void riscv_iommu_register_mr_types(void)
{
    type_register_static(&riscv_iommu_memory_region_info);
    type_register_static(&riscv_iommu_info);
}

/* Arrange for the registration above to run at type-init time. */
type_init(riscv_iommu_register_mr_types);
2417