xref: /openbmc/qemu/hw/riscv/riscv-iommu.c (revision 9e4cc917)
1 /*
2  * QEMU emulation of a RISC-V IOMMU
3  *
4  * Copyright (C) 2021-2023, Rivos Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qom/object.h"
21 #include "hw/pci/pci_bus.h"
22 #include "hw/pci/pci_device.h"
23 #include "hw/qdev-properties.h"
24 #include "hw/riscv/riscv_hart.h"
25 #include "migration/vmstate.h"
26 #include "qapi/error.h"
27 #include "qemu/timer.h"
28 
29 #include "cpu_bits.h"
30 #include "riscv-iommu.h"
31 #include "riscv-iommu-bits.h"
32 #include "trace.h"
33 
34 #define LIMIT_CACHE_CTX               (1U << 7)
35 #define LIMIT_CACHE_IOT               (1U << 20)
36 
37 /* Physical page number conversions */
38 #define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
39 #define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
40 
41 typedef struct RISCVIOMMUContext RISCVIOMMUContext;
42 typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
43 
44 /* Device-assigned I/O address space */
45 struct RISCVIOMMUSpace {
46     IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
47     AddressSpace iova_as;       /* IOVA address space for attached device */
48     RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
49     uint32_t devid;             /* Requester identifier, AKA device_id */
50     bool notifier;              /* IOMMU unmap notifier enabled */
51     QLIST_ENTRY(RISCVIOMMUSpace) list;
52 };
53 
54 /* Device translation context state. */
55 struct RISCVIOMMUContext {
56     uint64_t devid:24;          /* Requester Id, AKA device_id */
57     uint64_t process_id:20;     /* Process ID. PASID for PCIe */
58     uint64_t tc;                /* Translation Control */
59     uint64_t ta;                /* Translation Attributes */
60     uint64_t satp;              /* S-Stage address translation and protection */
61     uint64_t gatp;              /* G-Stage address translation and protection */
62     uint64_t msi_addr_mask;     /* MSI filtering - address mask */
63     uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
64     uint64_t msiptp;            /* MSI redirection page table pointer */
65 };
66 
67 /* Address translation cache entry */
68 struct RISCVIOMMUEntry {
69     uint64_t iova:44;           /* IOVA Page Number */
70     uint64_t pscid:20;          /* Process Soft-Context identifier */
71     uint64_t phys:44;           /* Physical Page Number */
72     uint64_t gscid:16;          /* Guest Soft-Context identifier */
73     uint64_t perm:2;            /* IOMMU_RW flags */
74 };
75 
76 /* IOMMU index for transactions without process_id specified. */
77 #define RISCV_IOMMU_NOPROCID 0
78 
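/*
 * ICVEC packs one 4-bit vector index per interrupt source: CIV in bits
 * 3:0, FIV in 7:4, PMIV in 11:8 and PIV in 15:12 (the layout implied by
 * the shifts below). For example, icvec = 0x3210 routes CQ->0, FQ->1,
 * PM->2 and PQ->3.
 */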
79 static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
80 {
81     switch (vec_type) {
82     case RISCV_IOMMU_INTR_CQ:
83         return icvec & RISCV_IOMMU_ICVEC_CIV;
84     case RISCV_IOMMU_INTR_FQ:
85         return (icvec & RISCV_IOMMU_ICVEC_FIV) >> 4;
86     case RISCV_IOMMU_INTR_PM:
87         return (icvec & RISCV_IOMMU_ICVEC_PMIV) >> 8;
88     case RISCV_IOMMU_INTR_PQ:
89         return (icvec & RISCV_IOMMU_ICVEC_PIV) >> 12;
90     default:
91         g_assert_not_reached();
92     }
93 }
94 
95 static void riscv_iommu_notify(RISCVIOMMUState *s, int vec_type)
96 {
97     const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
98     uint32_t ipsr, icvec, vector;
99 
100     if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
101         return;
102     }
103 
104     icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
105     ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec_type), 0);
106 
107     if (!(ipsr & (1 << vec_type))) {
108         vector = riscv_iommu_get_icvec_vector(icvec, vec_type);
109         s->notify(s, vector);
110         trace_riscv_iommu_notify_int_vector(vec_type, vector);
111     }
112 }
113 
114 static void riscv_iommu_fault(RISCVIOMMUState *s,
115                               struct riscv_iommu_fq_record *ev)
116 {
117     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
118     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
119     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
120     uint32_t next = (tail + 1) & s->fq_mask;
121     uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
122 
123     trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
124                           PCI_FUNC(devid), ev->hdr, ev->iotval);
125 
126     if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
127         !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
128         return;
129     }
130 
131     if (head == next) {
132         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
133                               RISCV_IOMMU_FQCSR_FQOF, 0);
134     } else {
135         dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
136         if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
137                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
138             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
139                                   RISCV_IOMMU_FQCSR_FQMF, 0);
140         } else {
141             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
142         }
143     }
144 
145     if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
146         riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
147     }
148 }
149 
150 static void riscv_iommu_pri(RISCVIOMMUState *s,
151     struct riscv_iommu_pq_record *pr)
152 {
153     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
154     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
155     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
156     uint32_t next = (tail + 1) & s->pq_mask;
157     uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
158 
159     trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
160                           PCI_FUNC(devid), pr->payload);
161 
162     if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
163         !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
164         return;
165     }
166 
167     if (head == next) {
168         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
169                               RISCV_IOMMU_PQCSR_PQOF, 0);
170     } else {
171         dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
172         if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
173                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
174             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
175                                   RISCV_IOMMU_PQCSR_PQMF, 0);
176         } else {
177             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
178         }
179     }
180 
181     if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
182         riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
183     }
184 }
185 
186 /* Portable implementation of pext_u64, bit-mask extraction. */
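/*
 * For example, _pext_u64(0xd6, 0x34) == 0x3: ext = 0b00110100 selects
 * bits 2, 4 and 5 of val = 0b11010110 (1, 1 and 0 respectively), which
 * are packed from the least significant bit up into 0b011.
 */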
187 static uint64_t _pext_u64(uint64_t val, uint64_t ext)
188 {
189     uint64_t ret = 0;
190     uint64_t rot = 1;
191 
192     while (ext) {
193         if (ext & 1) {
194             if (val & 1) {
195                 ret |= rot;
196             }
197             rot <<= 1;
198         }
199         val >>= 1;
200         ext >>= 1;
201     }
202 
203     return ret;
204 }
205 
206 /* Check if GPA matches MSI/MRIF pattern. */
207 static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
208     dma_addr_t gpa)
209 {
210     if (!s->enable_msi) {
211         return false;
212     }
213 
214     if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
215         RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
216         return false; /* Invalid MSI/MRIF mode */
217     }
218 
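    /*
     * A GPA matches when its page number agrees with msi_addr_pattern on
     * every bit that is clear in msi_addr_mask; masked bits are "don't
     * care". E.g. (illustrative values) mask = 0xff, pattern = 0x30000
     * accepts any PPN of the form 0x300xx.
     */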
219     if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
220         return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
221     }
222 
223     return true;
224 }
225 
226 /*
227  * RISCV IOMMU Address Translation Lookup - Page Table Walk
228  *
229  * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
230  * Both implementation can be merged into single helper function in future.
231  * Keeping them separate for now, as error reporting and flow specifics are
232  * sufficiently different for separate implementation.
233  *
234  * @s        : IOMMU Device State
235  * @ctx      : Translation context for device id and process address space id.
236  * @iotlb    : translation data: physical address and access mode.
237  * @return   : success or fault cause code.
238  */
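/*
 * When both stages are enabled the walk below is nested: the S-stage
 * root pointer (iosatp) and every S-stage table address produced during
 * the walk are guest physical addresses, so each is resolved through the
 * G-stage tables before the next S-stage level is read.
 */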
239 static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
240     IOMMUTLBEntry *iotlb)
241 {
242     dma_addr_t addr, base;
243     uint64_t satp, gatp, pte;
244     bool en_s, en_g;
245     struct {
246         unsigned char step;
247         unsigned char levels;
248         unsigned char ptidxbits;
249         unsigned char ptesize;
250     } sc[2];
251     /* Translation stage phase */
252     enum {
253         S_STAGE = 0,
254         G_STAGE = 1,
255     } pass;
256     MemTxResult ret;
257 
258     satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
259     gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
260 
261     en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
262     en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
263 
264     /*
265      * Early check for MSI address match when IOVA == GPA.
266      * Note that the (!en_s) condition means that the MSI
267      * page table may only be used when guest pages are
268      * mapped using the g-stage page table, whether single-
269      * or two-stage paging is enabled. It's unavoidable though,
270      * because the spec mandates that we do a first-stage
271      * translation before we check the MSI page table, which
272      * means we can't do an early MSI check unless we have
273      * strictly !en_s.
274      */
275     if (!en_s && (iotlb->perm & IOMMU_WO) &&
276         riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
277         iotlb->target_as = &s->trap_as;
278         iotlb->translated_addr = iotlb->iova;
279         iotlb->addr_mask = ~TARGET_PAGE_MASK;
280         return 0;
281     }
282 
283     /* Exit early for pass-through mode. */
284     if (!(en_s || en_g)) {
285         iotlb->translated_addr = iotlb->iova;
286         iotlb->addr_mask = ~TARGET_PAGE_MASK;
287         /* Allow R/W in pass-through mode */
288         iotlb->perm = IOMMU_RW;
289         return 0;
290     }
291 
292     /* S/G translation parameters. */
293     for (pass = 0; pass < 2; pass++) {
294         uint32_t sv_mode;
295 
296         sc[pass].step = 0;
297         if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
298             (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
299             /* 32bit mode for GXL/SXL == 1 */
300             switch (pass ? gatp : satp) {
301             case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
302                 sc[pass].levels    = 0;
303                 sc[pass].ptidxbits = 0;
304                 sc[pass].ptesize   = 0;
305                 break;
306             case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
307                 sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
308                 if (!(s->cap & sv_mode)) {
309                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
310                 }
311                 sc[pass].levels    = 2;
312                 sc[pass].ptidxbits = 10;
313                 sc[pass].ptesize   = 4;
314                 break;
315             default:
316                 return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
317             }
318         } else {
319             /* 64bit mode for GXL/SXL == 0 */
320             switch (pass ? gatp : satp) {
321             case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
322                 sc[pass].levels    = 0;
323                 sc[pass].ptidxbits = 0;
324                 sc[pass].ptesize   = 0;
325                 break;
326             case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
327                 sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
328                 if (!(s->cap & sv_mode)) {
329                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
330                 }
331                 sc[pass].levels    = 3;
332                 sc[pass].ptidxbits = 9;
333                 sc[pass].ptesize   = 8;
334                 break;
335             case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
336                 sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
337                 if (!(s->cap & sv_mode)) {
338                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
339                 }
340                 sc[pass].levels    = 4;
341                 sc[pass].ptidxbits = 9;
342                 sc[pass].ptesize   = 8;
343                 break;
344             case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
345                 sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
346                 if (!(s->cap & sv_mode)) {
347                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
348                 }
349                 sc[pass].levels    = 5;
350                 sc[pass].ptidxbits = 9;
351                 sc[pass].ptesize   = 8;
352                 break;
353             default:
354                 return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
355             }
356         }
357     };
358 
359     /* S/G stages translation tables root pointers */
360     gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
361     satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
362     addr = (en_s && en_g) ? satp : iotlb->iova;
363     base = en_g ? gatp : satp;
364     pass = en_g ? G_STAGE : S_STAGE;
365 
366     do {
367         const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
368         const unsigned va_bits = widened + sc[pass].ptidxbits;
369         const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
370                                  (sc[pass].levels - 1 - sc[pass].step);
371         const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
372         const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
373         const bool ade =
374             ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
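        /*
         * Index math example for Sv39 (levels = 3, ptidxbits = 9): at
         * step 0, va_skip = 12 + 9 * 2 = 30 and idx selects VPN[2]. For
         * the root level of an SvXXx4 G-stage table the two "widened"
         * bits extend idx to 11 bits, i.e. a 16 KiB root table.
         */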
375 
376         /* Address range check before first level lookup */
377         if (!sc[pass].step) {
378             const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
379             if ((addr & va_mask) != addr) {
380                 return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
381             }
382         }
383 
384         /* Read page table entry */
385         if (sc[pass].ptesize == 4) {
386             uint32_t pte32 = 0;
387             ret = ldl_le_dma(s->target_as, pte_addr, &pte32,
388                              MEMTXATTRS_UNSPECIFIED);
389             pte = pte32;
390         } else {
391             ret = ldq_le_dma(s->target_as, pte_addr, &pte,
392                              MEMTXATTRS_UNSPECIFIED);
393         }
394         if (ret != MEMTX_OK) {
395             return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
396                                             : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
397         }
398 
399         sc[pass].step++;
400         hwaddr ppn = pte >> PTE_PPN_SHIFT;
401 
402         if (!(pte & PTE_V)) {
403             break;                /* Invalid PTE */
404         } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
405             base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
406         } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
407             break;                /* Reserved leaf PTE flags: PTE_W */
408         } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
409             break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
410         } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
411             break;                /* Misaligned PPN */
412         } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
413             break;                /* Read access check failed */
414         } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
415             break;                /* Write access check failed */
416         } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
417             break;                /* Access bit not set */
418         } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
419             break;                /* Dirty bit not set */
420         } else {
421             /* Leaf PTE, translation completed. */
422             sc[pass].step = sc[pass].levels;
423             base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
424             /* Update address mask based on smallest translation granularity */
425             iotlb->addr_mask &= (1ULL << va_skip) - 1;
426             /* Continue with S-Stage translation? */
427             if (pass && sc[0].step != sc[0].levels) {
428                 pass = S_STAGE;
429                 addr = iotlb->iova;
430                 continue;
431             }
432             /* Translation phase completed (GPA or SPA) */
433             iotlb->translated_addr = base;
434             iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
435                                                          : IOMMU_RO;
436 
437             /* Check MSI GPA address match */
438             if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
439                 riscv_iommu_msi_check(s, ctx, base)) {
440                 /* Trap MSI writes and return GPA address. */
441                 iotlb->target_as = &s->trap_as;
442                 iotlb->addr_mask = ~TARGET_PAGE_MASK;
443                 return 0;
444             }
445 
446             /* Continue with G-Stage translation? */
447             if (!pass && en_g) {
448                 pass = G_STAGE;
449                 addr = base;
450                 base = gatp;
451                 sc[pass].step = 0;
452                 continue;
453             }
454 
455             return 0;
456         }
457 
458         if (sc[pass].step == sc[pass].levels) {
459             break; /* Can't find leaf PTE */
460         }
461 
462         /* Continue with G-Stage translation? */
463         if (!pass && en_g) {
464             pass = G_STAGE;
465             addr = base;
466             base = gatp;
467             sc[pass].step = 0;
468         }
469     } while (1);
470 
471     return (iotlb->perm & IOMMU_WO) ?
472                 (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
473                         RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
474                 (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
475                         RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
476 }
477 
478 static void riscv_iommu_report_fault(RISCVIOMMUState *s,
479                                      RISCVIOMMUContext *ctx,
480                                      uint32_t fault_type, uint32_t cause,
481                                      bool pv,
482                                      uint64_t iotval, uint64_t iotval2)
483 {
484     struct riscv_iommu_fq_record ev = { 0 };
485 
486     if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
487         switch (cause) {
488         case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
489         case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
490         case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
491         case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
492         case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
493         case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
494         case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
495             break;
496         default:
497             /* DTF prevents reporting a fault for this given cause */
498             return;
499         }
500     }
501 
502     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
503     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
504     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
505 
506     if (pv) {
507         ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
508         ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
509     }
510 
511     ev.iotval = iotval;
512     ev.iotval2 = iotval2;
513 
514     riscv_iommu_fault(s, &ev);
515 }
516 
517 /* Redirect MSI write for given GPA. */
518 static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
519     RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
520     unsigned size, MemTxAttrs attrs)
521 {
522     MemTxResult res;
523     dma_addr_t addr;
524     uint64_t intn;
525     uint32_t n190;
526     uint64_t pte[2];
527     int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
528     int cause;
529 
530     /* Interrupt File Number */
531     intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
532     if (intn >= 256) {
533         /* Interrupt file number out of range */
534         res = MEMTX_ACCESS_ERROR;
535         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
536         goto err;
537     }
538 
539     /* fetch MSI PTE */
540     addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
541     addr = addr | (intn * sizeof(pte));
542     res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
543             MEMTXATTRS_UNSPECIFIED);
544     if (res != MEMTX_OK) {
545         if (res == MEMTX_DECODE_ERROR) {
546             cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
547         } else {
548             cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
549         }
550         goto err;
551     }
552 
553     le64_to_cpus(&pte[0]);
554     le64_to_cpus(&pte[1]);
555 
556     if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
557         /*
558          * The spec mentions that: "If msipte.C == 1, then further
559          * processing to interpret the PTE is implementation
560          * defined.". We'll abort with cause = 262 for this
561          * case too.
562          */
563         res = MEMTX_ACCESS_ERROR;
564         cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
565         goto err;
566     }
567 
568     switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
569     case RISCV_IOMMU_MSI_PTE_M_BASIC:
570         /* MSI Pass-through mode */
571         addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
572 
573         trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
574                               PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
575                               gpa, addr);
576 
577         res = dma_memory_write(s->target_as, addr, &data, size, attrs);
578         if (res != MEMTX_OK) {
579             cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
580             goto err;
581         }
582 
583         return MEMTX_OK;
584     case RISCV_IOMMU_MSI_PTE_M_MRIF:
585         /* MRIF mode, continue. */
586         break;
587     default:
588         res = MEMTX_ACCESS_ERROR;
589         cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
590         goto err;
591     }
592 
593     /*
594      * Report an error for interrupt identities exceeding the maximum allowed
595      * for an IMSIC interrupt file (2047) or for a destination address that is
596      * not 32-bit aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
597      */
598     if ((data > 2047) || (gpa & 3)) {
599         res = MEMTX_ACCESS_ERROR;
600         cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
601         goto err;
602     }
603 
604     /* MSI MRIF mode, non-atomic pending bit update */
605 
606     /* MRIF pending bit address */
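    /*
     * As implemented here, bits 10:6 of the interrupt identity select a
     * pending-bit doubleword within the MRIF ((data & 0x7c0) >> 3 is the
     * byte offset added below) and bits 5:0 select the bit within it; the
     * matching enable doubleword is read 8 bytes after the pending one.
     */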
607     addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
608     addr = addr | ((data & 0x7c0) >> 3);
609 
610     trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
611                           PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
612                           gpa, addr);
613 
614     /* MRIF pending bit mask */
615     data = 1ULL << (data & 0x03f);
616     res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
617     if (res != MEMTX_OK) {
618         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
619         goto err;
620     }
621 
622     intn = intn | data;
623     res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
624     if (res != MEMTX_OK) {
625         cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
626         goto err;
627     }
628 
629     /* Get MRIF enable bits */
630     addr = addr + sizeof(intn);
631     res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
632     if (res != MEMTX_OK) {
633         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
634         goto err;
635     }
636 
637     if (!(intn & data)) {
638         /* notification disabled, MRIF update completed. */
639         return MEMTX_OK;
640     }
641 
642     /* Send notification message */
643     addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
644     n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
645           (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
646 
647     res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
648     if (res != MEMTX_OK) {
649         cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
650         goto err;
651     }
652 
653     trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
654 
655     return MEMTX_OK;
656 
657 err:
658     riscv_iommu_report_fault(s, ctx, fault_type, cause,
659                              !!ctx->process_id, 0, 0);
660     return res;
661 }
662 
663 /*
664  * Check device context configuration as described by the
665  * riscv-iommu spec section "Device-context configuration
666  * checks".
667  */
668 static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
669                                             RISCVIOMMUContext *ctx)
670 {
671     uint32_t fsc_mode, msi_mode;
672     uint64_t gatp;
673 
674     if (!(s->cap & RISCV_IOMMU_CAP_ATS) &&
675         (ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS ||
676          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI ||
677          ctx->tc & RISCV_IOMMU_DC_TC_PRPR)) {
678         return false;
679     }
680 
681     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS) &&
682         (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA ||
683          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI)) {
684         return false;
685     }
686 
687     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
688         ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
689         return false;
690     }
691 
692     if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
693         ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
694         return false;
695     }
696 
697     if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
698         msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
699 
700         if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
701             msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
702             return false;
703         }
704     }
705 
706     gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
707     if (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA &&
708         gatp == RISCV_IOMMU_DC_IOHGATP_MODE_BARE) {
709         return false;
710     }
711 
712     fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
713 
714     if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
715         switch (fsc_mode) {
716         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
717             if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
718                 return false;
719             }
720             break;
721         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
722             if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
723                 return false;
724             }
725             break;
726         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
727             if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
728                 return false;
729             }
730             break;
731         }
732     } else {
733         /* DC.tc.PDTV is 0 */
734         if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
735             return false;
736         }
737 
738         if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
739             if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
740                 !(s->cap & RISCV_IOMMU_CAP_SV32)) {
741                 return false;
742             }
743         } else {
744             switch (fsc_mode) {
745             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
746                 if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
747                     return false;
748                 }
749                 break;
750             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
751                 if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
752                     return false;
753                 break;
754             break;
755             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
756                 if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
757                     return false;
758                 }
759                 break;
760             }
761         }
762     }
763 
764     /*
765      * CAP_END is always zero (only one endianness). FCTL_BE is
766      * always zero (little-endian accesses). Thus TC_SBE must
767      * always be LE, i.e. zero.
768      */
769     if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
770         return false;
771     }
772 
773     return true;
774 }
775 
776 /*
777  * Validate process context (PC) according to section
778  * "Process-context configuration checks".
779  */
780 static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
781                                              RISCVIOMMUContext *ctx)
782 {
783     uint32_t mode;
784 
785     if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
786         return false;
787     }
788 
789     if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
790         return false;
791     }
792 
793     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
794     switch (mode) {
795     case RISCV_IOMMU_DC_FSC_MODE_BARE:
796     /* sv39 and sv32 modes have the same value (8) */
797     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
798     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
799     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
800         break;
801     default:
802         return false;
803     }
804 
805     if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
806         if (mode == RISCV_IOMMU_CAP_SV32 &&
807             !(s->cap & RISCV_IOMMU_CAP_SV32)) {
808                 return false;
809         }
810     } else {
811         switch (mode) {
812         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
813             if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
814                 return false;
815             }
816             break;
817         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
818             if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
819                 return false;
820             }
821             break;
822         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
823             if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
824                 return false;
825             }
826             break;
827         }
828     }
829 
830     return true;
831 }
832 
833 /*
834  * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
835  *
836  * @s         : IOMMU Device State
837  * @ctx       : Device Translation Context with devid and process_id set.
838  * @return    : success or fault code.
839  */
840 static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
841 {
842     const uint64_t ddtp = s->ddtp;
843     unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
844     dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
845     struct riscv_iommu_dc dc;
846     /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
847     const int dc_fmt = !s->enable_msi;
848     const size_t dc_len = sizeof(dc) >> dc_fmt;
849     unsigned depth;
850     uint64_t de;
851 
852     switch (mode) {
853     case RISCV_IOMMU_DDTP_MODE_OFF:
854         return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
855 
856     case RISCV_IOMMU_DDTP_MODE_BARE:
857         /* mock up pass-through translation context */
858         ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
859             RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
860         ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
861             RISCV_IOMMU_DC_FSC_MODE_BARE);
862 
863         ctx->tc = RISCV_IOMMU_DC_TC_V;
864         if (s->enable_ats) {
865             ctx->tc |= RISCV_IOMMU_DC_TC_EN_ATS;
866         }
867 
868         ctx->ta = 0;
869         ctx->msiptp = 0;
870         return 0;
871 
872     case RISCV_IOMMU_DDTP_MODE_1LVL:
873         depth = 0;
874         break;
875 
876     case RISCV_IOMMU_DDTP_MODE_2LVL:
877         depth = 1;
878         break;
879 
880     case RISCV_IOMMU_DDTP_MODE_3LVL:
881         depth = 2;
882         break;
883 
884     default:
885         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
886     }
887 
888     /*
889      * Check supported device id width (in bits).
890      * See IOMMU Specification, Chapter 6. Software guidelines.
891      * - if extended device-context format is used:
892      *   1LVL: 6, 2LVL: 15, 3LVL: 24
893      * - if base device-context format is used:
894      *   1LVL: 7, 2LVL: 16, 3LVL: 24
895      */
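    /*
     * E.g. with the base format (dc_fmt == 1) in 2LVL mode (depth == 1)
     * the limit below is 1 << (9 + 6 + 1), i.e. a 16-bit device_id,
     * matching the table above.
     */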
896     if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
897         return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
898     }
899 
900     /* Device directory tree walk */
901     for (; depth-- > 0; ) {
902         /*
903          * Select device id index bits based on device directory tree level
904          * and device context format.
905          * See IOMMU Specification, Chapter 2. Data Structures.
906          * - if extended device-context format is used:
907          *   device index: [23:15][14:6][5:0]
908          * - if base device-context format is used:
909          *   device index: [23:16][15:7][6:0]
910          */
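        /*
         * E.g. for the extended format (dc_fmt == 0) the first iteration
         * of a 3LVL walk runs with depth == 1, giving split = 15, so bits
         * [23:15] of the device_id index that non-leaf level.
         */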
911         const int split = depth * 9 + 6 + dc_fmt;
912         addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
913         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
914                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
915             return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
916         }
917         le64_to_cpus(&de);
918         if (!(de & RISCV_IOMMU_DDTE_VALID)) {
919             /* invalid directory entry */
920             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
921         }
922         if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
923             /* reserved bits set */
924             return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
925         }
926         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
927     }
928 
929     /* index into device context entry page */
930     addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
931 
932     memset(&dc, 0, sizeof(dc));
933     if (dma_memory_read(s->target_as, addr, &dc, dc_len,
934                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
935         return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
936     }
937 
938     /* Set translation context. */
939     ctx->tc = le64_to_cpu(dc.tc);
940     ctx->gatp = le64_to_cpu(dc.iohgatp);
941     ctx->satp = le64_to_cpu(dc.fsc);
942     ctx->ta = le64_to_cpu(dc.ta);
943     ctx->msiptp = le64_to_cpu(dc.msiptp);
944     ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
945     ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
946 
947     if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
948         return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
949     }
950 
951     if (!riscv_iommu_validate_device_ctx(s, ctx)) {
952         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
953     }
954 
955     /* FSC field checks */
956     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
957     addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
958 
959     if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
960         if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
961             /* PID is disabled */
962             return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
963         }
964         if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
965             /* Invalid translation mode */
966             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
967         }
968         return 0;
969     }
970 
971     if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
972         if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
973             /* No default process_id enabled, set BARE mode */
974             ctx->satp = 0ULL;
975             return 0;
976         } else {
977             /* Use default process_id #0 */
978             ctx->process_id = 0;
979         }
980     }
981 
982     if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
983         /* No S-Stage translation, done. */
984         return 0;
985     }
986 
987     /* DC.tc.PDTV enabled */
988     if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
989         /* Invalid PDTP.MODE */
990         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
991     }
992 
993     for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
994         /*
995          * Select process id index bits based on process directory tree
996          * level. See IOMMU Specification, 2.2. Process-Directory-Table.
997          */
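        /*
         * E.g. in PD20 mode the two non-leaf iterations use split = 17
         * and split = 8 (process_id bits [19:17] and [16:8]); the leaf
         * entry below is then indexed by bits [7:0].
         */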
998         const int split = depth * 9 + 8;
999         addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
1000         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
1001                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1002             return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1003         }
1004         le64_to_cpus(&de);
1005         if (!(de & RISCV_IOMMU_PC_TA_V)) {
1006             return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1007         }
1008         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
1009     }
1010 
1011     /* Leaf entry in PDT */
1012     addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
1013     if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
1014                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1015         return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1016     }
1017 
1018     /* Use FSC and TA from process directory entry. */
1019     ctx->ta = le64_to_cpu(dc.ta);
1020     ctx->satp = le64_to_cpu(dc.fsc);
1021 
1022     if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
1023         return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1024     }
1025 
1026     if (!riscv_iommu_validate_process_ctx(s, ctx)) {
1027         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
1028     }
1029 
1030     return 0;
1031 }
1032 
1033 /* Translation Context cache support */
1034 static gboolean riscv_iommu_ctx_equal(gconstpointer v1, gconstpointer v2)
1035 {
1036     RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
1037     RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
1038     return c1->devid == c2->devid &&
1039            c1->process_id == c2->process_id;
1040 }
1041 
1042 static guint riscv_iommu_ctx_hash(gconstpointer v)
1043 {
1044     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
1045     /*
1046      * Generate simple hash of (process_id, devid)
1047      * assuming 24-bit wide devid.
1048      */
1049     return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
1050 }
1051 
1052 static void riscv_iommu_ctx_inval_devid_procid(gpointer key, gpointer value,
1053                                                gpointer data)
1054 {
1055     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1056     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1057     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1058         ctx->devid == arg->devid &&
1059         ctx->process_id == arg->process_id) {
1060         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1061     }
1062 }
1063 
1064 static void riscv_iommu_ctx_inval_devid(gpointer key, gpointer value,
1065                                         gpointer data)
1066 {
1067     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1068     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1069     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1070         ctx->devid == arg->devid) {
1071         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1072     }
1073 }
1074 
1075 static void riscv_iommu_ctx_inval_all(gpointer key, gpointer value,
1076                                       gpointer data)
1077 {
1078     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1079     if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
1080         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1081     }
1082 }
1083 
1084 static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
1085                                   uint32_t devid, uint32_t process_id)
1086 {
1087     GHashTable *ctx_cache;
1088     RISCVIOMMUContext key = {
1089         .devid = devid,
1090         .process_id = process_id,
1091     };
1092     ctx_cache = g_hash_table_ref(s->ctx_cache);
1093     g_hash_table_foreach(ctx_cache, func, &key);
1094     g_hash_table_unref(ctx_cache);
1095 }
1096 
1097 /* Find or allocate translation context for a given {device_id, process_id} */
1098 static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
1099                                           unsigned devid, unsigned process_id,
1100                                           void **ref)
1101 {
1102     GHashTable *ctx_cache;
1103     RISCVIOMMUContext *ctx;
1104     RISCVIOMMUContext key = {
1105         .devid = devid,
1106         .process_id = process_id,
1107     };
1108 
1109     ctx_cache = g_hash_table_ref(s->ctx_cache);
1110     ctx = g_hash_table_lookup(ctx_cache, &key);
1111 
1112     if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
1113         *ref = ctx_cache;
1114         return ctx;
1115     }
1116 
1117     ctx = g_new0(RISCVIOMMUContext, 1);
1118     ctx->devid = devid;
1119     ctx->process_id = process_id;
1120 
1121     int fault = riscv_iommu_ctx_fetch(s, ctx);
1122     if (!fault) {
1123         if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
1124             g_hash_table_unref(ctx_cache);
1125             ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
1126                                               riscv_iommu_ctx_equal,
1127                                               g_free, NULL);
1128             g_hash_table_ref(ctx_cache);
1129             g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
1130         }
1131         g_hash_table_add(ctx_cache, ctx);
1132         *ref = ctx_cache;
1133         return ctx;
1134     }
1135 
1136     g_hash_table_unref(ctx_cache);
1137     *ref = NULL;
1138 
1139     riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
1140                              fault, !!process_id, 0, 0);
1141 
1142     g_free(ctx);
1143     return NULL;
1144 }
1145 
1146 static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
1147 {
1148     if (ref) {
1149         g_hash_table_unref((GHashTable *)ref);
1150     }
1151 }
1152 
1153 /* Find or allocate address space for a given device */
1154 static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
1155 {
1156     RISCVIOMMUSpace *as;
1157 
1158     /* FIXME: PCIe bus remapping for attached endpoints. */
1159     devid |= s->bus << 8;
1160 
1161     QLIST_FOREACH(as, &s->spaces, list) {
1162         if (as->devid == devid) {
1163             break;
1164         }
1165     }
1166 
1167     if (as == NULL) {
1168         char name[64];
1169         as = g_new0(RISCVIOMMUSpace, 1);
1170 
1171         as->iommu = s;
1172         as->devid = devid;
1173 
1174         snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
1175             PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1176 
1177         /* IOVA address space, untranslated addresses */
1178         memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
1179             TYPE_RISCV_IOMMU_MEMORY_REGION,
1180             OBJECT(as), "riscv_iommu", UINT64_MAX);
1181         address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
1182 
1183         QLIST_INSERT_HEAD(&s->spaces, as, list);
1184 
1185         trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
1186                 PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1187     }
1188     return &as->iova_as;
1189 }
1190 
1191 /* Translation Object cache support */
1192 static gboolean riscv_iommu_iot_equal(gconstpointer v1, gconstpointer v2)
1193 {
1194     RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
1195     RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
1196     return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
1197            t1->iova == t2->iova;
1198 }
1199 
1200 static guint riscv_iommu_iot_hash(gconstpointer v)
1201 {
1202     RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
1203     return (guint)t->iova;
1204 }
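/*
 * The invalidation helpers below correspond to the IOTINVAL command scope
 * flags: GV (GSCID valid), PSCV (PSCID valid) and AV (address valid). The
 * flags noted above each helper indicate which fields of the lookup key
 * are matched against cached entries.
 */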
1205 
1206 /* GV: 1 PSCV: 1 AV: 1 */
1207 static void riscv_iommu_iot_inval_pscid_iova(gpointer key, gpointer value,
1208                                              gpointer data)
1209 {
1210     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1211     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1212     if (iot->gscid == arg->gscid &&
1213         iot->pscid == arg->pscid &&
1214         iot->iova == arg->iova) {
1215         iot->perm = IOMMU_NONE;
1216     }
1217 }
1218 
1219 /* GV: 1 PSCV: 1 AV: 0 */
1220 static void riscv_iommu_iot_inval_pscid(gpointer key, gpointer value,
1221                                         gpointer data)
1222 {
1223     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1224     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1225     if (iot->gscid == arg->gscid &&
1226         iot->pscid == arg->pscid) {
1227         iot->perm = IOMMU_NONE;
1228     }
1229 }
1230 
1231 /* GV: 1 GVMA: 1 */
1232 static void riscv_iommu_iot_inval_gscid_gpa(gpointer key, gpointer value,
1233                                             gpointer data)
1234 {
1235     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1236     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1237     if (iot->gscid == arg->gscid) {
1238         /* simplified cache, no GPA matching */
1239         iot->perm = IOMMU_NONE;
1240     }
1241 }
1242 
1243 /* GV: 1 GVMA: 0 */
1244 static void riscv_iommu_iot_inval_gscid(gpointer key, gpointer value,
1245                                         gpointer data)
1246 {
1247     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1248     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1249     if (iot->gscid == arg->gscid) {
1250         iot->perm = IOMMU_NONE;
1251     }
1252 }
1253 
1254 /* GV: 0 */
1255 static void riscv_iommu_iot_inval_all(gpointer key, gpointer value,
1256                                       gpointer data)
1257 {
1258     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1259     iot->perm = IOMMU_NONE;
1260 }
1261 
1262 /* caller should keep ref-count for iot_cache object */
1263 static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
1264     GHashTable *iot_cache, hwaddr iova)
1265 {
1266     RISCVIOMMUEntry key = {
1267         .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
1268         .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
1269         .iova  = PPN_DOWN(iova),
1270     };
1271     return g_hash_table_lookup(iot_cache, &key);
1272 }
1273 
1274 /* caller should keep ref-count for iot_cache object */
1275 static void riscv_iommu_iot_update(RISCVIOMMUState *s,
1276     GHashTable *iot_cache, RISCVIOMMUEntry *iot)
1277 {
1278     if (!s->iot_limit) {
1279         return;
1280     }
1281 
1282     if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
1283         iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
1284                                           riscv_iommu_iot_equal,
1285                                           g_free, NULL);
1286         g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
1287     }
1288     g_hash_table_add(iot_cache, iot);
1289 }
1290 
1291 static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
1292     uint32_t gscid, uint32_t pscid, hwaddr iova)
1293 {
1294     GHashTable *iot_cache;
1295     RISCVIOMMUEntry key = {
1296         .gscid = gscid,
1297         .pscid = pscid,
1298         .iova  = PPN_DOWN(iova),
1299     };
1300 
1301     iot_cache = g_hash_table_ref(s->iot_cache);
1302     g_hash_table_foreach(iot_cache, func, &key);
1303     g_hash_table_unref(iot_cache);
1304 }
1305 
1306 static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
1307     IOMMUTLBEntry *iotlb, bool enable_cache)
1308 {
1309     RISCVIOMMUEntry *iot;
1310     IOMMUAccessFlags perm;
1311     bool enable_pid;
1312     bool enable_pri;
1313     GHashTable *iot_cache;
1314     int fault;
1315 
1316     iot_cache = g_hash_table_ref(s->iot_cache);
1317     /*
1318      * TC[32] is reserved for custom extensions, used here to temporarily
1319      * enable automatic page-request generation for ATS queries.
1320      */
1321     enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
1322     enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
1323 
1324     /* Check for ATS request. */
1325     if (iotlb->perm == IOMMU_NONE) {
1326         /* Check if ATS is disabled. */
1327         if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
1328             enable_pri = false;
1329             fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
1330             goto done;
1331         }
1332     }
1333 
1334     iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
1335     perm = iot ? iot->perm : IOMMU_NONE;
1336     if (perm != IOMMU_NONE) {
1337         iotlb->translated_addr = PPN_PHYS(iot->phys);
1338         iotlb->addr_mask = ~TARGET_PAGE_MASK;
1339         iotlb->perm = perm;
1340         fault = 0;
1341         goto done;
1342     }
1343 
1344     /* Translate using device directory / page table information. */
1345     fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
1346 
1347     if (!fault && iotlb->target_as == &s->trap_as) {
1348         /* Do not cache trapped MSI translations */
1349         goto done;
1350     }
1351 
1352     /*
1353      * We made an implementation choice to not cache identity-mapped
1354      * translations, as allowed by the specification, to avoid
1355      * translation cache evictions for other devices sharing the
1356      * IOMMU hardware model.
1357      */
1358     if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
1359         iot = g_new0(RISCVIOMMUEntry, 1);
1360         iot->iova = PPN_DOWN(iotlb->iova);
1361         iot->phys = PPN_DOWN(iotlb->translated_addr);
1362         iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
1363         iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
1364         iot->perm = iotlb->perm;
1365         riscv_iommu_iot_update(s, iot_cache, iot);
1366     }
1367 
1368 done:
1369     g_hash_table_unref(iot_cache);
1370 
1371     if (enable_pri && fault) {
1372         struct riscv_iommu_pq_record pr = {0};
1373         if (enable_pid) {
1374             pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
1375                                RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
1376         }
1377         pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
1378         pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
1379                      RISCV_IOMMU_PREQ_PAYLOAD_M;
1380         riscv_iommu_pri(s, &pr);
1381         return fault;
1382     }
1383 
1384     if (fault) {
1385         unsigned ttype = RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ;
1386 
1387         if (iotlb->perm & IOMMU_RW) {
1388             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
1389         } else if (iotlb->perm & IOMMU_RO) {
1390             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
1391         }
1392 
1393         riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
1394                                  iotlb->iova, iotlb->translated_addr);
1395         return fault;
1396     }
1397 
1398     return 0;
1399 }
1400 
1401 /* IOMMU Command Interface */
1402 static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
1403     uint64_t addr, uint32_t data)
1404 {
1405     /*
1406      * ATS processing in this implementation of the IOMMU is synchronous,
1407      * no need to wait for completions here.
1408      */
1409     if (!notify) {
1410         return MEMTX_OK;
1411     }
1412 
1413     return dma_memory_write(s->target_as, addr, &data, sizeof(data),
1414         MEMTXATTRS_UNSPECIFIED);
1415 }
1416 
1417 static void riscv_iommu_ats(RISCVIOMMUState *s,
1418     struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
1419     IOMMUAccessFlags perm,
1420     void (*trace_fn)(const char *id))
1421 {
1422     RISCVIOMMUSpace *as = NULL;
1423     IOMMUNotifier *n;
1424     IOMMUTLBEvent event;
1425     uint32_t pid;
1426     uint32_t devid;
1427     const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
1428 
1429     if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
1430         /* Use device segment and requester id */
1431         devid = get_field(cmd->dword0,
1432             RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
1433     } else {
1434         devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
1435     }
1436 
1437     pid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
1438 
1439     QLIST_FOREACH(as, &s->spaces, list) {
1440         if (as->devid == devid) {
1441             break;
1442         }
1443     }
1444 
1445     if (!as || !as->notifier) {
1446         return;
1447     }
1448 
1449     event.type = flag;
1450     event.entry.perm = perm;
1451     event.entry.target_as = s->target_as;
1452 
1453     IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
1454         if (!pv || n->iommu_idx == pid) {
1455             event.entry.iova = n->start;
1456             event.entry.addr_mask = n->end - n->start;
1457             trace_fn(as->iova_mr.parent_obj.name);
1458             memory_region_notify_iommu_one(n, &event);
1459         }
1460     }
1461 }
1462 
1463 static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
1464     struct riscv_iommu_command *cmd)
1465 {
1466     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
1467                            trace_riscv_iommu_ats_inval);
1468 }
1469 
1470 static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
1471     struct riscv_iommu_command *cmd)
1472 {
1473     unsigned resp_code = get_field(cmd->dword1,
1474                                    RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
1475 
1476     /* Using the access flag to carry response code information */
1477     IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
1478     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
1479                            trace_riscv_iommu_ats_prgr);
1480 }
1481 
1482 static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
1483 {
1484     uint64_t old_ddtp = s->ddtp;
1485     uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
1486     unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
1487     unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
1488     bool ok = false;
1489 
1490     /*
1491      * Check for allowed DDTP.MODE transitions:
1492      * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
1493      * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
1494      */
1495     if (new_mode == old_mode ||
1496         new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1497         new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
1498         ok = true;
1499     } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
1500                new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
1501                new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
1502         ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1503              old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
1504     }
1505 
1506     if (ok) {
1507         /* clear reserved and busy bits, report back sanitized version */
1508         new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
1509                              RISCV_IOMMU_DDTP_MODE, new_mode);
1510     } else {
1511         new_ddtp = old_ddtp;
1512     }
1513     s->ddtp = new_ddtp;
1514 
1515     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
1516 }
1517 
1518 /* Combined command function and opcode fields, used as a switch key below. */
1519 #define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
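/*
 * The resulting value matches get_field(dword0, OPCODE | FUNC) as used in
 * riscv_iommu_process_cq_tail(), e.g. IOTINVAL.VMA dispatches as
 * RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
 *                 RISCV_IOMMU_CMD_IOTINVAL_OPCODE).
 */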
1520 
1521 static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
1522 {
1523     struct riscv_iommu_command cmd;
1524     MemTxResult res;
1525     dma_addr_t addr;
1526     uint32_t tail, head, ctrl;
1527     uint64_t cmd_opcode;
1528     GHFunc func;
1529 
1530     ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1531     tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
1532     head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
1533 
1534     /* Check for pending error or queue processing disabled */
1535     if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
1536         !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
1537         return;
1538     }
1539 
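    /*
     * Consume commands one at a time until the head index catches up with
     * the software-written tail; the head register is only advanced after
     * a command completes without error.
     */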
1540     while (tail != head) {
1541         addr = s->cq_addr + head * sizeof(cmd);
1542         res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
1543                               MEMTXATTRS_UNSPECIFIED);
1544 
1545         if (res != MEMTX_OK) {
1546             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1547                                   RISCV_IOMMU_CQCSR_CQMF, 0);
1548             goto fault;
1549         }
1550 
1551         trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
1552 
1553         cmd_opcode = get_field(cmd.dword0,
1554                                RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
1555 
1556         switch (cmd_opcode) {
1557         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
1558                              RISCV_IOMMU_CMD_IOFENCE_OPCODE):
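            /* dword1 holds ADDR[63:2]; shift left by 2 to form the byte address. */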
1559             res = riscv_iommu_iofence(s,
1560                 cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1 << 2,
1561                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
1562 
1563             if (res != MEMTX_OK) {
1564                 riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1565                                       RISCV_IOMMU_CQCSR_CQMF, 0);
1566                 goto fault;
1567             }
1568             break;
1569 
1570         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
1571                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1572             if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
1573                 /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
1574                 goto cmd_ill;
1575             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1576                 /* invalidate all cache mappings */
1577                 func = riscv_iommu_iot_inval_all;
1578             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1579                 /* invalidate cache matching GSCID */
1580                 func = riscv_iommu_iot_inval_gscid;
1581             } else {
1582                 /* invalidate cache matching GSCID and ADDR (GPA) */
1583                 func = riscv_iommu_iot_inval_gscid_gpa;
1584             }
1585             riscv_iommu_iot_inval(s, func,
1586                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
1587                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1588             break;
1589 
1590         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
1591                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1592             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1593                 /* invalidate all cache mappings, simplified model */
1594                 func = riscv_iommu_iot_inval_all;
1595             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
1596                 /* invalidate cache matching GSCID, simplified model */
1597                 func = riscv_iommu_iot_inval_gscid;
1598             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1599                 /* invalidate cache matching GSCID and PSCID */
1600                 func = riscv_iommu_iot_inval_pscid;
1601             } else {
1602                 /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
1603                 func = riscv_iommu_iot_inval_pscid_iova;
1604             }
1605             riscv_iommu_iot_inval(s, func,
1606                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
1607                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
1608                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1609             break;
1610 
1611         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
1612                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1613             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1614                 /* invalidate all device context cache mappings */
1615                 func = riscv_iommu_ctx_inval_all;
1616             } else {
1617                 /* invalidate all device context matching DID */
1618                 func = riscv_iommu_ctx_inval_devid;
1619             }
1620             riscv_iommu_ctx_inval(s, func,
1621                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
1622             break;
1623 
1624         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
1625                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1626             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1627                 /* illegal command arguments IODIR_PDT & DV == 0 */
1628                 goto cmd_ill;
1629             } else {
1630                 func = riscv_iommu_ctx_inval_devid_procid;
1631             }
1632             riscv_iommu_ctx_inval(s, func,
1633                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
1634                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
1635             break;
1636 
1637         /* ATS commands */
1638         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
1639                              RISCV_IOMMU_CMD_ATS_OPCODE):
1640             if (!s->enable_ats) {
1641                 goto cmd_ill;
1642             }
1643 
1644             riscv_iommu_ats_inval(s, &cmd);
1645             break;
1646 
1647         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
1648                              RISCV_IOMMU_CMD_ATS_OPCODE):
1649             if (!s->enable_ats) {
1650                 goto cmd_ill;
1651             }
1652 
1653             riscv_iommu_ats_prgr(s, &cmd);
1654             break;
1655 
1656         default:
1657         cmd_ill:
1658             /* Invalid command, do not advance the command queue head. */
1659             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1660                 RISCV_IOMMU_CQCSR_CMD_ILL, 0);
1661             goto fault;
1662         }
1663 
1664         /* Advance and update head pointer after command completes. */
1665         head = (head + 1) & s->cq_mask;
1666         riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
1667     }
1668     return;
1669 
1670 fault:
1671     if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
1672         riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
1673     }
1674 }
1675 
1676 static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
1677 {
1678     uint64_t base;
1679     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1680     uint32_t ctrl_clr;
1681     bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
1682     bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
1683 
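    /*
     * CQEN 0 -> 1 (re)initializes the queue: latch the base address and
     * size from CQB (LOG2SZ encodes log2(entries) - 1, hence the
     * 2^(LOG2SZ + 1) - 1 mask), reset head/tail, clear stale error bits
     * and report CQON.  CQEN 1 -> 0 disables the queue and makes the tail
     * register read-only.
     */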
1684     if (enable && !active) {
1685         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
1686         s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
1687         s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
1688         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
1689         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
1690         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
1691         ctrl_set = RISCV_IOMMU_CQCSR_CQON;
1692         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
1693                    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
1694                    RISCV_IOMMU_CQCSR_FENCE_W_IP;
1695     } else if (!enable && active) {
1696         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
1697         ctrl_set = 0;
1698         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
1699     } else {
1700         ctrl_set = 0;
1701         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
1702     }
1703 
1704     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
1705 }
1706 
1707 static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
1708 {
1709     uint64_t base;
1710     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1711     uint32_t ctrl_clr;
1712     bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
1713     bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
1714 
1715     if (enable && !active) {
1716         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
1717         s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
1718         s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
1719         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
1720         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
1721         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
1722         ctrl_set = RISCV_IOMMU_FQCSR_FQON;
1723         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
1724             RISCV_IOMMU_FQCSR_FQOF;
1725     } else if (!enable && active) {
1726         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
1727         ctrl_set = 0;
1728         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
1729     } else {
1730         ctrl_set = 0;
1731         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
1732     }
1733 
1734     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
1735 }
1736 
1737 static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
1738 {
1739     uint64_t base;
1740     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1741     uint32_t ctrl_clr;
1742     bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
1743     bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
1744 
1745     if (enable && !active) {
1746         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
1747         s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
1748         s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
1749         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
1750         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
1751         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
1752         ctrl_set = RISCV_IOMMU_PQCSR_PQON;
1753         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
1754             RISCV_IOMMU_PQCSR_PQOF;
1755     } else if (!enable && active) {
1756         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
1757         ctrl_set = 0;
1758         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
1759     } else {
1760         ctrl_set = 0;
1761         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
1762     }
1763 
1764     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
1765 }
1766 
1767 static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
1768 {
1769     uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
1770     uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
1771     unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
1772     unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
1773     RISCVIOMMUContext *ctx;
1774     void *ref;
1775 
1776     if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
1777         return;
1778     }
1779 
1780     ctx = riscv_iommu_ctx(s, devid, pid, &ref);
1781     if (ctx == NULL) {
1782         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
1783                                  RISCV_IOMMU_TR_RESPONSE_FAULT |
1784                                  (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
1785     } else {
1786         IOMMUTLBEntry iotlb = {
1787             .iova = iova,
1788             .perm = ctrl & RISCV_IOMMU_TR_REQ_CTL_NW ? IOMMU_RO : IOMMU_RW,
1789             .addr_mask = ~0,
1790             .target_as = NULL,
1791         };
1792         int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
1793         if (fault) {
1794             iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
1795         } else {
1796             iova = iotlb.translated_addr & ~iotlb.addr_mask;
1797             iova >>= TARGET_PAGE_BITS;
1798             iova &= RISCV_IOMMU_TR_RESPONSE_PPN;
1799 
1800             /* We do not support superpages (> 4 KiB) for now */
1801             iova &= ~RISCV_IOMMU_TR_RESPONSE_S;
1802         }
1803         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
1804     }
1805 
1806     riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
1807         RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
1808     riscv_iommu_ctx_put(s, ref);
1809 }
1810 
1811 typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
1812 
1813 static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
1814 {
1815     uint64_t icvec = 0;
1816 
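    /*
     * Clamp each requested vector index, field by field, to the value
     * exposed in icvec_avail_vectors before committing ICVEC.
     */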
1817     icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
1818                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
1819 
1820     icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
1821                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
1822 
1823     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
1824                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
1825 
1826     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
1827                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
1828 
1829     trace_riscv_iommu_icvec_write(data, icvec);
1830 
1831     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
1832 }
1833 
1834 static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
1835 {
1836     uint32_t cqcsr, fqcsr, pqcsr;
1837     uint32_t ipsr_set = 0;
1838     uint32_t ipsr_clr = 0;
1839 
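    /*
     * For each source whose bit is set in the written value, keep the
     * pending bit only if the source is enabled and one of its cause
     * flags is still active in the matching queue CSR; all other pending
     * bits are cleared.
     */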
1840     if (data & RISCV_IOMMU_IPSR_CIP) {
1841         cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1842 
1843         if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
1844             (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
1845              cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
1846              cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
1847              cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
1848             ipsr_set |= RISCV_IOMMU_IPSR_CIP;
1849         } else {
1850             ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1851         }
1852     } else {
1853         ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1854     }
1855 
1856     if (data & RISCV_IOMMU_IPSR_FIP) {
1857         fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1858 
1859         if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
1860             (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
1861              fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
1862             ipsr_set |= RISCV_IOMMU_IPSR_FIP;
1863         } else {
1864             ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1865         }
1866     } else {
1867         ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1868     }
1869 
1870     if (data & RISCV_IOMMU_IPSR_PIP) {
1871         pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1872 
1873         if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
1874             (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
1875              pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
1876             ipsr_set |= RISCV_IOMMU_IPSR_PIP;
1877         } else {
1878             ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1879         }
1880     } else {
1881         ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1882     }
1883 
1884     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
1885 }
1886 
1887 /*
1888  * Compute the value that a write of 'data' yields for the register at
1889  * 'reg_addr', honoring its read-only, read-write and write-1-to-clear
1890  * bits, and store the result in the buffer pointed to by 'dest'.
1891  *
1892  * The result is stored in little-endian byte order.
1893  */
1894 static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
1895                                       void *dest, hwaddr reg_addr,
1896                                       int size, uint64_t data)
1897 {
1898     uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
1899     uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
1900     uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
1901 
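    /*
     * Keep the current value of read-only bits, take 'data' for writable
     * bits, then clear any write-1-to-clear bits that were written as 1.
     */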
1902     stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
1903 }
1904 
1905 static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
1906                                           uint64_t data, unsigned size,
1907                                           MemTxAttrs attrs)
1908 {
1909     riscv_iommu_process_fn *process_fn = NULL;
1910     RISCVIOMMUState *s = opaque;
1911     uint32_t regb = addr & ~3;
1912     uint32_t busy = 0;
1913     uint64_t val = 0;
1914 
1915     if ((addr & (size - 1)) != 0) {
1916         /* Unsupported MMIO alignment or access size */
1917         return MEMTX_ERROR;
1918     }
1919 
1920     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
1921         /* Unsupported MMIO access location. */
1922         return MEMTX_ACCESS_ERROR;
1923     }
1924 
1925     /* Track actionable MMIO write. */
1926     switch (regb) {
1927     case RISCV_IOMMU_REG_DDTP:
1928     case RISCV_IOMMU_REG_DDTP + 4:
1929         process_fn = riscv_iommu_process_ddtp;
1930         regb = RISCV_IOMMU_REG_DDTP;
1931         busy = RISCV_IOMMU_DDTP_BUSY;
1932         break;
1933 
1934     case RISCV_IOMMU_REG_CQT:
1935         process_fn = riscv_iommu_process_cq_tail;
1936         break;
1937 
1938     case RISCV_IOMMU_REG_CQCSR:
1939         process_fn = riscv_iommu_process_cq_control;
1940         busy = RISCV_IOMMU_CQCSR_BUSY;
1941         break;
1942 
1943     case RISCV_IOMMU_REG_FQCSR:
1944         process_fn = riscv_iommu_process_fq_control;
1945         busy = RISCV_IOMMU_FQCSR_BUSY;
1946         break;
1947 
1948     case RISCV_IOMMU_REG_PQCSR:
1949         process_fn = riscv_iommu_process_pq_control;
1950         busy = RISCV_IOMMU_PQCSR_BUSY;
1951         break;
1952 
1953     case RISCV_IOMMU_REG_ICVEC:
1954     case RISCV_IOMMU_REG_IPSR:
1955         /*
1956          * ICVEC and IPSR have special read/write procedures. We'll
1957          * call their respective helpers and exit.
1958          */
1959         riscv_iommu_write_reg_val(s, &val, addr, size, data);
1960 
1961         /*
1962          * 'val' is stored as LE. Switch to host endianness
1963          * before using it.
1964          */
1965         val = le64_to_cpu(val);
1966 
1967         if (regb == RISCV_IOMMU_REG_ICVEC) {
1968             riscv_iommu_update_icvec(s, val);
1969         } else {
1970             riscv_iommu_update_ipsr(s, val);
1971         }
1972 
1973         return MEMTX_OK;
1974 
1975     case RISCV_IOMMU_REG_TR_REQ_CTL:
1976         process_fn = riscv_iommu_process_dbg;
1977         regb = RISCV_IOMMU_REG_TR_REQ_CTL;
1978         busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
1979         break;
1980 
1981     default:
1982         break;
1983     }
1984 
1985     /*
1986      * Register updates might not be synchronized with the core logic.
1987      * If system software updates a register while the relevant BUSY bit
1988      * is set, the IOMMU behavior for such additional writes is
1989      * UNSPECIFIED.
1990      */
1991     riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
1992 
1993     /* Busy flag update, MSB 4-byte register. */
1994     if (busy) {
1995         uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
1996         stl_le_p(&s->regs_rw[regb], rw | busy);
1997     }
1998 
1999     if (process_fn) {
2000         process_fn(s);
2001     }
2002 
2003     return MEMTX_OK;
2004 }
2005 
2006 static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
2007     uint64_t *data, unsigned size, MemTxAttrs attrs)
2008 {
2009     RISCVIOMMUState *s = opaque;
2010     uint64_t val = -1;
2011     uint8_t *ptr;
2012 
2013     if ((addr & (size - 1)) != 0) {
2014         /* Unsupported MMIO alignment. */
2015         return MEMTX_ERROR;
2016     }
2017 
2018     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
2019         return MEMTX_ACCESS_ERROR;
2020     }
2021 
2022     ptr = &s->regs_rw[addr];
2023     val = ldn_le_p(ptr, size);
2024 
2025     *data = val;
2026 
2027     return MEMTX_OK;
2028 }
2029 
2030 static const MemoryRegionOps riscv_iommu_mmio_ops = {
2031     .read_with_attrs = riscv_iommu_mmio_read,
2032     .write_with_attrs = riscv_iommu_mmio_write,
2033     .endianness = DEVICE_NATIVE_ENDIAN,
2034     .impl = {
2035         .min_access_size = 4,
2036         .max_access_size = 8,
2037         .unaligned = false,
2038     },
2039     .valid = {
2040         .min_access_size = 4,
2041         .max_access_size = 8,
2042     }
2043 };
2044 
2045 /*
2046  * Translations matching the MSI pattern check are redirected to the
2047  * "riscv-iommu-trap" memory region as untranslated addresses, allowing
2048  * additional MSI/MRIF interception by the IOMMU interrupt remapping
2049  * implementation.
2050  * Note: Device emulation code generating an MSI is expected to provide
2051  * valid memory transaction attributes with requester_id set.
2051  */
2052 static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
2053     uint64_t data, unsigned size, MemTxAttrs attrs)
2054 {
2055     RISCVIOMMUState *s = (RISCVIOMMUState *)opaque;
2056     RISCVIOMMUContext *ctx;
2057     MemTxResult res;
2058     void *ref;
2059     uint32_t devid = attrs.requester_id;
2060 
2061     if (attrs.unspecified) {
2062         return MEMTX_ACCESS_ERROR;
2063     }
2064 
2065     /* FIXME: PCIe bus remapping for attached endpoints. */
2066     devid |= s->bus << 8;
2067 
2068     ctx = riscv_iommu_ctx(s, devid, 0, &ref);
2069     if (ctx == NULL) {
2070         res = MEMTX_ACCESS_ERROR;
2071     } else {
2072         res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
2073     }
2074     riscv_iommu_ctx_put(s, ref);
2075     return res;
2076 }
2077 
2078 static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
2079     uint64_t *data, unsigned size, MemTxAttrs attrs)
2080 {
2081     return MEMTX_ACCESS_ERROR;
2082 }
2083 
2084 static const MemoryRegionOps riscv_iommu_trap_ops = {
2085     .read_with_attrs = riscv_iommu_trap_read,
2086     .write_with_attrs = riscv_iommu_trap_write,
2087     .endianness = DEVICE_LITTLE_ENDIAN,
2088     .impl = {
2089         .min_access_size = 4,
2090         .max_access_size = 8,
2091         .unaligned = true,
2092     },
2093     .valid = {
2094         .min_access_size = 4,
2095         .max_access_size = 8,
2096     }
2097 };
2098 
2099 static void riscv_iommu_realize(DeviceState *dev, Error **errp)
2100 {
2101     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2102 
2103     s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
2104     if (s->enable_msi) {
2105         s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
2106     }
2107     if (s->enable_ats) {
2108         s->cap |= RISCV_IOMMU_CAP_ATS;
2109     }
2110     if (s->enable_s_stage) {
2111         s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
2112                   RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
2113     }
2114     if (s->enable_g_stage) {
2115         s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
2116                   RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
2117     }
2118     /* Enable translation debug interface */
2119     s->cap |= RISCV_IOMMU_CAP_DBG;
2120 
2121     /* Report QEMU target physical address space limits */
2122     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
2123                        TARGET_PHYS_ADDR_SPACE_BITS);
2124 
2125     /* TODO: method to report supported PID bits */
2126     s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
2127     s->cap |= RISCV_IOMMU_CAP_PD8;
2128 
2129     /* Out-of-reset translation mode: OFF (DMA disabled) or BARE (passthrough) */
2130     s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
2131                         RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
2132 
2133     /* register storage */
2134     s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2135     s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2136     s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2137 
2138     /* Mark all registers read-only */
2139     memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
2140 
2141     /*
2142      * Register complete MMIO space, including MSI/PBA registers.
2143      * Note: the PCIDevice implementation will add an overlapping MR for
2144      * MSI/PBA, which it manages directly.
2145      */
2146     memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
2147         "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
2148 
2149     /* Set power-on register state */
2150     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
2151     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
2152     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
2153              ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
2154     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
2155         ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
2156     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
2157         ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
2158     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
2159         ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
2160     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
2161         ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
2162     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
2163         RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
2164     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
2165         RISCV_IOMMU_CQCSR_BUSY);
2166     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
2167         RISCV_IOMMU_FQCSR_FQOF);
2168     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
2169         RISCV_IOMMU_FQCSR_BUSY);
2170     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
2171         RISCV_IOMMU_PQCSR_PQOF);
2172     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
2173         RISCV_IOMMU_PQCSR_BUSY);
2174     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
2175     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
2176     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
2177     /* If debug registers enabled. */
2178     if (s->cap & RISCV_IOMMU_CAP_DBG) {
2179         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
2180         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
2181             RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
2182     }
2183 
2184     /* Memory region for downstream access, if specified. */
2185     if (s->target_mr) {
2186         s->target_as = g_new0(AddressSpace, 1);
2187         address_space_init(s->target_as, s->target_mr,
2188             "riscv-iommu-downstream");
2189     } else {
2190         /* Fallback to global system memory. */
2191         s->target_as = &address_space_memory;
2192     }
2193 
2194     /* Memory region for untranslated MRIF/MSI writes */
2195     memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
2196             "riscv-iommu-trap", ~0ULL);
2197     address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
2198 
2199     /* Device translation context cache */
2200     s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
2201                                          riscv_iommu_ctx_equal,
2202                                          g_free, NULL);
2203 
2204     s->iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
2205                                          riscv_iommu_iot_equal,
2206                                          g_free, NULL);
2207 
2208     s->iommus.le_next = NULL;
2209     s->iommus.le_prev = NULL;
2210     QLIST_INIT(&s->spaces);
2211 }
2212 
2213 static void riscv_iommu_unrealize(DeviceState *dev)
2214 {
2215     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2216 
2217     g_hash_table_unref(s->iot_cache);
2218     g_hash_table_unref(s->ctx_cache);
2219 }
2220 
2221 static Property riscv_iommu_properties[] = {
2222     DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
2223         RISCV_IOMMU_SPEC_DOT_VER),
2224     DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
2225     DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
2226         LIMIT_CACHE_IOT),
2227     DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
2228     DEFINE_PROP_BOOL("ats", RISCVIOMMUState, enable_ats, TRUE),
2229     DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
2230     DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
2231     DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
2232     DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
2233         TYPE_MEMORY_REGION, MemoryRegion *),
2234     DEFINE_PROP_END_OF_LIST(),
2235 };
2236 
2237 static void riscv_iommu_class_init(ObjectClass *klass, void *data)
2238 {
2239     DeviceClass *dc = DEVICE_CLASS(klass);
2240 
2241     /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
2242     dc->user_creatable = false;
2243     dc->realize = riscv_iommu_realize;
2244     dc->unrealize = riscv_iommu_unrealize;
2245     device_class_set_props(dc, riscv_iommu_properties);
2246 }
2247 
2248 static const TypeInfo riscv_iommu_info = {
2249     .name = TYPE_RISCV_IOMMU,
2250     .parent = TYPE_DEVICE,
2251     .instance_size = sizeof(RISCVIOMMUState),
2252     .class_init = riscv_iommu_class_init,
2253 };
2254 
2255 static const char *IOMMU_FLAG_STR[] = {
2256     "NA",
2257     "RO",
2258     "WR",
2259     "RW",
2260 };
2261 
2262 /* RISC-V IOMMU Memory Region - Address Translation Space */
2263 static IOMMUTLBEntry riscv_iommu_memory_region_translate(
2264     IOMMUMemoryRegion *iommu_mr, hwaddr addr,
2265     IOMMUAccessFlags flag, int iommu_idx)
2266 {
2267     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2268     RISCVIOMMUContext *ctx;
2269     void *ref;
2270     IOMMUTLBEntry iotlb = {
2271         .iova = addr,
2272         .target_as = as->iommu->target_as,
2273         .addr_mask = ~0ULL,
2274         .perm = flag,
2275     };
2276 
2277     ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
2278     if (ctx == NULL) {
2279         /* Translation disabled or invalid. */
2280         iotlb.addr_mask = 0;
2281         iotlb.perm = IOMMU_NONE;
2282     } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
2283         /* Translation disabled or fault reported. */
2284         iotlb.addr_mask = 0;
2285         iotlb.perm = IOMMU_NONE;
2286     }
2287 
2288     /* Trace all dma translations with original access flags. */
2289     trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
2290                           PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
2291                           IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
2292                           iotlb.translated_addr);
2293 
2294     riscv_iommu_ctx_put(as->iommu, ref);
2295 
2296     return iotlb;
2297 }
2298 
2299 static int riscv_iommu_memory_region_notify(
2300     IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
2301     IOMMUNotifierFlag new, Error **errp)
2302 {
2303     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2304 
2305     if (old == IOMMU_NOTIFIER_NONE) {
2306         as->notifier = true;
2307         trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
2308     } else if (new == IOMMU_NOTIFIER_NONE) {
2309         as->notifier = false;
2310         trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
2311     }
2312 
2313     return 0;
2314 }
2315 
2316 static inline bool pci_is_iommu(PCIDevice *pdev)
2317 {
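    /* PCI class 0x0806: base class 08h (system peripheral), sub-class 06h (IOMMU). */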
2318     return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
2319 }
2320 
2321 static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
2322 {
2323     RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
2324     PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
2325     AddressSpace *as = NULL;
2326 
2327     if (pdev && pci_is_iommu(pdev)) {
2328         return s->target_as;
2329     }
2330 
2331     /* Find first registered IOMMU device */
2332     while (s->iommus.le_prev) {
2333         s = *(s->iommus.le_prev);
2334     }
2335 
2336     /* Find first matching IOMMU */
2337     while (s != NULL && as == NULL) {
2338         as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
2339         s = s->iommus.le_next;
2340     }
2341 
2342     return as ? as : &address_space_memory;
2343 }
2344 
2345 static const PCIIOMMUOps riscv_iommu_ops = {
2346     .get_address_space = riscv_iommu_find_as,
2347 };
2348 
2349 void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
2350         Error **errp)
2351 {
2352     if (bus->iommu_ops &&
2353         bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
2354         /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
2355         RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
2356         QLIST_INSERT_AFTER(last, iommu, iommus);
2357     } else if (!bus->iommu_ops && !bus->iommu_opaque) {
2358         pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
2359     } else {
2360         error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
2361             pci_bus_num(bus));
2362     }
2363 }
2364 
2365 static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
2366     MemTxAttrs attrs)
2367 {
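    /* Transactions without a PASID map to the default process index. */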
2368     return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
2369 }
2370 
2371 static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
2372 {
2373     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2374     return 1 << as->iommu->pid_bits;
2375 }
2376 
2377 static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
2378 {
2379     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
2380 
2381     imrc->translate = riscv_iommu_memory_region_translate;
2382     imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
2383     imrc->attrs_to_index = riscv_iommu_memory_region_index;
2384     imrc->num_indexes = riscv_iommu_memory_region_index_len;
2385 }
2386 
2387 static const TypeInfo riscv_iommu_memory_region_info = {
2388     .parent = TYPE_IOMMU_MEMORY_REGION,
2389     .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
2390     .class_init = riscv_iommu_memory_region_init,
2391 };
2392 
2393 static void riscv_iommu_register_mr_types(void)
2394 {
2395     type_register_static(&riscv_iommu_memory_region_info);
2396     type_register_static(&riscv_iommu_info);
2397 }
2398 
2399 type_init(riscv_iommu_register_mr_types);
2400