1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qapi/error.h" 25 #include "hw/sysbus.h" 26 #include "exec/address-spaces.h" 27 #include "intel_iommu_internal.h" 28 #include "hw/pci/pci.h" 29 #include "hw/pci/pci_bus.h" 30 #include "hw/i386/pc.h" 31 #include "hw/i386/apic-msidef.h" 32 #include "hw/boards.h" 33 #include "hw/i386/x86-iommu.h" 34 #include "hw/pci-host/q35.h" 35 #include "sysemu/kvm.h" 36 #include "hw/i386/apic_internal.h" 37 #include "kvm_i386.h" 38 #include "trace.h" 39 40 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 41 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 42 43 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 44 uint64_t wmask, uint64_t w1cmask) 45 { 46 stq_le_p(&s->csr[addr], val); 47 stq_le_p(&s->wmask[addr], wmask); 48 stq_le_p(&s->w1cmask[addr], w1cmask); 49 } 50 51 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 52 { 53 stq_le_p(&s->womask[addr], mask); 54 } 55 56 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 57 uint32_t wmask, uint32_t w1cmask) 58 { 59 stl_le_p(&s->csr[addr], val); 60 stl_le_p(&s->wmask[addr], wmask); 61 stl_le_p(&s->w1cmask[addr], w1cmask); 62 } 63 64 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 65 { 66 stl_le_p(&s->womask[addr], mask); 67 } 68 69 /* "External" get/set operations */ 70 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 71 { 72 uint64_t oldval = ldq_le_p(&s->csr[addr]); 73 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 74 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 75 stq_le_p(&s->csr[addr], 76 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 77 } 78 79 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 80 { 81 uint32_t oldval = ldl_le_p(&s->csr[addr]); 82 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 83 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 84 stl_le_p(&s->csr[addr], 85 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 86 } 87 88 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 89 { 90 uint64_t val = ldq_le_p(&s->csr[addr]); 91 uint64_t womask = ldq_le_p(&s->womask[addr]); 92 return val & ~womask; 93 } 94 95 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 96 { 97 uint32_t val = ldl_le_p(&s->csr[addr]); 98 uint32_t womask = ldl_le_p(&s->womask[addr]); 99 return val & ~womask; 100 } 101 102 /* "Internal" get/set operations */ 103 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 104 { 105 return ldq_le_p(&s->csr[addr]); 106 } 107 108 static uint32_t 
vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 109 { 110 return ldl_le_p(&s->csr[addr]); 111 } 112 113 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 114 { 115 stq_le_p(&s->csr[addr], val); 116 } 117 118 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 119 uint32_t clear, uint32_t mask) 120 { 121 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 122 stl_le_p(&s->csr[addr], new_val); 123 return new_val; 124 } 125 126 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 127 uint64_t clear, uint64_t mask) 128 { 129 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 130 stq_le_p(&s->csr[addr], new_val); 131 return new_val; 132 } 133 134 static inline void vtd_iommu_lock(IntelIOMMUState *s) 135 { 136 qemu_mutex_lock(&s->iommu_lock); 137 } 138 139 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 140 { 141 qemu_mutex_unlock(&s->iommu_lock); 142 } 143 144 /* Whether the address space needs to notify new mappings */ 145 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 146 { 147 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 148 } 149 150 /* GHashTable functions */ 151 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 152 { 153 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 154 } 155 156 static guint vtd_uint64_hash(gconstpointer v) 157 { 158 return (guint)*(const uint64_t *)v; 159 } 160 161 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 162 gpointer user_data) 163 { 164 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 165 uint16_t domain_id = *(uint16_t *)user_data; 166 return entry->domain_id == domain_id; 167 } 168 169 /* The shift of an addr for a certain level of paging structure */ 170 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 171 { 172 assert(level != 0); 173 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 174 } 175 176 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 177 { 178 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 179 } 180 181 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 182 gpointer user_data) 183 { 184 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 185 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 186 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 187 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 188 return (entry->domain_id == info->domain_id) && 189 (((entry->gfn & info->mask) == gfn) || 190 (entry->gfn == gfn_tlb)); 191 } 192 193 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 194 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 195 */ 196 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 197 { 198 VTDAddressSpace *vtd_as; 199 VTDBus *vtd_bus; 200 GHashTableIter bus_it; 201 uint32_t devfn_it; 202 203 trace_vtd_context_cache_reset(); 204 205 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 206 207 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 208 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 209 vtd_as = vtd_bus->dev_as[devfn_it]; 210 if (!vtd_as) { 211 continue; 212 } 213 vtd_as->context_cache_entry.context_cache_gen = 0; 214 } 215 } 216 s->context_cache_gen = 1; 217 } 218 219 /* Must be called with IOMMU lock held. 
*/ 220 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 221 { 222 assert(s->iotlb); 223 g_hash_table_remove_all(s->iotlb); 224 } 225 226 static void vtd_reset_iotlb(IntelIOMMUState *s) 227 { 228 vtd_iommu_lock(s); 229 vtd_reset_iotlb_locked(s); 230 vtd_iommu_unlock(s); 231 } 232 233 static void vtd_reset_caches(IntelIOMMUState *s) 234 { 235 vtd_iommu_lock(s); 236 vtd_reset_iotlb_locked(s); 237 vtd_reset_context_cache_locked(s); 238 vtd_iommu_unlock(s); 239 } 240 241 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 242 uint32_t level) 243 { 244 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 245 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 246 } 247 248 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 249 { 250 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 251 } 252 253 /* Must be called with IOMMU lock held */ 254 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 255 hwaddr addr) 256 { 257 VTDIOTLBEntry *entry; 258 uint64_t key; 259 int level; 260 261 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 262 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 263 source_id, level); 264 entry = g_hash_table_lookup(s->iotlb, &key); 265 if (entry) { 266 goto out; 267 } 268 } 269 270 out: 271 return entry; 272 } 273 274 /* Must be with IOMMU lock held */ 275 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 276 uint16_t domain_id, hwaddr addr, uint64_t slpte, 277 uint8_t access_flags, uint32_t level) 278 { 279 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 280 uint64_t *key = g_malloc(sizeof(*key)); 281 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 282 283 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 284 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 285 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 286 vtd_reset_iotlb_locked(s); 287 } 288 289 entry->gfn = gfn; 290 entry->domain_id = domain_id; 291 entry->slpte = slpte; 292 entry->access_flags = access_flags; 293 entry->mask = vtd_slpt_level_page_mask(level); 294 *key = vtd_get_iotlb_key(gfn, source_id, level); 295 g_hash_table_replace(s->iotlb, key, entry); 296 } 297 298 /* Given the reg addr of both the message data and address, generate an 299 * interrupt via MSI. 300 */ 301 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 302 hwaddr mesg_data_reg) 303 { 304 MSIMessage msi; 305 306 assert(mesg_data_reg < DMAR_REG_SIZE); 307 assert(mesg_addr_reg < DMAR_REG_SIZE); 308 309 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 310 msi.data = vtd_get_long_raw(s, mesg_data_reg); 311 312 trace_vtd_irq_generate(msi.address, msi.data); 313 314 apic_get_class()->send_msi(&msi); 315 } 316 317 /* Generate a fault event to software via MSI if conditions are met. 318 * Notice that the value of FSTS_REG being passed to it should be the one 319 * before any update. 
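 * (If PPF, PFO or IQE was already set in that pre-update value, software
 * still has an earlier interrupt condition to service, so no new fault
 * event is generated below.)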
 */
static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
{
    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
        pre_fsts & VTD_FSTS_IQE) {
        error_report_once("There are previous interrupt conditions "
                          "to be serviced by software, fault event "
                          "is not generated");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
        error_report_once("Interrupt Mask set, irq is not generated");
    } else {
        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
    }
}

/* Check if the Fault (F) field of the Fault Recording Register referenced by
 * @index is Set.
 */
static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
}

/* Update the PPF field of Fault Status Register.
 * Should be called whenever the F field of any fault recording register
 * changes.
 */
static void vtd_update_fsts_ppf(IntelIOMMUState *s)
{
    uint32_t i;
    uint32_t ppf_mask = 0;

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        if (vtd_is_frcd_set(s, i)) {
            ppf_mask = VTD_FSTS_PPF;
            break;
        }
    }
    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
    trace_vtd_fsts_ppf(!!ppf_mask);
}

static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
    vtd_update_fsts_ppf(s);
}

/* Must not update F field now, should be done later */
static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
                            uint16_t source_id, hwaddr addr,
                            VTDFaultReason fault, bool is_write)
{
    uint64_t hi = 0, lo;
    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);

    assert(index < DMAR_FRCD_REG_NR);

    lo = VTD_FRCD_FI(addr);
    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
    if (!is_write) {
        hi |= VTD_FRCD_T;
    }
    vtd_set_quad_raw(s, frcd_reg_addr, lo);
    vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);

    trace_vtd_frr_new(index, hi, lo);
}

/* Try to collapse multiple pending faults from the same requester */
static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
{
    uint32_t i;
    uint64_t frcd_reg;
    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        frcd_reg = vtd_get_quad_raw(s, addr);
        if ((frcd_reg & VTD_FRCD_F) &&
            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
            return true;
        }
        addr += 16; /* 128-bit for each */
    }
    return false;
}

/* Log and report a DMAR (address translation) fault to software */
static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
                                  hwaddr addr, VTDFaultReason fault,
                                  bool is_write)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    assert(fault < VTD_FR_MAX);

    if (fault == VTD_FR_RESERVED_ERR) {
        /* This is not a normal fault reason case. Drop it. */
        return;
    }

    trace_vtd_dmar_fault(source_id, fault, addr, is_write);

    if (fsts_reg & VTD_FSTS_PFO) {
        error_report_once("New fault is not recorded due to "
                          "Primary Fault Overflow");
        return;
    }

    if (vtd_try_collapse_fault(s, source_id)) {
        error_report_once("New fault is not recorded due to "
                          "compression of faults");
        return;
    }

    if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
        error_report_once("Next Fault Recording Reg is used, "
                          "new fault is not recorded, set PFO field");
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
        return;
    }

    vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);

    if (fsts_reg & VTD_FSTS_PPF) {
        error_report_once("There are pending faults already, "
                          "fault event is not generated");
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
    } else {
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
                                VTD_FSTS_FRI(s->next_frcd_reg));
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
        /* This case actually causes the PPF to be Set.
         * So generate fault event (interrupt).
         */
        vtd_generate_fault_event(s, fsts_reg);
    }
}

/* Handle Invalidation Queue Errors of the queued invalidation interface */
static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
    vtd_generate_fault_event(s, fsts_reg);
}

/* Set the IWC field and try to generate an invalidation completion interrupt */
static void vtd_generate_completion_event(IntelIOMMUState *s)
{
    if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
        trace_vtd_inv_desc_wait_irq("One pending, skip current");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
    vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
    if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
        trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
                                    "new event not generated");
        return;
    } else {
        /* Generate the interrupt event */
        trace_vtd_inv_desc_wait_irq("Generating complete event");
        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
    }
}

static inline bool vtd_root_entry_present(VTDRootEntry *root)
{
    return root->val & VTD_ROOT_ENTRY_P;
}

static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
                              VTDRootEntry *re)
{
    dma_addr_t addr;

    addr = s->root + index * sizeof(*re);
    if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
        trace_vtd_re_invalid(re->rsvd, re->val);
        re->val = 0;
        return -VTD_FR_ROOT_TABLE_INV;
    }
    re->val = le64_to_cpu(re->val);
    return 0;
}

static inline bool vtd_ce_present(VTDContextEntry *context)
{
    return context->lo & VTD_CONTEXT_ENTRY_P;
}

static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index,
                                           VTDContextEntry *ce)
{
    dma_addr_t addr;

    /* we have checked that root entry is present */
    addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
        trace_vtd_re_invalid(root->rsvd, root->val);
        return -VTD_FR_CONTEXT_TABLE_INV;
    }
    ce->lo = le64_to_cpu(ce->lo);
    ce->hi = le64_to_cpu(ce->hi);
    return 0;
}

static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}

static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
}

/* Whether the pte indicates the address of the page frame */
static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
{
    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
}

/* Get the content of an slpte located in @base_addr[@index] */
static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
{
    uint64_t slpte;

    assert(index < VTD_SL_PT_ENTRY_NR);

    if (dma_memory_read(&address_space_memory,
                        base_addr + index * sizeof(slpte), &slpte,
                        sizeof(slpte))) {
        slpte = (uint64_t)-1;
        return slpte;
    }
    slpte = le64_to_cpu(slpte);
    return slpte;
}

/* Given an iova and the level of paging structure, return the offset
 * of the current level.
 */
static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
{
    return (iova >> vtd_slpt_level_shift(level)) &
           ((1ULL << VTD_SL_LEVEL_BITS) - 1);
}

/* Check Capability Register to see if the @level of page-table is supported */
static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
{
    return VTD_CAP_SAGAW_MASK & s->cap &
           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
}

/* Get the page-table level that hardware should use for the second-level
 * page-table walk from the Address Width field of context-entry.
 */
static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
{
    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
}

static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
{
    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
}

static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_TT;
}

/* Return true if check passed, otherwise false */
static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
                                     VTDContextEntry *ce)
{
    switch (vtd_ce_get_type(ce)) {
    case VTD_CONTEXT_TT_MULTI_LEVEL:
        /* Always supported */
        break;
    case VTD_CONTEXT_TT_DEV_IOTLB:
        if (!x86_iommu->dt_supported) {
            return false;
        }
        break;
    case VTD_CONTEXT_TT_PASS_THROUGH:
        if (!x86_iommu->pt_supported) {
            return false;
        }
        break;
    default:
        /* Unknown type */
        return false;
    }
    return true;
}

static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
{
    uint32_t ce_agaw = vtd_ce_get_agaw(ce);
    return 1ULL << MIN(ce_agaw, aw);
}

/* Return true if IOVA passes range check, otherwise false. */
static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
                                        uint8_t aw)
{
    /*
     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
     * in CAP_REG and AW in context-entry.
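     * E.g. with a 4-level page table (AGAW 48) and aw == 48, the limit
     * from vtd_iova_limit() is 1ULL << 48, so any iova with bits 63:48
     * set fails the check below.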
661 */ 662 return !(iova & ~(vtd_iova_limit(ce, aw) - 1)); 663 } 664 665 /* 666 * Rsvd field masks for spte: 667 * Index [1] to [4] 4k pages 668 * Index [5] to [8] large pages 669 */ 670 static uint64_t vtd_paging_entry_rsvd_field[9]; 671 672 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 673 { 674 if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) { 675 /* Maybe large page */ 676 return slpte & vtd_paging_entry_rsvd_field[level + 4]; 677 } else { 678 return slpte & vtd_paging_entry_rsvd_field[level]; 679 } 680 } 681 682 /* Find the VTD address space associated with a given bus number */ 683 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 684 { 685 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 686 if (!vtd_bus) { 687 /* 688 * Iterate over the registered buses to find the one which 689 * currently hold this bus number, and update the bus_num 690 * lookup table: 691 */ 692 GHashTableIter iter; 693 694 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 695 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 696 if (pci_bus_num(vtd_bus->bus) == bus_num) { 697 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 698 return vtd_bus; 699 } 700 } 701 } 702 return vtd_bus; 703 } 704 705 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 706 * of the translation, can be used for deciding the size of large page. 707 */ 708 static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, 709 uint64_t *slptep, uint32_t *slpte_level, 710 bool *reads, bool *writes, uint8_t aw_bits) 711 { 712 dma_addr_t addr = vtd_ce_get_slpt_base(ce); 713 uint32_t level = vtd_ce_get_level(ce); 714 uint32_t offset; 715 uint64_t slpte; 716 uint64_t access_right_check; 717 718 if (!vtd_iova_range_check(iova, ce, aw_bits)) { 719 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 720 __func__, iova); 721 return -VTD_FR_ADDR_BEYOND_MGAW; 722 } 723 724 /* FIXME: what is the Atomics request here? */ 725 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 726 727 while (true) { 728 offset = vtd_iova_level_offset(iova, level); 729 slpte = vtd_get_slpte(addr, offset); 730 731 if (slpte == (uint64_t)-1) { 732 error_report_once("%s: detected read error on DMAR slpte " 733 "(iova=0x%" PRIx64 ")", __func__, iova); 734 if (level == vtd_ce_get_level(ce)) { 735 /* Invalid programming of context-entry */ 736 return -VTD_FR_CONTEXT_ENTRY_INV; 737 } else { 738 return -VTD_FR_PAGING_ENTRY_INV; 739 } 740 } 741 *reads = (*reads) && (slpte & VTD_SL_R); 742 *writes = (*writes) && (slpte & VTD_SL_W); 743 if (!(slpte & access_right_check)) { 744 error_report_once("%s: detected slpte permission error " 745 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 746 "slpte=0x%" PRIx64 ", write=%d)", __func__, 747 iova, level, slpte, is_write); 748 return is_write ? 
                              -VTD_FR_WRITE : -VTD_FR_READ;
        }
        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
            error_report_once("%s: detected slpte reserved non-zero bits "
                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
                              "slpte=0x%" PRIx64 ")", __func__, iova,
                              level, slpte);
            return -VTD_FR_PAGING_ENTRY_RSVD;
        }

        if (vtd_is_last_slpte(slpte, level)) {
            *slptep = slpte;
            *slpte_level = level;
            return 0;
        }
        addr = vtd_get_slpte_addr(slpte, aw_bits);
        level--;
    }
}

typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);

/**
 * Constant information used during page walking
 *
 * @hook_fn: hook function to be called for each page detected
 * @private: private data to be passed into hook func
 * @notify_unmap: whether we should notify invalid entries
 * @as: VT-d address space of the device
 * @aw: maximum address width
 * @domain_id: domain ID of the page walk
 */
typedef struct {
    VTDAddressSpace *as;
    vtd_page_walk_hook hook_fn;
    void *private;
    bool notify_unmap;
    uint8_t aw;
    uint16_t domain_id;
} vtd_page_walk_info;

static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
{
    VTDAddressSpace *as = info->as;
    vtd_page_walk_hook hook_fn = info->hook_fn;
    void *private = info->private;
    DMAMap target = {
        .iova = entry->iova,
        .size = entry->addr_mask,
        .translated_addr = entry->translated_addr,
        .perm = entry->perm,
    };
    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);

    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
        return 0;
    }

    assert(hook_fn);

    /* Update local IOVA mapped ranges */
    if (entry->perm) {
        if (mapped) {
            /* If it's exactly the same translation, skip */
            if (!memcmp(mapped, &target, sizeof(target))) {
                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
                                                 entry->translated_addr);
                return 0;
            } else {
                /*
                 * Translation changed. Normally this should not
                 * happen, but it can happen with buggy guest
                 * OSes. Note that there will be a small window
                 * during which we have no mapping at all. But
                 * that's the best effort we can do. The ideal way
                 * to emulate this is to atomically modify the PTE
                 * to follow what has changed, but we can't. One
                 * example is that the vfio driver only has
                 * VFIO_IOMMU_[UN]MAP_DMA but no interface to
                 * modify a mapping (meanwhile it seems meaningless
                 * to even provide one). Anyway, let's mark this as
                 * a TODO in case one day we'll have
                 * a better solution.
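                 * For now the change is emulated as an UNMAP of the
                 * old translation followed by a MAP of the new one,
                 * as done just below.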
831 */ 832 IOMMUAccessFlags cache_perm = entry->perm; 833 int ret; 834 835 /* Emulate an UNMAP */ 836 entry->perm = IOMMU_NONE; 837 trace_vtd_page_walk_one(info->domain_id, 838 entry->iova, 839 entry->translated_addr, 840 entry->addr_mask, 841 entry->perm); 842 ret = hook_fn(entry, private); 843 if (ret) { 844 return ret; 845 } 846 /* Drop any existing mapping */ 847 iova_tree_remove(as->iova_tree, &target); 848 /* Recover the correct permission */ 849 entry->perm = cache_perm; 850 } 851 } 852 iova_tree_insert(as->iova_tree, &target); 853 } else { 854 if (!mapped) { 855 /* Skip since we didn't map this range at all */ 856 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 857 return 0; 858 } 859 iova_tree_remove(as->iova_tree, &target); 860 } 861 862 trace_vtd_page_walk_one(info->domain_id, entry->iova, 863 entry->translated_addr, entry->addr_mask, 864 entry->perm); 865 return hook_fn(entry, private); 866 } 867 868 /** 869 * vtd_page_walk_level - walk over specific level for IOVA range 870 * 871 * @addr: base GPA addr to start the walk 872 * @start: IOVA range start address 873 * @end: IOVA range end address (start <= addr < end) 874 * @read: whether parent level has read permission 875 * @write: whether parent level has write permission 876 * @info: constant information for the page walk 877 */ 878 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 879 uint64_t end, uint32_t level, bool read, 880 bool write, vtd_page_walk_info *info) 881 { 882 bool read_cur, write_cur, entry_valid; 883 uint32_t offset; 884 uint64_t slpte; 885 uint64_t subpage_size, subpage_mask; 886 IOMMUTLBEntry entry; 887 uint64_t iova = start; 888 uint64_t iova_next; 889 int ret = 0; 890 891 trace_vtd_page_walk_level(addr, level, start, end); 892 893 subpage_size = 1ULL << vtd_slpt_level_shift(level); 894 subpage_mask = vtd_slpt_level_page_mask(level); 895 896 while (iova < end) { 897 iova_next = (iova & subpage_mask) + subpage_size; 898 899 offset = vtd_iova_level_offset(iova, level); 900 slpte = vtd_get_slpte(addr, offset); 901 902 if (slpte == (uint64_t)-1) { 903 trace_vtd_page_walk_skip_read(iova, iova_next); 904 goto next; 905 } 906 907 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 908 trace_vtd_page_walk_skip_reserve(iova, iova_next); 909 goto next; 910 } 911 912 /* Permissions are stacked with parents' */ 913 read_cur = read && (slpte & VTD_SL_R); 914 write_cur = write && (slpte & VTD_SL_W); 915 916 /* 917 * As long as we have either read/write permission, this is a 918 * valid entry. The rule works for both page entries and page 919 * table entries. 920 */ 921 entry_valid = read_cur | write_cur; 922 923 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 924 /* 925 * This is a valid PDE (or even bigger than PDE). We need 926 * to walk one further level. 927 */ 928 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 929 iova, MIN(iova_next, end), level - 1, 930 read_cur, write_cur, info); 931 } else { 932 /* 933 * This means we are either: 934 * 935 * (1) the real page entry (either 4K page, or huge page) 936 * (2) the whole range is invalid 937 * 938 * In either case, we send an IOTLB notification down. 
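             * The entry's addr_mask encodes the region size, e.g.
             * 0xfff for a 4K page at level 1 and 0x1fffff for a 2M
             * page at level 2.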
939 */ 940 entry.target_as = &address_space_memory; 941 entry.iova = iova & subpage_mask; 942 entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 943 entry.addr_mask = ~subpage_mask; 944 /* NOTE: this is only meaningful if entry_valid == true */ 945 entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 946 ret = vtd_page_walk_one(&entry, info); 947 } 948 949 if (ret < 0) { 950 return ret; 951 } 952 953 next: 954 iova = iova_next; 955 } 956 957 return 0; 958 } 959 960 /** 961 * vtd_page_walk - walk specific IOVA range, and call the hook 962 * 963 * @ce: context entry to walk upon 964 * @start: IOVA address to start the walk 965 * @end: IOVA range end address (start <= addr < end) 966 * @info: page walking information struct 967 */ 968 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, 969 vtd_page_walk_info *info) 970 { 971 dma_addr_t addr = vtd_ce_get_slpt_base(ce); 972 uint32_t level = vtd_ce_get_level(ce); 973 974 if (!vtd_iova_range_check(start, ce, info->aw)) { 975 return -VTD_FR_ADDR_BEYOND_MGAW; 976 } 977 978 if (!vtd_iova_range_check(end, ce, info->aw)) { 979 /* Fix end so that it reaches the maximum */ 980 end = vtd_iova_limit(ce, info->aw); 981 } 982 983 return vtd_page_walk_level(addr, start, end, level, true, true, info); 984 } 985 986 /* Map a device to its corresponding domain (context-entry) */ 987 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 988 uint8_t devfn, VTDContextEntry *ce) 989 { 990 VTDRootEntry re; 991 int ret_fr; 992 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 993 994 ret_fr = vtd_get_root_entry(s, bus_num, &re); 995 if (ret_fr) { 996 return ret_fr; 997 } 998 999 if (!vtd_root_entry_present(&re)) { 1000 /* Not error - it's okay we don't have root entry. */ 1001 trace_vtd_re_not_present(bus_num); 1002 return -VTD_FR_ROOT_ENTRY_P; 1003 } 1004 1005 if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) { 1006 trace_vtd_re_invalid(re.rsvd, re.val); 1007 return -VTD_FR_ROOT_ENTRY_RSVD; 1008 } 1009 1010 ret_fr = vtd_get_context_entry_from_root(&re, devfn, ce); 1011 if (ret_fr) { 1012 return ret_fr; 1013 } 1014 1015 if (!vtd_ce_present(ce)) { 1016 /* Not error - it's okay we don't have context entry. 
*/ 1017 trace_vtd_ce_not_present(bus_num, devfn); 1018 return -VTD_FR_CONTEXT_ENTRY_P; 1019 } 1020 1021 if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || 1022 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1023 trace_vtd_ce_invalid(ce->hi, ce->lo); 1024 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1025 } 1026 1027 /* Check if the programming of context-entry is valid */ 1028 if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1029 trace_vtd_ce_invalid(ce->hi, ce->lo); 1030 return -VTD_FR_CONTEXT_ENTRY_INV; 1031 } 1032 1033 /* Do translation type check */ 1034 if (!vtd_ce_type_check(x86_iommu, ce)) { 1035 trace_vtd_ce_invalid(ce->hi, ce->lo); 1036 return -VTD_FR_CONTEXT_ENTRY_INV; 1037 } 1038 1039 return 0; 1040 } 1041 1042 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, 1043 void *private) 1044 { 1045 memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); 1046 return 0; 1047 } 1048 1049 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1050 VTDContextEntry *ce, 1051 hwaddr addr, hwaddr size) 1052 { 1053 IntelIOMMUState *s = vtd_as->iommu_state; 1054 vtd_page_walk_info info = { 1055 .hook_fn = vtd_sync_shadow_page_hook, 1056 .private = (void *)&vtd_as->iommu, 1057 .notify_unmap = true, 1058 .aw = s->aw_bits, 1059 .as = vtd_as, 1060 .domain_id = VTD_CONTEXT_ENTRY_DID(ce->hi), 1061 }; 1062 1063 return vtd_page_walk(ce, addr, addr + size, &info); 1064 } 1065 1066 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1067 { 1068 int ret; 1069 VTDContextEntry ce; 1070 IOMMUNotifier *n; 1071 1072 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1073 pci_bus_num(vtd_as->bus), 1074 vtd_as->devfn, &ce); 1075 if (ret) { 1076 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1077 /* 1078 * It's a valid scenario to have a context entry that is 1079 * not present. For example, when a device is removed 1080 * from an existing domain then the context entry will be 1081 * zeroed by the guest before it was put into another 1082 * domain. When this happens, instead of synchronizing 1083 * the shadow pages we should invalidate all existing 1084 * mappings and notify the backends. 1085 */ 1086 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 1087 vtd_address_space_unmap(vtd_as, n); 1088 } 1089 ret = 0; 1090 } 1091 return ret; 1092 } 1093 1094 return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); 1095 } 1096 1097 /* 1098 * Fetch translation type for specific device. Returns <0 if error 1099 * happens, otherwise return the shifted type to check against 1100 * VTD_CONTEXT_TT_*. 1101 */ 1102 static int vtd_dev_get_trans_type(VTDAddressSpace *as) 1103 { 1104 IntelIOMMUState *s; 1105 VTDContextEntry ce; 1106 int ret; 1107 1108 s = as->iommu_state; 1109 1110 ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), 1111 as->devfn, &ce); 1112 if (ret) { 1113 return ret; 1114 } 1115 1116 return vtd_ce_get_type(&ce); 1117 } 1118 1119 static bool vtd_dev_pt_enabled(VTDAddressSpace *as) 1120 { 1121 int ret; 1122 1123 assert(as); 1124 1125 ret = vtd_dev_get_trans_type(as); 1126 if (ret < 0) { 1127 /* 1128 * Possibly failed to parse the context entry for some reason 1129 * (e.g., during init, or any guest configuration errors on 1130 * context entries). We should assume PT not enabled for 1131 * safety. 1132 */ 1133 return false; 1134 } 1135 1136 return ret == VTD_CONTEXT_TT_PASS_THROUGH; 1137 } 1138 1139 /* Return whether the device is using IOMMU translation. 
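 * Returns true when the IOMMU memory region is enabled for the device
 * (DMAR on and the context entry is not pass-through), false when DMA
 * bypasses translation via the sys_alias region.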
 */
static bool vtd_switch_address_space(VTDAddressSpace *as)
{
    bool use_iommu;
    /* Whether we need to take the BQL on our own */
    bool take_bql = !qemu_mutex_iothread_locked();

    assert(as);

    use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);

    trace_vtd_switch_address_space(pci_bus_num(as->bus),
                                   VTD_PCI_SLOT(as->devfn),
                                   VTD_PCI_FUNC(as->devfn),
                                   use_iommu);

    /*
     * It's possible that we reach here without BQL, e.g., when called
     * from vtd_pt_enable_fast_path(). However the memory APIs need
     * it. We'd better make sure we have it already, or take it.
     */
    if (take_bql) {
        qemu_mutex_lock_iothread();
    }

    /* Turn off first then on the other */
    if (use_iommu) {
        memory_region_set_enabled(&as->sys_alias, false);
        memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
    } else {
        memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
        memory_region_set_enabled(&as->sys_alias, true);
    }

    if (take_bql) {
        qemu_mutex_unlock_iothread();
    }

    return use_iommu;
}

static void vtd_switch_address_space_all(IntelIOMMUState *s)
{
    GHashTableIter iter;
    VTDBus *vtd_bus;
    int i;

    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
        for (i = 0; i < PCI_DEVFN_MAX; i++) {
            if (!vtd_bus->dev_as[i]) {
                continue;
            }
            vtd_switch_address_space(vtd_bus->dev_as[i]);
        }
    }
}

static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
{
    return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
}

static const bool vtd_qualified_faults[] = {
    [VTD_FR_RESERVED] = false,
    [VTD_FR_ROOT_ENTRY_P] = false,
    [VTD_FR_CONTEXT_ENTRY_P] = true,
    [VTD_FR_CONTEXT_ENTRY_INV] = true,
    [VTD_FR_ADDR_BEYOND_MGAW] = true,
    [VTD_FR_WRITE] = true,
    [VTD_FR_READ] = true,
    [VTD_FR_PAGING_ENTRY_INV] = true,
    [VTD_FR_ROOT_TABLE_INV] = false,
    [VTD_FR_CONTEXT_TABLE_INV] = false,
    [VTD_FR_ROOT_ENTRY_RSVD] = false,
    [VTD_FR_PAGING_ENTRY_RSVD] = true,
    [VTD_FR_CONTEXT_ENTRY_TT] = true,
    [VTD_FR_RESERVED_ERR] = false,
    [VTD_FR_MAX] = false,
};

/* To see if a fault condition is "qualified", which is reported to software
 * only if the FPD field in the context-entry used to process the faulting
 * request is 0.
 */
static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
{
    return vtd_qualified_faults[fault];
}

static inline bool vtd_is_interrupt_addr(hwaddr addr)
{
    return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
}

static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
{
    VTDBus *vtd_bus;
    VTDAddressSpace *vtd_as;
    bool success = false;

    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
    if (!vtd_bus) {
        goto out;
    }

    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
    if (!vtd_as) {
        goto out;
    }

    if (vtd_switch_address_space(vtd_as) == false) {
        /* We switched off the IOMMU region successfully. */
        success = true;
    }

out:
    trace_vtd_pt_enable_fast_path(source_id, success);
}

/* Map dev to context-entry then do a paging-structures walk to do an IOMMU
 * translation.
 *
 * Called from RCU critical section.
 *
 * @bus_num: The bus number
 * @devfn: The devfn, which is the combination of device and function numbers
 * @is_write: The access is a write operation
 * @entry: IOMMUTLBEntry that contains the addr to be translated and result
 *
 * Returns true if translation is successful, otherwise false.
 */
static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                                   uint8_t devfn, hwaddr addr, bool is_write,
                                   IOMMUTLBEntry *entry)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    VTDContextEntry ce;
    uint8_t bus_num = pci_bus_num(bus);
    VTDContextCacheEntry *cc_entry;
    uint64_t slpte, page_mask;
    uint32_t level;
    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
    int ret_fr;
    bool is_fpd_set = false;
    bool reads = true;
    bool writes = true;
    uint8_t access_flags;
    VTDIOTLBEntry *iotlb_entry;

    /*
     * We have a standalone memory region for interrupt addresses, we
     * should never receive translation requests in this region.
     */
    assert(!vtd_is_interrupt_addr(addr));

    vtd_iommu_lock(s);

    cc_entry = &vtd_as->context_cache_entry;

    /* Try to fetch slpte from IOTLB */
    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
    if (iotlb_entry) {
        trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
                                 iotlb_entry->domain_id);
        slpte = iotlb_entry->slpte;
        access_flags = iotlb_entry->access_flags;
        page_mask = iotlb_entry->mask;
        goto out;
    }

    /* Try to fetch context-entry from cache first */
    if (cc_entry->context_cache_gen == s->context_cache_gen) {
        trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
                               cc_entry->context_entry.lo,
                               cc_entry->context_cache_gen);
        ce = cc_entry->context_entry;
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
    } else {
        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
        if (ret_fr) {
            ret_fr = -ret_fr;
            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
                trace_vtd_fault_disabled();
            } else {
                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
            }
            goto error;
        }
        /* Update context-cache */
        trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
                                  cc_entry->context_cache_gen,
                                  s->context_cache_gen);
        cc_entry->context_entry = ce;
        cc_entry->context_cache_gen = s->context_cache_gen;
    }

    /*
     * We don't need to translate for pass-through context entries.
     * Also, let's ignore IOTLB caching as well for PT devices.
     */
    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
        entry->iova = addr & VTD_PAGE_MASK_4K;
        entry->translated_addr = entry->iova;
        entry->addr_mask = ~VTD_PAGE_MASK_4K;
        entry->perm = IOMMU_RW;
        trace_vtd_translate_pt(source_id, entry->iova);

        /*
         * When this happens, it means that caching mode is not
         * enabled, and this is the first passthrough translation for
         * the device. Let's enable the fast path for passthrough.
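         * Enabling the fast path flips the device from the IOMMU
         * memory region over to the sys_alias region, so later DMA
         * skips the translation path entirely.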
         *
         * When passthrough is disabled again for the device, we can
         * capture it via the context entry invalidation, then the
         * IOMMU region can be swapped back.
         */
        vtd_pt_enable_fast_path(s, source_id);
        vtd_iommu_unlock(s);
        return true;
    }

    ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
                               &reads, &writes, s->aw_bits);
    if (ret_fr) {
        ret_fr = -ret_fr;
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
            trace_vtd_fault_disabled();
        } else {
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
        }
        goto error;
    }

    page_mask = vtd_slpt_level_page_mask(level);
    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
                     access_flags, level);
out:
    vtd_iommu_unlock(s);
    entry->iova = addr & page_mask;
    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
    entry->addr_mask = ~page_mask;
    entry->perm = access_flags;
    return true;

error:
    vtd_iommu_unlock(s);
    entry->iova = 0;
    entry->translated_addr = 0;
    entry->addr_mask = 0;
    entry->perm = IOMMU_NONE;
    return false;
}

static void vtd_root_table_setup(IntelIOMMUState *s)
{
    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
    s->root_extended = s->root & VTD_RTADDR_RTT;
    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);

    trace_vtd_reg_dmar_root(s->root, s->root_extended);
}

static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
                               uint32_t index, uint32_t mask)
{
    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
}

static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
{
    uint64_t value = 0;
    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
    s->intr_eime = value & VTD_IRTA_EIME;

    /* Notify global invalidation */
    vtd_iec_notify_all(s, true, 0, 0);

    trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
}

static void vtd_iommu_replay_all(IntelIOMMUState *s)
{
    VTDAddressSpace *vtd_as;

    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
        vtd_sync_shadow_page_table(vtd_as);
    }
}

static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
    trace_vtd_inv_desc_cc_global();
    /* Protects context cache */
    vtd_iommu_lock(s);
    s->context_cache_gen++;
    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
        vtd_reset_context_cache_locked(s);
    }
    vtd_iommu_unlock(s);
    vtd_address_space_refresh_all(s);
    /*
     * From VT-d spec 6.5.2.1, a global context entry invalidation
     * should be followed by an IOTLB global invalidation, so we should
     * be safe even without this. However, let's replay the region as
     * well to be safer, and go back here when we need finer tuning
     * for the VT-d emulation code.
     */
    vtd_iommu_replay_all(s);
}

/* Do a context-cache device-selective invalidation.
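 * The low two bits of the FM field pick how many low function-number bits
 * of the SID to ignore; e.g. a value of 2 masks bits 2:1, so devfns that
 * differ only in those bits are all invalidated.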
 * @func_mask: FM field after shifting
 */
static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                          uint16_t source_id,
                                          uint16_t func_mask)
{
    uint16_t mask;
    VTDBus *vtd_bus;
    VTDAddressSpace *vtd_as;
    uint8_t bus_n, devfn;
    uint16_t devfn_it;

    trace_vtd_inv_desc_cc_devices(source_id, func_mask);

    switch (func_mask & 3) {
    case 0:
        mask = 0;   /* No bits in the SID field masked */
        break;
    case 1:
        mask = 4;   /* Mask bit 2 in the SID field */
        break;
    case 2:
        mask = 6;   /* Mask bits 2:1 in the SID field */
        break;
    case 3:
        mask = 7;   /* Mask bits 2:0 in the SID field */
        break;
    }
    mask = ~mask;

    bus_n = VTD_SID_TO_BUS(source_id);
    vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
    if (vtd_bus) {
        devfn = VTD_SID_TO_DEVFN(source_id);
        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
                                             VTD_PCI_FUNC(devfn_it));
                vtd_iommu_lock(s);
                vtd_as->context_cache_entry.context_cache_gen = 0;
                vtd_iommu_unlock(s);
                /*
                 * Do switch address space when needed, in case the
                 * device passthrough bit is switched.
                 */
                vtd_switch_address_space(vtd_as);
                /*
                 * A device is moving out of (or moving into) a
                 * domain, so resync the shadow page table.
                 * This does no harm even if we have no such
                 * notifier registered - the IOMMU notification
                 * framework will skip MAP notifications if that
                 * happened.
                 */
                vtd_sync_shadow_page_table(vtd_as);
            }
        }
    }
}

/* Context-cache invalidation
 * Returns the Context Actual Invalidation Granularity.
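 * Note that domain-selective requests fall through to a full global
 * invalidation below, and the granularity reported back (CAIG) is
 * global accordingly.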
1518 * @val: the content of the CCMD_REG 1519 */ 1520 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1521 { 1522 uint64_t caig; 1523 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1524 1525 switch (type) { 1526 case VTD_CCMD_DOMAIN_INVL: 1527 /* Fall through */ 1528 case VTD_CCMD_GLOBAL_INVL: 1529 caig = VTD_CCMD_GLOBAL_INVL_A; 1530 vtd_context_global_invalidate(s); 1531 break; 1532 1533 case VTD_CCMD_DEVICE_INVL: 1534 caig = VTD_CCMD_DEVICE_INVL_A; 1535 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1536 break; 1537 1538 default: 1539 error_report_once("%s: invalid context: 0x%" PRIx64, 1540 __func__, val); 1541 caig = 0; 1542 } 1543 return caig; 1544 } 1545 1546 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1547 { 1548 trace_vtd_inv_desc_iotlb_global(); 1549 vtd_reset_iotlb(s); 1550 vtd_iommu_replay_all(s); 1551 } 1552 1553 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1554 { 1555 VTDContextEntry ce; 1556 VTDAddressSpace *vtd_as; 1557 1558 trace_vtd_inv_desc_iotlb_domain(domain_id); 1559 1560 vtd_iommu_lock(s); 1561 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, 1562 &domain_id); 1563 vtd_iommu_unlock(s); 1564 1565 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1566 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1567 vtd_as->devfn, &ce) && 1568 domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { 1569 vtd_sync_shadow_page_table(vtd_as); 1570 } 1571 } 1572 } 1573 1574 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1575 uint16_t domain_id, hwaddr addr, 1576 uint8_t am) 1577 { 1578 VTDAddressSpace *vtd_as; 1579 VTDContextEntry ce; 1580 int ret; 1581 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1582 1583 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1584 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1585 vtd_as->devfn, &ce); 1586 if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { 1587 if (vtd_as_has_map_notifier(vtd_as)) { 1588 /* 1589 * As long as we have MAP notifications registered in 1590 * any of our IOMMU notifiers, we need to sync the 1591 * shadow page table. 1592 */ 1593 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 1594 } else { 1595 /* 1596 * For UNMAP-only notifiers, we don't need to walk the 1597 * page tables. We just deliver the PSI down to 1598 * invalidate caches. 1599 */ 1600 IOMMUTLBEntry entry = { 1601 .target_as = &address_space_memory, 1602 .iova = addr, 1603 .translated_addr = 0, 1604 .addr_mask = size - 1, 1605 .perm = IOMMU_NONE, 1606 }; 1607 memory_region_notify_iommu(&vtd_as->iommu, 0, entry); 1608 } 1609 } 1610 } 1611 } 1612 1613 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 1614 hwaddr addr, uint8_t am) 1615 { 1616 VTDIOTLBPageInvInfo info; 1617 1618 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 1619 1620 assert(am <= VTD_MAMV); 1621 info.domain_id = domain_id; 1622 info.addr = addr; 1623 info.mask = ~((1 << am) - 1); 1624 vtd_iommu_lock(s); 1625 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 1626 vtd_iommu_unlock(s); 1627 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 1628 } 1629 1630 /* Flush IOTLB 1631 * Returns the IOTLB Actual Invalidation Granularity. 
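 * For page-selective requests the target address and address mask (AM)
 * come from DMAR_IVA_REG; an AM of 1 covers two 4K pages, and any AM
 * above VTD_MAMV is rejected below with a zero IAIG.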
1632 * @val: the content of the IOTLB_REG 1633 */ 1634 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 1635 { 1636 uint64_t iaig; 1637 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 1638 uint16_t domain_id; 1639 hwaddr addr; 1640 uint8_t am; 1641 1642 switch (type) { 1643 case VTD_TLB_GLOBAL_FLUSH: 1644 iaig = VTD_TLB_GLOBAL_FLUSH_A; 1645 vtd_iotlb_global_invalidate(s); 1646 break; 1647 1648 case VTD_TLB_DSI_FLUSH: 1649 domain_id = VTD_TLB_DID(val); 1650 iaig = VTD_TLB_DSI_FLUSH_A; 1651 vtd_iotlb_domain_invalidate(s, domain_id); 1652 break; 1653 1654 case VTD_TLB_PSI_FLUSH: 1655 domain_id = VTD_TLB_DID(val); 1656 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 1657 am = VTD_IVA_AM(addr); 1658 addr = VTD_IVA_ADDR(addr); 1659 if (am > VTD_MAMV) { 1660 error_report_once("%s: address mask overflow: 0x%" PRIx64, 1661 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 1662 iaig = 0; 1663 break; 1664 } 1665 iaig = VTD_TLB_PSI_FLUSH_A; 1666 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 1667 break; 1668 1669 default: 1670 error_report_once("%s: invalid granularity: 0x%" PRIx64, 1671 __func__, val); 1672 iaig = 0; 1673 } 1674 return iaig; 1675 } 1676 1677 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 1678 1679 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 1680 { 1681 return s->qi_enabled && (s->iq_tail == s->iq_head) && 1682 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 1683 } 1684 1685 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 1686 { 1687 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 1688 1689 trace_vtd_inv_qi_enable(en); 1690 1691 if (en) { 1692 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 1693 /* 2^(x+8) entries */ 1694 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8); 1695 s->qi_enabled = true; 1696 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 1697 /* Ok - report back to driver */ 1698 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 1699 1700 if (s->iq_tail != 0) { 1701 /* 1702 * This is a spec violation but Windows guests are known to set up 1703 * Queued Invalidation this way so we allow the write and process 1704 * Invalidation Descriptors right away. 
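             * Descriptors are only fetched right away if no Invalidation
             * Queue Error is already pending (the FSTS.IQE check below).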
1705 */ 1706 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 1707 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 1708 vtd_fetch_inv_desc(s); 1709 } 1710 } 1711 } else { 1712 if (vtd_queued_inv_disable_check(s)) { 1713 /* disable Queued Invalidation */ 1714 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 1715 s->iq_head = 0; 1716 s->qi_enabled = false; 1717 /* Ok - report back to driver */ 1718 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 1719 } else { 1720 error_report_once("%s: detected improper state when disable QI " 1721 "(head=0x%x, tail=0x%x, last_type=%d)", 1722 __func__, 1723 s->iq_head, s->iq_tail, s->iq_last_desc_type); 1724 } 1725 } 1726 } 1727 1728 /* Set Root Table Pointer */ 1729 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 1730 { 1731 vtd_root_table_setup(s); 1732 /* Ok - report back to driver */ 1733 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 1734 vtd_reset_caches(s); 1735 vtd_address_space_refresh_all(s); 1736 } 1737 1738 /* Set Interrupt Remap Table Pointer */ 1739 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 1740 { 1741 vtd_interrupt_remap_table_setup(s); 1742 /* Ok - report back to driver */ 1743 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 1744 } 1745 1746 /* Handle Translation Enable/Disable */ 1747 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 1748 { 1749 if (s->dmar_enabled == en) { 1750 return; 1751 } 1752 1753 trace_vtd_dmar_enable(en); 1754 1755 if (en) { 1756 s->dmar_enabled = true; 1757 /* Ok - report back to driver */ 1758 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 1759 } else { 1760 s->dmar_enabled = false; 1761 1762 /* Clear the index of Fault Recording Register */ 1763 s->next_frcd_reg = 0; 1764 /* Ok - report back to driver */ 1765 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 1766 } 1767 1768 vtd_reset_caches(s); 1769 vtd_address_space_refresh_all(s); 1770 } 1771 1772 /* Handle Interrupt Remap Enable/Disable */ 1773 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 1774 { 1775 trace_vtd_ir_enable(en); 1776 1777 if (en) { 1778 s->intr_enabled = true; 1779 /* Ok - report back to driver */ 1780 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 1781 } else { 1782 s->intr_enabled = false; 1783 /* Ok - report back to driver */ 1784 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 1785 } 1786 } 1787 1788 /* Handle write to Global Command Register */ 1789 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 1790 { 1791 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 1792 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 1793 uint32_t changed = status ^ val; 1794 1795 trace_vtd_reg_write_gcmd(status, val); 1796 if (changed & VTD_GCMD_TE) { 1797 /* Translation enable/disable */ 1798 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 1799 } 1800 if (val & VTD_GCMD_SRTP) { 1801 /* Set/update the root-table pointer */ 1802 vtd_handle_gcmd_srtp(s); 1803 } 1804 if (changed & VTD_GCMD_QIE) { 1805 /* Queued Invalidation Enable */ 1806 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 1807 } 1808 if (val & VTD_GCMD_SIRTP) { 1809 /* Set/update the interrupt remapping root-table pointer */ 1810 vtd_handle_gcmd_sirtp(s); 1811 } 1812 if (changed & VTD_GCMD_IRE) { 1813 /* Interrupt remap enable/disable */ 1814 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 1815 } 1816 } 1817 1818 /* Handle write to Context Command Register */ 1819 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 1820 { 1821 uint64_t ret; 1822 uint64_t val = 
vtd_get_quad_raw(s, DMAR_CCMD_REG); 1823 1824 /* Context-cache invalidation request */ 1825 if (val & VTD_CCMD_ICC) { 1826 if (s->qi_enabled) { 1827 error_report_once("Queued Invalidation enabled, " 1828 "should not use register-based invalidation"); 1829 return; 1830 } 1831 ret = vtd_context_cache_invalidate(s, val); 1832 /* Invalidation completed. Change something to show */ 1833 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 1834 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 1835 ret); 1836 } 1837 } 1838 1839 /* Handle write to IOTLB Invalidation Register */ 1840 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 1841 { 1842 uint64_t ret; 1843 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 1844 1845 /* IOTLB invalidation request */ 1846 if (val & VTD_TLB_IVT) { 1847 if (s->qi_enabled) { 1848 error_report_once("Queued Invalidation enabled, " 1849 "should not use register-based invalidation"); 1850 return; 1851 } 1852 ret = vtd_iotlb_flush(s, val); 1853 /* Invalidation completed. Change something to show */ 1854 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 1855 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 1856 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 1857 } 1858 } 1859 1860 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 1861 static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset, 1862 VTDInvDesc *inv_desc) 1863 { 1864 dma_addr_t addr = base_addr + offset * sizeof(*inv_desc); 1865 if (dma_memory_read(&address_space_memory, addr, inv_desc, 1866 sizeof(*inv_desc))) { 1867 error_report_once("Read INV DESC failed"); 1868 inv_desc->lo = 0; 1869 inv_desc->hi = 0; 1870 return false; 1871 } 1872 inv_desc->lo = le64_to_cpu(inv_desc->lo); 1873 inv_desc->hi = le64_to_cpu(inv_desc->hi); 1874 return true; 1875 } 1876 1877 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 1878 { 1879 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 1880 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 1881 trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); 1882 return false; 1883 } 1884 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 1885 /* Status Write */ 1886 uint32_t status_data = (uint32_t)(inv_desc->lo >> 1887 VTD_INV_DESC_WAIT_DATA_SHIFT); 1888 1889 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 1890 1891 /* FIXME: need to be masked with HAW? 
*/ 1892 dma_addr_t status_addr = inv_desc->hi; 1893 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 1894 status_data = cpu_to_le32(status_data); 1895 if (dma_memory_write(&address_space_memory, status_addr, &status_data, 1896 sizeof(status_data))) { 1897 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 1898 return false; 1899 } 1900 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 1901 /* Interrupt flag */ 1902 vtd_generate_completion_event(s); 1903 } else { 1904 trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); 1905 return false; 1906 } 1907 return true; 1908 } 1909 1910 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 1911 VTDInvDesc *inv_desc) 1912 { 1913 uint16_t sid, fmask; 1914 1915 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 1916 trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); 1917 return false; 1918 } 1919 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 1920 case VTD_INV_DESC_CC_DOMAIN: 1921 trace_vtd_inv_desc_cc_domain( 1922 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 1923 /* Fall through */ 1924 case VTD_INV_DESC_CC_GLOBAL: 1925 vtd_context_global_invalidate(s); 1926 break; 1927 1928 case VTD_INV_DESC_CC_DEVICE: 1929 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 1930 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 1931 vtd_context_device_invalidate(s, sid, fmask); 1932 break; 1933 1934 default: 1935 trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); 1936 return false; 1937 } 1938 return true; 1939 } 1940 1941 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 1942 { 1943 uint16_t domain_id; 1944 uint8_t am; 1945 hwaddr addr; 1946 1947 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 1948 (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 1949 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1950 return false; 1951 } 1952 1953 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 1954 case VTD_INV_DESC_IOTLB_GLOBAL: 1955 vtd_iotlb_global_invalidate(s); 1956 break; 1957 1958 case VTD_INV_DESC_IOTLB_DOMAIN: 1959 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 1960 vtd_iotlb_domain_invalidate(s, domain_id); 1961 break; 1962 1963 case VTD_INV_DESC_IOTLB_PAGE: 1964 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 1965 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 1966 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 1967 if (am > VTD_MAMV) { 1968 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1969 return false; 1970 } 1971 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 1972 break; 1973 1974 default: 1975 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1976 return false; 1977 } 1978 return true; 1979 } 1980 1981 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 1982 VTDInvDesc *inv_desc) 1983 { 1984 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 1985 inv_desc->iec.index, 1986 inv_desc->iec.index_mask); 1987 1988 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 1989 inv_desc->iec.index, 1990 inv_desc->iec.index_mask); 1991 return true; 1992 } 1993 1994 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 1995 VTDInvDesc *inv_desc) 1996 { 1997 VTDAddressSpace *vtd_dev_as; 1998 IOMMUTLBEntry entry; 1999 struct VTDBus *vtd_bus; 2000 hwaddr addr; 2001 uint64_t sz; 2002 uint16_t sid; 2003 uint8_t devfn; 2004 bool size; 2005 uint8_t bus_num; 2006 2007 addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2008 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2009 devfn = sid & 0xff; 2010 bus_num = sid >> 8; 2011 size = 
VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2012 2013 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2014 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2015 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 2016 return false; 2017 } 2018 2019 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2020 if (!vtd_bus) { 2021 goto done; 2022 } 2023 2024 vtd_dev_as = vtd_bus->dev_as[devfn]; 2025 if (!vtd_dev_as) { 2026 goto done; 2027 } 2028 2029 /* According to ATS spec table 2.4: 2030 * S = 0, bits 15:12 = xxxx range size: 4K 2031 * S = 1, bits 15:12 = xxx0 range size: 8K 2032 * S = 1, bits 15:12 = xx01 range size: 16K 2033 * S = 1, bits 15:12 = x011 range size: 32K 2034 * S = 1, bits 15:12 = 0111 range size: 64K 2035 * ... 2036 */ 2037 if (size) { 2038 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2039 addr &= ~(sz - 1); 2040 } else { 2041 sz = VTD_PAGE_SIZE; 2042 } 2043 2044 entry.target_as = &vtd_dev_as->as; 2045 entry.addr_mask = sz - 1; 2046 entry.iova = addr; 2047 entry.perm = IOMMU_NONE; 2048 entry.translated_addr = 0; 2049 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); 2050 2051 done: 2052 return true; 2053 } 2054 2055 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2056 { 2057 VTDInvDesc inv_desc; 2058 uint8_t desc_type; 2059 2060 trace_vtd_inv_qi_head(s->iq_head); 2061 if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) { 2062 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2063 return false; 2064 } 2065 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2066 /* FIXME: should update at first or at last? */ 2067 s->iq_last_desc_type = desc_type; 2068 2069 switch (desc_type) { 2070 case VTD_INV_DESC_CC: 2071 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2072 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2073 return false; 2074 } 2075 break; 2076 2077 case VTD_INV_DESC_IOTLB: 2078 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2079 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2080 return false; 2081 } 2082 break; 2083 2084 case VTD_INV_DESC_WAIT: 2085 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2086 if (!vtd_process_wait_desc(s, &inv_desc)) { 2087 return false; 2088 } 2089 break; 2090 2091 case VTD_INV_DESC_IEC: 2092 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2093 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2094 return false; 2095 } 2096 break; 2097 2098 case VTD_INV_DESC_DEVICE: 2099 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2100 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2101 return false; 2102 } 2103 break; 2104 2105 default: 2106 trace_vtd_inv_desc_invalid(inv_desc.hi, inv_desc.lo); 2107 return false; 2108 } 2109 s->iq_head++; 2110 if (s->iq_head == s->iq_size) { 2111 s->iq_head = 0; 2112 } 2113 return true; 2114 } 2115 2116 /* Try to fetch and process more Invalidation Descriptors */ 2117 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2118 { 2119 trace_vtd_inv_qi_fetch(); 2120 2121 if (s->iq_tail >= s->iq_size) { 2122 /* Detects an invalid Tail pointer */ 2123 error_report_once("%s: detected invalid QI tail " 2124 "(tail=0x%x, size=0x%x)", 2125 __func__, s->iq_tail, s->iq_size); 2126 vtd_handle_inv_queue_error(s); 2127 return; 2128 } 2129 while (s->iq_head != s->iq_tail) { 2130 if (!vtd_process_inv_desc(s)) { 2131 /* Invalidation Queue Errors */ 2132 vtd_handle_inv_queue_error(s); 2133 break; 2134 } 2135 /* Must update the IQH_REG in time */ 2136 vtd_set_quad_raw(s, DMAR_IQH_REG, 2137 (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) & 2138 VTD_IQH_QH_MASK); 
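        /*
         * Editor's note (worked example, not from the spec text): IQH_REG
         * publishes the byte offset of the next descriptor to fetch, so
         * with the 16-byte descriptors modelled by VTDInvDesc a head index
         * of, say, 3 lands in bits 18:4 as 3 * 16 = 0x30, with
         * VTD_IQH_QH_MASK keeping the reserved bits zero.
         */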
2139 } 2140 } 2141 2142 /* Handle write to Invalidation Queue Tail Register */ 2143 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2144 { 2145 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2146 2147 s->iq_tail = VTD_IQT_QT(val); 2148 trace_vtd_inv_qi_tail(s->iq_tail); 2149 2150 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2151 /* Process Invalidation Queue here */ 2152 vtd_fetch_inv_desc(s); 2153 } 2154 } 2155 2156 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2157 { 2158 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2159 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2160 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2161 2162 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2163 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2164 trace_vtd_fsts_clear_ip(); 2165 } 2166 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2167 * Descriptors if there are any when Queued Invalidation is enabled? 2168 */ 2169 } 2170 2171 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2172 { 2173 uint32_t fectl_reg; 2174 /* FIXME: when software clears the IM field, check the IP field. But do we 2175 * need to compare the old value and the new value to conclude that 2176 * software clears the IM field? Or just check if the IM field is zero? 2177 */ 2178 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2179 2180 trace_vtd_reg_write_fectl(fectl_reg); 2181 2182 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2183 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2184 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2185 } 2186 } 2187 2188 static void vtd_handle_ics_write(IntelIOMMUState *s) 2189 { 2190 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2191 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2192 2193 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2194 trace_vtd_reg_ics_clear_ip(); 2195 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2196 } 2197 } 2198 2199 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2200 { 2201 uint32_t iectl_reg; 2202 /* FIXME: when software clears the IM field, check the IP field. But do we 2203 * need to compare the old value and the new value to conclude that 2204 * software clears the IM field? Or just check if the IM field is zero? 
2205 */ 2206 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2207 2208 trace_vtd_reg_write_iectl(iectl_reg); 2209 2210 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2211 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2212 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2213 } 2214 } 2215 2216 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2217 { 2218 IntelIOMMUState *s = opaque; 2219 uint64_t val; 2220 2221 trace_vtd_reg_read(addr, size); 2222 2223 if (addr + size > DMAR_REG_SIZE) { 2224 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2225 " size=0x%u", __func__, addr, size); 2226 return (uint64_t)-1; 2227 } 2228 2229 switch (addr) { 2230 /* Root Table Address Register, 64-bit */ 2231 case DMAR_RTADDR_REG: 2232 if (size == 4) { 2233 val = s->root & ((1ULL << 32) - 1); 2234 } else { 2235 val = s->root; 2236 } 2237 break; 2238 2239 case DMAR_RTADDR_REG_HI: 2240 assert(size == 4); 2241 val = s->root >> 32; 2242 break; 2243 2244 /* Invalidation Queue Address Register, 64-bit */ 2245 case DMAR_IQA_REG: 2246 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2247 if (size == 4) { 2248 val = val & ((1ULL << 32) - 1); 2249 } 2250 break; 2251 2252 case DMAR_IQA_REG_HI: 2253 assert(size == 4); 2254 val = s->iq >> 32; 2255 break; 2256 2257 default: 2258 if (size == 4) { 2259 val = vtd_get_long(s, addr); 2260 } else { 2261 val = vtd_get_quad(s, addr); 2262 } 2263 } 2264 2265 return val; 2266 } 2267 2268 static void vtd_mem_write(void *opaque, hwaddr addr, 2269 uint64_t val, unsigned size) 2270 { 2271 IntelIOMMUState *s = opaque; 2272 2273 trace_vtd_reg_write(addr, size, val); 2274 2275 if (addr + size > DMAR_REG_SIZE) { 2276 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2277 " size=0x%u", __func__, addr, size); 2278 return; 2279 } 2280 2281 switch (addr) { 2282 /* Global Command Register, 32-bit */ 2283 case DMAR_GCMD_REG: 2284 vtd_set_long(s, addr, val); 2285 vtd_handle_gcmd_write(s); 2286 break; 2287 2288 /* Context Command Register, 64-bit */ 2289 case DMAR_CCMD_REG: 2290 if (size == 4) { 2291 vtd_set_long(s, addr, val); 2292 } else { 2293 vtd_set_quad(s, addr, val); 2294 vtd_handle_ccmd_write(s); 2295 } 2296 break; 2297 2298 case DMAR_CCMD_REG_HI: 2299 assert(size == 4); 2300 vtd_set_long(s, addr, val); 2301 vtd_handle_ccmd_write(s); 2302 break; 2303 2304 /* IOTLB Invalidation Register, 64-bit */ 2305 case DMAR_IOTLB_REG: 2306 if (size == 4) { 2307 vtd_set_long(s, addr, val); 2308 } else { 2309 vtd_set_quad(s, addr, val); 2310 vtd_handle_iotlb_write(s); 2311 } 2312 break; 2313 2314 case DMAR_IOTLB_REG_HI: 2315 assert(size == 4); 2316 vtd_set_long(s, addr, val); 2317 vtd_handle_iotlb_write(s); 2318 break; 2319 2320 /* Invalidate Address Register, 64-bit */ 2321 case DMAR_IVA_REG: 2322 if (size == 4) { 2323 vtd_set_long(s, addr, val); 2324 } else { 2325 vtd_set_quad(s, addr, val); 2326 } 2327 break; 2328 2329 case DMAR_IVA_REG_HI: 2330 assert(size == 4); 2331 vtd_set_long(s, addr, val); 2332 break; 2333 2334 /* Fault Status Register, 32-bit */ 2335 case DMAR_FSTS_REG: 2336 assert(size == 4); 2337 vtd_set_long(s, addr, val); 2338 vtd_handle_fsts_write(s); 2339 break; 2340 2341 /* Fault Event Control Register, 32-bit */ 2342 case DMAR_FECTL_REG: 2343 assert(size == 4); 2344 vtd_set_long(s, addr, val); 2345 vtd_handle_fectl_write(s); 2346 break; 2347 2348 /* Fault Event Data Register, 32-bit */ 2349 case DMAR_FEDATA_REG: 2350 assert(size == 4); 2351 vtd_set_long(s, addr, val); 2352 break; 2353 2354 /* 
Fault Event Address Register, 32-bit */ 2355 case DMAR_FEADDR_REG: 2356 if (size == 4) { 2357 vtd_set_long(s, addr, val); 2358 } else { 2359 /* 2360 * While the register is 32-bit only, some guests (Xen...) write to 2361 * it with 64-bit. 2362 */ 2363 vtd_set_quad(s, addr, val); 2364 } 2365 break; 2366 2367 /* Fault Event Upper Address Register, 32-bit */ 2368 case DMAR_FEUADDR_REG: 2369 assert(size == 4); 2370 vtd_set_long(s, addr, val); 2371 break; 2372 2373 /* Protected Memory Enable Register, 32-bit */ 2374 case DMAR_PMEN_REG: 2375 assert(size == 4); 2376 vtd_set_long(s, addr, val); 2377 break; 2378 2379 /* Root Table Address Register, 64-bit */ 2380 case DMAR_RTADDR_REG: 2381 if (size == 4) { 2382 vtd_set_long(s, addr, val); 2383 } else { 2384 vtd_set_quad(s, addr, val); 2385 } 2386 break; 2387 2388 case DMAR_RTADDR_REG_HI: 2389 assert(size == 4); 2390 vtd_set_long(s, addr, val); 2391 break; 2392 2393 /* Invalidation Queue Tail Register, 64-bit */ 2394 case DMAR_IQT_REG: 2395 if (size == 4) { 2396 vtd_set_long(s, addr, val); 2397 } else { 2398 vtd_set_quad(s, addr, val); 2399 } 2400 vtd_handle_iqt_write(s); 2401 break; 2402 2403 case DMAR_IQT_REG_HI: 2404 assert(size == 4); 2405 vtd_set_long(s, addr, val); 2406 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2407 break; 2408 2409 /* Invalidation Queue Address Register, 64-bit */ 2410 case DMAR_IQA_REG: 2411 if (size == 4) { 2412 vtd_set_long(s, addr, val); 2413 } else { 2414 vtd_set_quad(s, addr, val); 2415 } 2416 break; 2417 2418 case DMAR_IQA_REG_HI: 2419 assert(size == 4); 2420 vtd_set_long(s, addr, val); 2421 break; 2422 2423 /* Invalidation Completion Status Register, 32-bit */ 2424 case DMAR_ICS_REG: 2425 assert(size == 4); 2426 vtd_set_long(s, addr, val); 2427 vtd_handle_ics_write(s); 2428 break; 2429 2430 /* Invalidation Event Control Register, 32-bit */ 2431 case DMAR_IECTL_REG: 2432 assert(size == 4); 2433 vtd_set_long(s, addr, val); 2434 vtd_handle_iectl_write(s); 2435 break; 2436 2437 /* Invalidation Event Data Register, 32-bit */ 2438 case DMAR_IEDATA_REG: 2439 assert(size == 4); 2440 vtd_set_long(s, addr, val); 2441 break; 2442 2443 /* Invalidation Event Address Register, 32-bit */ 2444 case DMAR_IEADDR_REG: 2445 assert(size == 4); 2446 vtd_set_long(s, addr, val); 2447 break; 2448 2449 /* Invalidation Event Upper Address Register, 32-bit */ 2450 case DMAR_IEUADDR_REG: 2451 assert(size == 4); 2452 vtd_set_long(s, addr, val); 2453 break; 2454 2455 /* Fault Recording Registers, 128-bit */ 2456 case DMAR_FRCD_REG_0_0: 2457 if (size == 4) { 2458 vtd_set_long(s, addr, val); 2459 } else { 2460 vtd_set_quad(s, addr, val); 2461 } 2462 break; 2463 2464 case DMAR_FRCD_REG_0_1: 2465 assert(size == 4); 2466 vtd_set_long(s, addr, val); 2467 break; 2468 2469 case DMAR_FRCD_REG_0_2: 2470 if (size == 4) { 2471 vtd_set_long(s, addr, val); 2472 } else { 2473 vtd_set_quad(s, addr, val); 2474 /* May clear bit 127 (Fault), update PPF */ 2475 vtd_update_fsts_ppf(s); 2476 } 2477 break; 2478 2479 case DMAR_FRCD_REG_0_3: 2480 assert(size == 4); 2481 vtd_set_long(s, addr, val); 2482 /* May clear bit 127 (Fault), update PPF */ 2483 vtd_update_fsts_ppf(s); 2484 break; 2485 2486 case DMAR_IRTA_REG: 2487 if (size == 4) { 2488 vtd_set_long(s, addr, val); 2489 } else { 2490 vtd_set_quad(s, addr, val); 2491 } 2492 break; 2493 2494 case DMAR_IRTA_REG_HI: 2495 assert(size == 4); 2496 vtd_set_long(s, addr, val); 2497 break; 2498 2499 default: 2500 if (size == 4) { 2501 vtd_set_long(s, addr, val); 2502 } else { 2503 vtd_set_quad(s, addr, val); 2504 } 2505 } 
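    /*
     * Editor's illustration (assumed guest access pattern): a guest that
     * programs a 64-bit register such as DMAR_IQT_REG with two 32-bit
     * writes first hits the low-half case above (which already calls
     * vtd_handle_iqt_write()) and then DMAR_IQT_REG_HI, whose extra bits
     * are RsvdZ here and therefore only stored.
     */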
2506 } 2507 2508 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2509 IOMMUAccessFlags flag, int iommu_idx) 2510 { 2511 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2512 IntelIOMMUState *s = vtd_as->iommu_state; 2513 IOMMUTLBEntry iotlb = { 2514 /* We'll fill in the rest later. */ 2515 .target_as = &address_space_memory, 2516 }; 2517 bool success; 2518 2519 if (likely(s->dmar_enabled)) { 2520 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2521 addr, flag & IOMMU_WO, &iotlb); 2522 } else { 2523 /* DMAR disabled, passthrough, use 4k-page*/ 2524 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2525 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2526 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2527 iotlb.perm = IOMMU_RW; 2528 success = true; 2529 } 2530 2531 if (likely(success)) { 2532 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 2533 VTD_PCI_SLOT(vtd_as->devfn), 2534 VTD_PCI_FUNC(vtd_as->devfn), 2535 iotlb.iova, iotlb.translated_addr, 2536 iotlb.addr_mask); 2537 } else { 2538 error_report_once("%s: detected translation failure " 2539 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 2540 __func__, pci_bus_num(vtd_as->bus), 2541 VTD_PCI_SLOT(vtd_as->devfn), 2542 VTD_PCI_FUNC(vtd_as->devfn), 2543 iotlb.iova); 2544 } 2545 2546 return iotlb; 2547 } 2548 2549 static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 2550 IOMMUNotifierFlag old, 2551 IOMMUNotifierFlag new) 2552 { 2553 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2554 IntelIOMMUState *s = vtd_as->iommu_state; 2555 2556 if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { 2557 error_report("We need to set caching-mode=1 for intel-iommu to enable " 2558 "device assignment with IOMMU protection."); 2559 exit(1); 2560 } 2561 2562 /* Update per-address-space notifier flags */ 2563 vtd_as->notifier_flags = new; 2564 2565 if (old == IOMMU_NOTIFIER_NONE) { 2566 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 2567 } else if (new == IOMMU_NOTIFIER_NONE) { 2568 QLIST_REMOVE(vtd_as, next); 2569 } 2570 } 2571 2572 static int vtd_post_load(void *opaque, int version_id) 2573 { 2574 IntelIOMMUState *iommu = opaque; 2575 2576 /* 2577 * Memory regions are dynamically turned on/off depending on 2578 * context entry configurations from the guest. After migration, 2579 * we need to make sure the memory regions are still correct. 
2580 */ 2581 vtd_switch_address_space_all(iommu); 2582 2583 return 0; 2584 } 2585 2586 static const VMStateDescription vtd_vmstate = { 2587 .name = "iommu-intel", 2588 .version_id = 1, 2589 .minimum_version_id = 1, 2590 .priority = MIG_PRI_IOMMU, 2591 .post_load = vtd_post_load, 2592 .fields = (VMStateField[]) { 2593 VMSTATE_UINT64(root, IntelIOMMUState), 2594 VMSTATE_UINT64(intr_root, IntelIOMMUState), 2595 VMSTATE_UINT64(iq, IntelIOMMUState), 2596 VMSTATE_UINT32(intr_size, IntelIOMMUState), 2597 VMSTATE_UINT16(iq_head, IntelIOMMUState), 2598 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 2599 VMSTATE_UINT16(iq_size, IntelIOMMUState), 2600 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 2601 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 2602 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 2603 VMSTATE_BOOL(root_extended, IntelIOMMUState), 2604 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 2605 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 2606 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 2607 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 2608 VMSTATE_END_OF_LIST() 2609 } 2610 }; 2611 2612 static const MemoryRegionOps vtd_mem_ops = { 2613 .read = vtd_mem_read, 2614 .write = vtd_mem_write, 2615 .endianness = DEVICE_LITTLE_ENDIAN, 2616 .impl = { 2617 .min_access_size = 4, 2618 .max_access_size = 8, 2619 }, 2620 .valid = { 2621 .min_access_size = 4, 2622 .max_access_size = 8, 2623 }, 2624 }; 2625 2626 static Property vtd_properties[] = { 2627 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 2628 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 2629 ON_OFF_AUTO_AUTO), 2630 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 2631 DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits, 2632 VTD_HOST_ADDRESS_WIDTH), 2633 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 2634 DEFINE_PROP_END_OF_LIST(), 2635 }; 2636 2637 /* Read IRTE entry with specific index */ 2638 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 2639 VTD_IR_TableEntry *entry, uint16_t sid) 2640 { 2641 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 2642 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 2643 dma_addr_t addr = 0x00; 2644 uint16_t mask, source_id; 2645 uint8_t bus, bus_max, bus_min; 2646 2647 addr = iommu->intr_root + index * sizeof(*entry); 2648 if (dma_memory_read(&address_space_memory, addr, entry, 2649 sizeof(*entry))) { 2650 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 2651 __func__, index, addr); 2652 return -VTD_FR_IR_ROOT_INVAL; 2653 } 2654 2655 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 2656 le64_to_cpu(entry->data[0])); 2657 2658 if (!entry->irte.present) { 2659 error_report_once("%s: detected non-present IRTE " 2660 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 2661 __func__, index, le64_to_cpu(entry->data[1]), 2662 le64_to_cpu(entry->data[0])); 2663 return -VTD_FR_IR_ENTRY_P; 2664 } 2665 2666 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 2667 entry->irte.__reserved_2) { 2668 error_report_once("%s: detected non-zero reserved IRTE " 2669 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 2670 __func__, index, le64_to_cpu(entry->data[1]), 2671 le64_to_cpu(entry->data[0])); 2672 return -VTD_FR_IR_IRTE_RSVD; 2673 } 2674 2675 if (sid != X86_IOMMU_SID_INVALID) { 2676 /* Validate IRTE SID */ 2677 source_id = le32_to_cpu(entry->irte.source_id); 2678 switch (entry->irte.sid_vtype) { 2679 case VTD_SVT_NONE: 2680 break; 2681 2682 case VTD_SVT_ALL: 2683 mask = vtd_svt_mask[entry->irte.sid_q]; 2684 
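            /*
             * Worked example (editor's note, based on the vtd_svt_mask
             * table above): sid_q == 1 keeps 0xfffb, ignoring only the
             * most-significant bit of the function number, while
             * sid_q == 3 keeps 0xfff8, so the whole 3-bit function number
             * is ignored and any function of the requesting device matches.
             */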
if ((source_id & mask) != (sid & mask)) { 2685 error_report_once("%s: invalid IRTE SID " 2686 "(index=%u, sid=%u, source_id=%u)", 2687 __func__, index, sid, source_id); 2688 return -VTD_FR_IR_SID_ERR; 2689 } 2690 break; 2691 2692 case VTD_SVT_BUS: 2693 bus_max = source_id >> 8; 2694 bus_min = source_id & 0xff; 2695 bus = sid >> 8; 2696 if (bus > bus_max || bus < bus_min) { 2697 error_report_once("%s: invalid SVT_BUS " 2698 "(index=%u, bus=%u, min=%u, max=%u)", 2699 __func__, index, bus, bus_min, bus_max); 2700 return -VTD_FR_IR_SID_ERR; 2701 } 2702 break; 2703 2704 default: 2705 error_report_once("%s: detected invalid IRTE SVT " 2706 "(index=%u, type=%d)", __func__, 2707 index, entry->irte.sid_vtype); 2708 /* Take this as verification failure. */ 2709 return -VTD_FR_IR_SID_ERR; 2710 break; 2711 } 2712 } 2713 2714 return 0; 2715 } 2716 2717 /* Fetch IRQ information of specific IR index */ 2718 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 2719 X86IOMMUIrq *irq, uint16_t sid) 2720 { 2721 VTD_IR_TableEntry irte = {}; 2722 int ret = 0; 2723 2724 ret = vtd_irte_get(iommu, index, &irte, sid); 2725 if (ret) { 2726 return ret; 2727 } 2728 2729 irq->trigger_mode = irte.irte.trigger_mode; 2730 irq->vector = irte.irte.vector; 2731 irq->delivery_mode = irte.irte.delivery_mode; 2732 irq->dest = le32_to_cpu(irte.irte.dest_id); 2733 if (!iommu->intr_eime) { 2734 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 2735 #define VTD_IR_APIC_DEST_SHIFT (8) 2736 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 2737 VTD_IR_APIC_DEST_SHIFT; 2738 } 2739 irq->dest_mode = irte.irte.dest_mode; 2740 irq->redir_hint = irte.irte.redir_hint; 2741 2742 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 2743 irq->delivery_mode, irq->dest, irq->dest_mode); 2744 2745 return 0; 2746 } 2747 2748 /* Interrupt remapping for MSI/MSI-X entry */ 2749 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 2750 MSIMessage *origin, 2751 MSIMessage *translated, 2752 uint16_t sid) 2753 { 2754 int ret = 0; 2755 VTD_IR_MSIAddress addr; 2756 uint16_t index; 2757 X86IOMMUIrq irq = {}; 2758 2759 assert(origin && translated); 2760 2761 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 2762 2763 if (!iommu || !iommu->intr_enabled) { 2764 memcpy(translated, origin, sizeof(*origin)); 2765 goto out; 2766 } 2767 2768 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 2769 error_report_once("%s: MSI address high 32 bits non-zero detected: " 2770 "address=0x%" PRIx64, __func__, origin->address); 2771 return -VTD_FR_IR_REQ_RSVD; 2772 } 2773 2774 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 2775 if (addr.addr.__head != 0xfee) { 2776 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 2777 __func__, addr.data); 2778 return -VTD_FR_IR_REQ_RSVD; 2779 } 2780 2781 /* This is compatible mode. 
*/ 2782 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 2783 memcpy(translated, origin, sizeof(*origin)); 2784 goto out; 2785 } 2786 2787 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 2788 2789 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 2790 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 2791 2792 if (addr.addr.sub_valid) { 2793 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 2794 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 2795 } 2796 2797 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 2798 if (ret) { 2799 return ret; 2800 } 2801 2802 if (addr.addr.sub_valid) { 2803 trace_vtd_ir_remap_type("MSI"); 2804 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 2805 error_report_once("%s: invalid IR MSI " 2806 "(sid=%u, address=0x%" PRIx64 2807 ", data=0x%" PRIx32 ")", 2808 __func__, sid, origin->address, origin->data); 2809 return -VTD_FR_IR_REQ_RSVD; 2810 } 2811 } else { 2812 uint8_t vector = origin->data & 0xff; 2813 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 2814 2815 trace_vtd_ir_remap_type("IOAPIC"); 2816 /* IOAPIC entry vector should be aligned with IRTE vector 2817 * (see vt-d spec 5.1.5.1). */ 2818 if (vector != irq.vector) { 2819 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 2820 } 2821 2822 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 2823 * (see vt-d spec 5.1.5.1). */ 2824 if (trigger_mode != irq.trigger_mode) { 2825 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 2826 irq.trigger_mode); 2827 } 2828 } 2829 2830 /* 2831 * We'd better keep the last two bits, assuming that guest OS 2832 * might modify it. Keep it does not hurt after all. 2833 */ 2834 irq.msi_addr_last_bits = addr.addr.__not_care; 2835 2836 /* Translate X86IOMMUIrq to MSI message */ 2837 x86_iommu_irq_to_msi_message(&irq, translated); 2838 2839 out: 2840 trace_vtd_ir_remap_msi(origin->address, origin->data, 2841 translated->address, translated->data); 2842 return 0; 2843 } 2844 2845 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 2846 MSIMessage *dst, uint16_t sid) 2847 { 2848 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 2849 src, dst, sid); 2850 } 2851 2852 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 2853 uint64_t *data, unsigned size, 2854 MemTxAttrs attrs) 2855 { 2856 return MEMTX_OK; 2857 } 2858 2859 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 2860 uint64_t value, unsigned size, 2861 MemTxAttrs attrs) 2862 { 2863 int ret = 0; 2864 MSIMessage from = {}, to = {}; 2865 uint16_t sid = X86_IOMMU_SID_INVALID; 2866 2867 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 2868 from.data = (uint32_t) value; 2869 2870 if (!attrs.unspecified) { 2871 /* We have explicit Source ID */ 2872 sid = attrs.requester_id; 2873 } 2874 2875 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 2876 if (ret) { 2877 /* TODO: report error */ 2878 /* Drop this interrupt */ 2879 return MEMTX_ERROR; 2880 } 2881 2882 apic_get_class()->send_msi(&to); 2883 2884 return MEMTX_OK; 2885 } 2886 2887 static const MemoryRegionOps vtd_mem_ir_ops = { 2888 .read_with_attrs = vtd_mem_ir_read, 2889 .write_with_attrs = vtd_mem_ir_write, 2890 .endianness = DEVICE_LITTLE_ENDIAN, 2891 .impl = { 2892 .min_access_size = 4, 2893 .max_access_size = 4, 2894 }, 2895 .valid = { 2896 .min_access_size = 4, 2897 .max_access_size = 4, 2898 }, 2899 }; 2900 2901 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 2902 { 2903 uintptr_t key = (uintptr_t)bus; 2904 VTDBus *vtd_bus = 
g_hash_table_lookup(s->vtd_as_by_busptr, &key); 2905 VTDAddressSpace *vtd_dev_as; 2906 char name[128]; 2907 2908 if (!vtd_bus) { 2909 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 2910 *new_key = (uintptr_t)bus; 2911 /* No corresponding free() */ 2912 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 2913 PCI_DEVFN_MAX); 2914 vtd_bus->bus = bus; 2915 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 2916 } 2917 2918 vtd_dev_as = vtd_bus->dev_as[devfn]; 2919 2920 if (!vtd_dev_as) { 2921 snprintf(name, sizeof(name), "intel_iommu_devfn_%d", devfn); 2922 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 2923 2924 vtd_dev_as->bus = bus; 2925 vtd_dev_as->devfn = (uint8_t)devfn; 2926 vtd_dev_as->iommu_state = s; 2927 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 2928 vtd_dev_as->iova_tree = iova_tree_new(); 2929 2930 /* 2931 * Memory region relationships looks like (Address range shows 2932 * only lower 32 bits to make it short in length...): 2933 * 2934 * |-----------------+-------------------+----------| 2935 * | Name | Address range | Priority | 2936 * |-----------------+-------------------+----------+ 2937 * | vtd_root | 00000000-ffffffff | 0 | 2938 * | intel_iommu | 00000000-ffffffff | 1 | 2939 * | vtd_sys_alias | 00000000-ffffffff | 1 | 2940 * | intel_iommu_ir | fee00000-feefffff | 64 | 2941 * |-----------------+-------------------+----------| 2942 * 2943 * We enable/disable DMAR by switching enablement for 2944 * vtd_sys_alias and intel_iommu regions. IR region is always 2945 * enabled. 2946 */ 2947 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 2948 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 2949 "intel_iommu_dmar", 2950 UINT64_MAX); 2951 memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s), 2952 "vtd_sys_alias", get_system_memory(), 2953 0, memory_region_size(get_system_memory())); 2954 memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s), 2955 &vtd_mem_ir_ops, s, "intel_iommu_ir", 2956 VTD_INTERRUPT_ADDR_SIZE); 2957 memory_region_init(&vtd_dev_as->root, OBJECT(s), 2958 "vtd_root", UINT64_MAX); 2959 memory_region_add_subregion_overlap(&vtd_dev_as->root, 2960 VTD_INTERRUPT_ADDR_FIRST, 2961 &vtd_dev_as->iommu_ir, 64); 2962 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name); 2963 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 2964 &vtd_dev_as->sys_alias, 1); 2965 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 2966 MEMORY_REGION(&vtd_dev_as->iommu), 2967 1); 2968 vtd_switch_address_space(vtd_dev_as); 2969 } 2970 return vtd_dev_as; 2971 } 2972 2973 /* Unmap the whole range in the notifier's scope. */ 2974 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 2975 { 2976 IOMMUTLBEntry entry; 2977 hwaddr size; 2978 hwaddr start = n->start; 2979 hwaddr end = n->end; 2980 IntelIOMMUState *s = as->iommu_state; 2981 DMAMap map; 2982 2983 /* 2984 * Note: all the codes in this function has a assumption that IOVA 2985 * bits are no more than VTD_MGAW bits (which is restricted by 2986 * VT-d spec), otherwise we need to consider overflow of 64 bits. 2987 */ 2988 2989 if (end > VTD_ADDRESS_SIZE(s->aw_bits)) { 2990 /* 2991 * Don't need to unmap regions that is bigger than the whole 2992 * VT-d supported address space size 2993 */ 2994 end = VTD_ADDRESS_SIZE(s->aw_bits); 2995 } 2996 2997 assert(start <= end); 2998 size = end - start; 2999 3000 if (ctpop64(size) != 1) { 3001 /* 3002 * This size cannot format a correct mask. 
Let's enlarge it to 3003 * suite the minimum available mask. 3004 */ 3005 int n = 64 - clz64(size); 3006 if (n > s->aw_bits) { 3007 /* should not happen, but in case it happens, limit it */ 3008 n = s->aw_bits; 3009 } 3010 size = 1ULL << n; 3011 } 3012 3013 entry.target_as = &address_space_memory; 3014 /* Adjust iova for the size */ 3015 entry.iova = n->start & ~(size - 1); 3016 /* This field is meaningless for unmap */ 3017 entry.translated_addr = 0; 3018 entry.perm = IOMMU_NONE; 3019 entry.addr_mask = size - 1; 3020 3021 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3022 VTD_PCI_SLOT(as->devfn), 3023 VTD_PCI_FUNC(as->devfn), 3024 entry.iova, size); 3025 3026 map.iova = entry.iova; 3027 map.size = entry.addr_mask; 3028 iova_tree_remove(as->iova_tree, &map); 3029 3030 memory_region_notify_one(n, &entry); 3031 } 3032 3033 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3034 { 3035 VTDAddressSpace *vtd_as; 3036 IOMMUNotifier *n; 3037 3038 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3039 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3040 vtd_address_space_unmap(vtd_as, n); 3041 } 3042 } 3043 } 3044 3045 static void vtd_address_space_refresh_all(IntelIOMMUState *s) 3046 { 3047 vtd_address_space_unmap_all(s); 3048 vtd_switch_address_space_all(s); 3049 } 3050 3051 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) 3052 { 3053 memory_region_notify_one((IOMMUNotifier *)private, entry); 3054 return 0; 3055 } 3056 3057 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3058 { 3059 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3060 IntelIOMMUState *s = vtd_as->iommu_state; 3061 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3062 VTDContextEntry ce; 3063 3064 /* 3065 * The replay can be triggered by either a invalidation or a newly 3066 * created entry. No matter what, we release existing mappings 3067 * (it means flushing caches for UNMAP-only registers). 3068 */ 3069 vtd_address_space_unmap(vtd_as, n); 3070 3071 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3072 trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn), 3073 PCI_FUNC(vtd_as->devfn), 3074 VTD_CONTEXT_ENTRY_DID(ce.hi), 3075 ce.hi, ce.lo); 3076 if (vtd_as_has_map_notifier(vtd_as)) { 3077 /* This is required only for MAP typed notifiers */ 3078 vtd_page_walk_info info = { 3079 .hook_fn = vtd_replay_hook, 3080 .private = (void *)n, 3081 .notify_unmap = false, 3082 .aw = s->aw_bits, 3083 .as = vtd_as, 3084 .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), 3085 }; 3086 3087 vtd_page_walk(&ce, 0, ~0ULL, &info); 3088 } 3089 } else { 3090 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3091 PCI_FUNC(vtd_as->devfn)); 3092 } 3093 3094 return; 3095 } 3096 3097 /* Do the initialization. It will also be called when reset, so pay 3098 * attention when adding new initialization stuff. 
3099 */ 3100 static void vtd_init(IntelIOMMUState *s) 3101 { 3102 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3103 3104 memset(s->csr, 0, DMAR_REG_SIZE); 3105 memset(s->wmask, 0, DMAR_REG_SIZE); 3106 memset(s->w1cmask, 0, DMAR_REG_SIZE); 3107 memset(s->womask, 0, DMAR_REG_SIZE); 3108 3109 s->root = 0; 3110 s->root_extended = false; 3111 s->dmar_enabled = false; 3112 s->iq_head = 0; 3113 s->iq_tail = 0; 3114 s->iq = 0; 3115 s->iq_size = 0; 3116 s->qi_enabled = false; 3117 s->iq_last_desc_type = VTD_INV_DESC_NONE; 3118 s->next_frcd_reg = 0; 3119 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | 3120 VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | 3121 VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); 3122 if (s->aw_bits == VTD_HOST_AW_48BIT) { 3123 s->cap |= VTD_CAP_SAGAW_48bit; 3124 } 3125 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; 3126 3127 /* 3128 * Rsvd field masks for spte 3129 */ 3130 vtd_paging_entry_rsvd_field[0] = ~0ULL; 3131 vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits); 3132 vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); 3133 vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); 3134 vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); 3135 vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits); 3136 vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); 3137 vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); 3138 vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits); 3139 3140 if (x86_iommu->intr_supported) { 3141 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; 3142 if (s->intr_eim == ON_OFF_AUTO_ON) { 3143 s->ecap |= VTD_ECAP_EIM; 3144 } 3145 assert(s->intr_eim != ON_OFF_AUTO_AUTO); 3146 } 3147 3148 if (x86_iommu->dt_supported) { 3149 s->ecap |= VTD_ECAP_DT; 3150 } 3151 3152 if (x86_iommu->pt_supported) { 3153 s->ecap |= VTD_ECAP_PT; 3154 } 3155 3156 if (s->caching_mode) { 3157 s->cap |= VTD_CAP_CM; 3158 } 3159 3160 vtd_reset_caches(s); 3161 3162 /* Define registers with default values and bit semantics */ 3163 vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); 3164 vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); 3165 vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); 3166 vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); 3167 vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); 3168 vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); 3169 vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0); 3170 vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); 3171 vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); 3172 3173 /* Advanced Fault Logging not supported */ 3174 vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); 3175 vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3176 vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); 3177 vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); 3178 3179 /* Treated as RsvdZ when EIM in ECAP_REG is not supported 3180 * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); 3181 */ 3182 vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); 3183 3184 /* Treated as RO for implementations that PLMR and PHMR fields reported 3185 * as Clear in the CAP_REG. 
* vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); 3187 */ 3188 vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); 3189 3190 vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); 3191 vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); 3192 vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0); 3193 vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); 3194 vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3195 vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); 3196 vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); 3197 /* Treated as RsvdZ when EIM in ECAP_REG is not supported */ 3198 vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); 3199 3200 /* IOTLB registers */ 3201 vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0); 3202 vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); 3203 vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); 3204 3205 /* Fault Recording Registers, 128-bit */ 3206 vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); 3207 vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); 3208 3209 /* 3210 * Interrupt remapping registers. 3211 */ 3212 vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); 3213 } 3214 3215 /* Should not reset address_spaces on reset, because devices will still use 3216 * the address space they got at first (they won't ask the bus again). 3217 */ 3218 static void vtd_reset(DeviceState *dev) 3219 { 3220 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3221 3222 vtd_init(s); 3223 vtd_address_space_refresh_all(s); 3224 } 3225 3226 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3227 { 3228 IntelIOMMUState *s = opaque; 3229 VTDAddressSpace *vtd_as; 3230 3231 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3232 3233 vtd_as = vtd_find_add_as(s, bus, devfn); 3234 return &vtd_as->as; 3235 } 3236 3237 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3238 { 3239 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3240 3241 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu->intr_supported) { 3242 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3243 return false; 3244 } 3245 3246 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3247 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3248 && x86_iommu->intr_supported ?
3249 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3250 } 3251 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3252 if (!kvm_irqchip_in_kernel()) { 3253 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3254 return false; 3255 } 3256 if (!kvm_enable_x2apic()) { 3257 error_setg(errp, "eim=on requires support on the KVM side " 3258 "(X2APIC_API, first shipped in v4.7)"); 3259 return false; 3260 } 3261 } 3262 3263 /* Currently the only supported address widths are 39 and 48 bits */ 3264 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3265 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3266 error_setg(errp, "Supported values for x-aw-bits are: %d, %d", 3267 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3268 return false; 3269 } 3270 3271 return true; 3272 } 3273 3274 static void vtd_realize(DeviceState *dev, Error **errp) 3275 { 3276 MachineState *ms = MACHINE(qdev_get_machine()); 3277 PCMachineState *pcms = PC_MACHINE(ms); 3278 PCIBus *bus = pcms->bus; 3279 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3280 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); 3281 3282 x86_iommu->type = TYPE_INTEL; 3283 3284 if (!vtd_decide_config(s, errp)) { 3285 return; 3286 } 3287 3288 QLIST_INIT(&s->vtd_as_with_notifiers); 3289 qemu_mutex_init(&s->iommu_lock); 3290 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3291 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3292 "intel_iommu", DMAR_REG_SIZE); 3293 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3294 /* No corresponding destroy */ 3295 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3296 g_free, g_free); 3297 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3298 g_free, g_free); 3299 vtd_init(s); 3300 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3301 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3302 /* Pseudo address space under root PCI bus. */ 3303 pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3304 } 3305 3306 static void vtd_class_init(ObjectClass *klass, void *data) 3307 { 3308 DeviceClass *dc = DEVICE_CLASS(klass); 3309 X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass); 3310 3311 dc->reset = vtd_reset; 3312 dc->vmsd = &vtd_vmstate; 3313 dc->props = vtd_properties; 3314 dc->hotpluggable = false; 3315 x86_class->realize = vtd_realize; 3316 x86_class->int_remap = vtd_int_remap; 3317 /* Supported by the pc-q35-* machine types */ 3318 dc->user_creatable = true; 3319 } 3320 3321 static const TypeInfo vtd_info = { 3322 .name = TYPE_INTEL_IOMMU_DEVICE, 3323 .parent = TYPE_X86_IOMMU_DEVICE, 3324 .instance_size = sizeof(IntelIOMMUState), 3325 .class_init = vtd_class_init, 3326 }; 3327 3328 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3329 void *data) 3330 { 3331 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3332 3333 imrc->translate = vtd_iommu_translate; 3334 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3335 imrc->replay = vtd_iommu_replay; 3336 } 3337 3338 static const TypeInfo vtd_iommu_memory_region_info = { 3339 .parent = TYPE_IOMMU_MEMORY_REGION, 3340 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3341 .class_init = vtd_iommu_memory_region_class_init, 3342 }; 3343 3344 static void vtd_register_types(void) 3345 { 3346 type_register_static(&vtd_info); 3347 type_register_static(&vtd_iommu_memory_region_info); 3348 } 3349 3350 type_init(vtd_register_types) 3351
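/*
 * Usage sketch (editor's addition, not part of the device model): the device
 * is only expected on the pc-q35-* machine types named above, and interrupt
 * remapping needs a split kernel irqchip, e.g. (illustrative command line):
 *
 *   qemu-system-x86_64 -machine q35,accel=kvm,kernel-irqchip=split \
 *       -device intel-iommu,intremap=on,caching-mode=on ...
 *
 * "caching-mode" and "x-aw-bits" come from vtd_properties above; "intremap"
 * is assumed to be provided by the x86-iommu base class.
 */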