1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qapi/error.h" 25 #include "hw/sysbus.h" 26 #include "exec/address-spaces.h" 27 #include "intel_iommu_internal.h" 28 #include "hw/pci/pci.h" 29 #include "hw/pci/pci_bus.h" 30 #include "hw/i386/pc.h" 31 #include "hw/i386/apic-msidef.h" 32 #include "hw/boards.h" 33 #include "hw/i386/x86-iommu.h" 34 #include "hw/pci-host/q35.h" 35 #include "sysemu/kvm.h" 36 #include "hw/i386/apic_internal.h" 37 #include "kvm_i386.h" 38 #include "trace.h" 39 40 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 41 uint64_t wmask, uint64_t w1cmask) 42 { 43 stq_le_p(&s->csr[addr], val); 44 stq_le_p(&s->wmask[addr], wmask); 45 stq_le_p(&s->w1cmask[addr], w1cmask); 46 } 47 48 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 49 { 50 stq_le_p(&s->womask[addr], mask); 51 } 52 53 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 54 uint32_t wmask, uint32_t w1cmask) 55 { 56 stl_le_p(&s->csr[addr], val); 57 stl_le_p(&s->wmask[addr], wmask); 58 stl_le_p(&s->w1cmask[addr], w1cmask); 59 } 60 61 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 62 { 63 stl_le_p(&s->womask[addr], mask); 64 } 65 66 /* "External" get/set operations */ 67 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 68 { 69 uint64_t oldval = ldq_le_p(&s->csr[addr]); 70 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 71 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 72 stq_le_p(&s->csr[addr], 73 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 74 } 75 76 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 77 { 78 uint32_t oldval = ldl_le_p(&s->csr[addr]); 79 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 80 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 81 stl_le_p(&s->csr[addr], 82 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 83 } 84 85 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 86 { 87 uint64_t val = ldq_le_p(&s->csr[addr]); 88 uint64_t womask = ldq_le_p(&s->womask[addr]); 89 return val & ~womask; 90 } 91 92 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 93 { 94 uint32_t val = ldl_le_p(&s->csr[addr]); 95 uint32_t womask = ldl_le_p(&s->womask[addr]); 96 return val & ~womask; 97 } 98 99 /* "Internal" get/set operations */ 100 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 101 { 102 return ldq_le_p(&s->csr[addr]); 103 } 104 105 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 106 { 107 return ldl_le_p(&s->csr[addr]); 108 } 109 110 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, 
uint64_t val) 111 { 112 stq_le_p(&s->csr[addr], val); 113 } 114 115 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 116 uint32_t clear, uint32_t mask) 117 { 118 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 119 stl_le_p(&s->csr[addr], new_val); 120 return new_val; 121 } 122 123 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 124 uint64_t clear, uint64_t mask) 125 { 126 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 127 stq_le_p(&s->csr[addr], new_val); 128 return new_val; 129 } 130 131 static inline void vtd_iommu_lock(IntelIOMMUState *s) 132 { 133 qemu_mutex_lock(&s->iommu_lock); 134 } 135 136 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 137 { 138 qemu_mutex_unlock(&s->iommu_lock); 139 } 140 141 /* Whether the address space needs to notify new mappings */ 142 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 143 { 144 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 145 } 146 147 /* GHashTable functions */ 148 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 149 { 150 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 151 } 152 153 static guint vtd_uint64_hash(gconstpointer v) 154 { 155 return (guint)*(const uint64_t *)v; 156 } 157 158 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 159 gpointer user_data) 160 { 161 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 162 uint16_t domain_id = *(uint16_t *)user_data; 163 return entry->domain_id == domain_id; 164 } 165 166 /* The shift of an addr for a certain level of paging structure */ 167 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 168 { 169 assert(level != 0); 170 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 171 } 172 173 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 174 { 175 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 176 } 177 178 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 179 gpointer user_data) 180 { 181 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 182 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 183 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 184 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 185 return (entry->domain_id == info->domain_id) && 186 (((entry->gfn & info->mask) == gfn) || 187 (entry->gfn == gfn_tlb)); 188 } 189 190 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 191 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 192 */ 193 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 194 { 195 VTDAddressSpace *vtd_as; 196 VTDBus *vtd_bus; 197 GHashTableIter bus_it; 198 uint32_t devfn_it; 199 200 trace_vtd_context_cache_reset(); 201 202 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 203 204 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 205 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 206 vtd_as = vtd_bus->dev_as[devfn_it]; 207 if (!vtd_as) { 208 continue; 209 } 210 vtd_as->context_cache_entry.context_cache_gen = 0; 211 } 212 } 213 s->context_cache_gen = 1; 214 } 215 216 /* Must be called with IOMMU lock held. 
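 *
 * A caller that does not already hold the lock is expected to wrap the
 * call itself; a minimal sketch of the pattern (it is exactly what
 * vtd_reset_iotlb() below does):
 *
 *     vtd_iommu_lock(s);
 *     vtd_reset_iotlb_locked(s);
 *     vtd_iommu_unlock(s);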
*/ 217 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 218 { 219 assert(s->iotlb); 220 g_hash_table_remove_all(s->iotlb); 221 } 222 223 static void vtd_reset_iotlb(IntelIOMMUState *s) 224 { 225 vtd_iommu_lock(s); 226 vtd_reset_iotlb_locked(s); 227 vtd_iommu_unlock(s); 228 } 229 230 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 231 uint32_t level) 232 { 233 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 234 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 235 } 236 237 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 238 { 239 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 240 } 241 242 /* Must be called with IOMMU lock held */ 243 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 244 hwaddr addr) 245 { 246 VTDIOTLBEntry *entry; 247 uint64_t key; 248 int level; 249 250 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 251 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 252 source_id, level); 253 entry = g_hash_table_lookup(s->iotlb, &key); 254 if (entry) { 255 goto out; 256 } 257 } 258 259 out: 260 return entry; 261 } 262 263 /* Must be with IOMMU lock held */ 264 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 265 uint16_t domain_id, hwaddr addr, uint64_t slpte, 266 uint8_t access_flags, uint32_t level) 267 { 268 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 269 uint64_t *key = g_malloc(sizeof(*key)); 270 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 271 272 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 273 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 274 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 275 vtd_reset_iotlb_locked(s); 276 } 277 278 entry->gfn = gfn; 279 entry->domain_id = domain_id; 280 entry->slpte = slpte; 281 entry->access_flags = access_flags; 282 entry->mask = vtd_slpt_level_page_mask(level); 283 *key = vtd_get_iotlb_key(gfn, source_id, level); 284 g_hash_table_replace(s->iotlb, key, entry); 285 } 286 287 /* Given the reg addr of both the message data and address, generate an 288 * interrupt via MSI. 289 */ 290 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 291 hwaddr mesg_data_reg) 292 { 293 MSIMessage msi; 294 295 assert(mesg_data_reg < DMAR_REG_SIZE); 296 assert(mesg_addr_reg < DMAR_REG_SIZE); 297 298 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 299 msi.data = vtd_get_long_raw(s, mesg_data_reg); 300 301 trace_vtd_irq_generate(msi.address, msi.data); 302 303 apic_get_class()->send_msi(&msi); 304 } 305 306 /* Generate a fault event to software via MSI if conditions are met. 307 * Notice that the value of FSTS_REG being passed to it should be the one 308 * before any update. 
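 *
 * A minimal sketch of the expected calling pattern (it mirrors what
 * vtd_handle_inv_queue_error() below does): snapshot FSTS_REG first,
 * then update it, then pass the snapshot in:
 *
 *     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 *     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
 *     vtd_generate_fault_event(s, fsts_reg);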
309 */ 310 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) 311 { 312 if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || 313 pre_fsts & VTD_FSTS_IQE) { 314 error_report_once("There are previous interrupt conditions " 315 "to be serviced by software, fault event " 316 "is not generated"); 317 return; 318 } 319 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); 320 if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { 321 error_report_once("Interrupt Mask set, irq is not generated"); 322 } else { 323 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 324 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 325 } 326 } 327 328 /* Check if the Fault (F) field of the Fault Recording Register referenced by 329 * @index is Set. 330 */ 331 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) 332 { 333 /* Each reg is 128-bit */ 334 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 335 addr += 8; /* Access the high 64-bit half */ 336 337 assert(index < DMAR_FRCD_REG_NR); 338 339 return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; 340 } 341 342 /* Update the PPF field of Fault Status Register. 343 * Should be called whenever change the F field of any fault recording 344 * registers. 345 */ 346 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 347 { 348 uint32_t i; 349 uint32_t ppf_mask = 0; 350 351 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 352 if (vtd_is_frcd_set(s, i)) { 353 ppf_mask = VTD_FSTS_PPF; 354 break; 355 } 356 } 357 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 358 trace_vtd_fsts_ppf(!!ppf_mask); 359 } 360 361 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 362 { 363 /* Each reg is 128-bit */ 364 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 365 addr += 8; /* Access the high 64-bit half */ 366 367 assert(index < DMAR_FRCD_REG_NR); 368 369 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 370 vtd_update_fsts_ppf(s); 371 } 372 373 /* Must not update F field now, should be done later */ 374 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 375 uint16_t source_id, hwaddr addr, 376 VTDFaultReason fault, bool is_write) 377 { 378 uint64_t hi = 0, lo; 379 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 380 381 assert(index < DMAR_FRCD_REG_NR); 382 383 lo = VTD_FRCD_FI(addr); 384 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 385 if (!is_write) { 386 hi |= VTD_FRCD_T; 387 } 388 vtd_set_quad_raw(s, frcd_reg_addr, lo); 389 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 390 391 trace_vtd_frr_new(index, hi, lo); 392 } 393 394 /* Try to collapse multiple pending faults from the same requester */ 395 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 396 { 397 uint32_t i; 398 uint64_t frcd_reg; 399 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 400 401 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 402 frcd_reg = vtd_get_quad_raw(s, addr); 403 if ((frcd_reg & VTD_FRCD_F) && 404 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 405 return true; 406 } 407 addr += 16; /* 128-bit for each */ 408 } 409 return false; 410 } 411 412 /* Log and report an DMAR (address translation) fault to software */ 413 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 414 hwaddr addr, VTDFaultReason fault, 415 bool is_write) 416 { 417 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 418 419 assert(fault < VTD_FR_MAX); 420 421 if (fault == VTD_FR_RESERVED_ERR) { 422 /* This 
is not a normal fault reason case. Drop it. */ 423 return; 424 } 425 426 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 427 428 if (fsts_reg & VTD_FSTS_PFO) { 429 error_report_once("New fault is not recorded due to " 430 "Primary Fault Overflow"); 431 return; 432 } 433 434 if (vtd_try_collapse_fault(s, source_id)) { 435 error_report_once("New fault is not recorded due to " 436 "compression of faults"); 437 return; 438 } 439 440 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 441 error_report_once("Next Fault Recording Reg is used, " 442 "new fault is not recorded, set PFO field"); 443 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 444 return; 445 } 446 447 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 448 449 if (fsts_reg & VTD_FSTS_PPF) { 450 error_report_once("There are pending faults already, " 451 "fault event is not generated"); 452 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 453 s->next_frcd_reg++; 454 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 455 s->next_frcd_reg = 0; 456 } 457 } else { 458 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 459 VTD_FSTS_FRI(s->next_frcd_reg)); 460 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 461 s->next_frcd_reg++; 462 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 463 s->next_frcd_reg = 0; 464 } 465 /* This case actually cause the PPF to be Set. 466 * So generate fault event (interrupt). 467 */ 468 vtd_generate_fault_event(s, fsts_reg); 469 } 470 } 471 472 /* Handle Invalidation Queue Errors of queued invalidation interface error 473 * conditions. 474 */ 475 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 476 { 477 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 478 479 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 480 vtd_generate_fault_event(s, fsts_reg); 481 } 482 483 /* Set the IWC field and try to generate an invalidation completion interrupt */ 484 static void vtd_generate_completion_event(IntelIOMMUState *s) 485 { 486 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 487 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 488 return; 489 } 490 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 491 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 492 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 493 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 494 "new event not generated"); 495 return; 496 } else { 497 /* Generate the interrupt event */ 498 trace_vtd_inv_desc_wait_irq("Generating complete event"); 499 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 500 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 501 } 502 } 503 504 static inline bool vtd_root_entry_present(VTDRootEntry *root) 505 { 506 return root->val & VTD_ROOT_ENTRY_P; 507 } 508 509 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 510 VTDRootEntry *re) 511 { 512 dma_addr_t addr; 513 514 addr = s->root + index * sizeof(*re); 515 if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { 516 trace_vtd_re_invalid(re->rsvd, re->val); 517 re->val = 0; 518 return -VTD_FR_ROOT_TABLE_INV; 519 } 520 re->val = le64_to_cpu(re->val); 521 return 0; 522 } 523 524 static inline bool vtd_ce_present(VTDContextEntry *context) 525 { 526 return context->lo & VTD_CONTEXT_ENTRY_P; 527 } 528 529 static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index, 530 VTDContextEntry *ce) 531 { 532 dma_addr_t addr; 533 534 /* we have checked that root entry is present */ 535 addr = 
(root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
        trace_vtd_re_invalid(root->rsvd, root->val);
        return -VTD_FR_CONTEXT_TABLE_INV;
    }
    ce->lo = le64_to_cpu(ce->lo);
    ce->hi = le64_to_cpu(ce->hi);
    return 0;
}

static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}

static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
}

/* Whether the pte indicates the address of the page frame */
static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
{
    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
}

/* Get the content of an slpte located in @base_addr[@index] */
static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
{
    uint64_t slpte;

    assert(index < VTD_SL_PT_ENTRY_NR);

    if (dma_memory_read(&address_space_memory,
                        base_addr + index * sizeof(slpte), &slpte,
                        sizeof(slpte))) {
        slpte = (uint64_t)-1;
        return slpte;
    }
    slpte = le64_to_cpu(slpte);
    return slpte;
}

/* Given an iova and the level of paging structure, return the offset
 * of the current level.
 */
static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
{
    return (iova >> vtd_slpt_level_shift(level)) &
           ((1ULL << VTD_SL_LEVEL_BITS) - 1);
}

/* Check Capability Register to see if the @level of page-table is supported */
static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
{
    return VTD_CAP_SAGAW_MASK & s->cap &
           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
}

/* Get the page-table level that hardware should use for the second-level
 * page-table walk from the Address Width field of context-entry.
 */
static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
{
    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
}

static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
{
    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
}

static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_TT;
}

/* Return true if check passed, otherwise false */
static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
                                     VTDContextEntry *ce)
{
    switch (vtd_ce_get_type(ce)) {
    case VTD_CONTEXT_TT_MULTI_LEVEL:
        /* Always supported */
        break;
    case VTD_CONTEXT_TT_DEV_IOTLB:
        if (!x86_iommu->dt_supported) {
            return false;
        }
        break;
    case VTD_CONTEXT_TT_PASS_THROUGH:
        if (!x86_iommu->pt_supported) {
            return false;
        }
        break;
    default:
        /* Unknown type */
        return false;
    }
    return true;
}

static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
{
    uint32_t ce_agaw = vtd_ce_get_agaw(ce);
    return 1ULL << MIN(ce_agaw, aw);
}

/* Return true if IOVA passes range check, otherwise false. */
static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
                                        uint8_t aw)
{
    /*
     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
     * in CAP_REG and AW in context-entry.
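     *
     * Worked example (an illustration, not a fixed configuration):
     * with aw = 48 and a context entry whose AW field selects a
     * 3-level table, vtd_ce_get_agaw() returns 39, vtd_iova_limit()
     * is 1ULL << 39, and any iova with bit 39 or above set fails the
     * check below.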
650 */ 651 return !(iova & ~(vtd_iova_limit(ce, aw) - 1)); 652 } 653 654 /* 655 * Rsvd field masks for spte: 656 * Index [1] to [4] 4k pages 657 * Index [5] to [8] large pages 658 */ 659 static uint64_t vtd_paging_entry_rsvd_field[9]; 660 661 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 662 { 663 if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) { 664 /* Maybe large page */ 665 return slpte & vtd_paging_entry_rsvd_field[level + 4]; 666 } else { 667 return slpte & vtd_paging_entry_rsvd_field[level]; 668 } 669 } 670 671 /* Find the VTD address space associated with a given bus number */ 672 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 673 { 674 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 675 if (!vtd_bus) { 676 /* 677 * Iterate over the registered buses to find the one which 678 * currently hold this bus number, and update the bus_num 679 * lookup table: 680 */ 681 GHashTableIter iter; 682 683 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 684 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 685 if (pci_bus_num(vtd_bus->bus) == bus_num) { 686 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 687 return vtd_bus; 688 } 689 } 690 } 691 return vtd_bus; 692 } 693 694 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 695 * of the translation, can be used for deciding the size of large page. 696 */ 697 static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, 698 uint64_t *slptep, uint32_t *slpte_level, 699 bool *reads, bool *writes, uint8_t aw_bits) 700 { 701 dma_addr_t addr = vtd_ce_get_slpt_base(ce); 702 uint32_t level = vtd_ce_get_level(ce); 703 uint32_t offset; 704 uint64_t slpte; 705 uint64_t access_right_check; 706 707 if (!vtd_iova_range_check(iova, ce, aw_bits)) { 708 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 709 __func__, iova); 710 return -VTD_FR_ADDR_BEYOND_MGAW; 711 } 712 713 /* FIXME: what is the Atomics request here? */ 714 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 715 716 while (true) { 717 offset = vtd_iova_level_offset(iova, level); 718 slpte = vtd_get_slpte(addr, offset); 719 720 if (slpte == (uint64_t)-1) { 721 error_report_once("%s: detected read error on DMAR slpte " 722 "(iova=0x%" PRIx64 ")", __func__, iova); 723 if (level == vtd_ce_get_level(ce)) { 724 /* Invalid programming of context-entry */ 725 return -VTD_FR_CONTEXT_ENTRY_INV; 726 } else { 727 return -VTD_FR_PAGING_ENTRY_INV; 728 } 729 } 730 *reads = (*reads) && (slpte & VTD_SL_R); 731 *writes = (*writes) && (slpte & VTD_SL_W); 732 if (!(slpte & access_right_check)) { 733 error_report_once("%s: detected slpte permission error " 734 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 735 "slpte=0x%" PRIx64 ", write=%d)", __func__, 736 iova, level, slpte, is_write); 737 return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ;
        }
        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
            error_report_once("%s: detected slpte with reserved bits set "
                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
                              "slpte=0x%" PRIx64 ")", __func__, iova,
                              level, slpte);
            return -VTD_FR_PAGING_ENTRY_RSVD;
        }

        if (vtd_is_last_slpte(slpte, level)) {
            *slptep = slpte;
            *slpte_level = level;
            return 0;
        }
        addr = vtd_get_slpte_addr(slpte, aw_bits);
        level--;
    }
}

typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);

/**
 * Constant information used during page walking
 *
 * @hook_fn: hook function to be called when a page is detected
 * @private: private data to be passed into the hook function
 * @notify_unmap: whether we should notify invalid entries
 * @as: VT-d address space of the device
 * @aw: maximum address width
 * @domain_id: domain ID of the page walk
 */
typedef struct {
    VTDAddressSpace *as;
    vtd_page_walk_hook hook_fn;
    void *private;
    bool notify_unmap;
    uint8_t aw;
    uint16_t domain_id;
} vtd_page_walk_info;

static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
{
    VTDAddressSpace *as = info->as;
    vtd_page_walk_hook hook_fn = info->hook_fn;
    void *private = info->private;
    DMAMap target = {
        .iova = entry->iova,
        .size = entry->addr_mask,
        .translated_addr = entry->translated_addr,
        .perm = entry->perm,
    };
    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);

    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
        return 0;
    }

    assert(hook_fn);

    /* Update local IOVA mapped ranges */
    if (entry->perm) {
        if (mapped) {
            /* If it's exactly the same translation, skip */
            if (!memcmp(mapped, &target, sizeof(target))) {
                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
                                                 entry->translated_addr);
                return 0;
            } else {
                /*
                 * Translation changed.  Normally this should not
                 * happen, but it can with buggy guest OSes.  Note
                 * that there will be a small window where we have no
                 * mapping at all, but that's the best effort we can
                 * do.  The ideal way to emulate this is to atomically
                 * modify the PTE to follow what has changed, but we
                 * can't.  One example is that the vfio driver only
                 * has VFIO_IOMMU_[UN]MAP_DMA but no interface to
                 * modify a mapping (meanwhile it seems meaningless to
                 * even provide one).  Anyway, let's mark this as a
                 * TODO in case one day we'll have a better solution.
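                 *
                 * Roughly, what the code below does (a description of
                 * what follows, not an alternative implementation) is:
                 *
                 *   1. notify an UNMAP for the old range
                 *      (entry->perm = IOMMU_NONE, then hook_fn);
                 *   2. drop the stale range from as->iova_tree;
                 *   3. restore entry->perm and fall through, so the
                 *      new mapping is inserted and notified as usual.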
820 */ 821 IOMMUAccessFlags cache_perm = entry->perm; 822 int ret; 823 824 /* Emulate an UNMAP */ 825 entry->perm = IOMMU_NONE; 826 trace_vtd_page_walk_one(info->domain_id, 827 entry->iova, 828 entry->translated_addr, 829 entry->addr_mask, 830 entry->perm); 831 ret = hook_fn(entry, private); 832 if (ret) { 833 return ret; 834 } 835 /* Drop any existing mapping */ 836 iova_tree_remove(as->iova_tree, &target); 837 /* Recover the correct permission */ 838 entry->perm = cache_perm; 839 } 840 } 841 iova_tree_insert(as->iova_tree, &target); 842 } else { 843 if (!mapped) { 844 /* Skip since we didn't map this range at all */ 845 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 846 return 0; 847 } 848 iova_tree_remove(as->iova_tree, &target); 849 } 850 851 trace_vtd_page_walk_one(info->domain_id, entry->iova, 852 entry->translated_addr, entry->addr_mask, 853 entry->perm); 854 return hook_fn(entry, private); 855 } 856 857 /** 858 * vtd_page_walk_level - walk over specific level for IOVA range 859 * 860 * @addr: base GPA addr to start the walk 861 * @start: IOVA range start address 862 * @end: IOVA range end address (start <= addr < end) 863 * @read: whether parent level has read permission 864 * @write: whether parent level has write permission 865 * @info: constant information for the page walk 866 */ 867 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 868 uint64_t end, uint32_t level, bool read, 869 bool write, vtd_page_walk_info *info) 870 { 871 bool read_cur, write_cur, entry_valid; 872 uint32_t offset; 873 uint64_t slpte; 874 uint64_t subpage_size, subpage_mask; 875 IOMMUTLBEntry entry; 876 uint64_t iova = start; 877 uint64_t iova_next; 878 int ret = 0; 879 880 trace_vtd_page_walk_level(addr, level, start, end); 881 882 subpage_size = 1ULL << vtd_slpt_level_shift(level); 883 subpage_mask = vtd_slpt_level_page_mask(level); 884 885 while (iova < end) { 886 iova_next = (iova & subpage_mask) + subpage_size; 887 888 offset = vtd_iova_level_offset(iova, level); 889 slpte = vtd_get_slpte(addr, offset); 890 891 if (slpte == (uint64_t)-1) { 892 trace_vtd_page_walk_skip_read(iova, iova_next); 893 goto next; 894 } 895 896 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 897 trace_vtd_page_walk_skip_reserve(iova, iova_next); 898 goto next; 899 } 900 901 /* Permissions are stacked with parents' */ 902 read_cur = read && (slpte & VTD_SL_R); 903 write_cur = write && (slpte & VTD_SL_W); 904 905 /* 906 * As long as we have either read/write permission, this is a 907 * valid entry. The rule works for both page entries and page 908 * table entries. 909 */ 910 entry_valid = read_cur | write_cur; 911 912 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 913 /* 914 * This is a valid PDE (or even bigger than PDE). We need 915 * to walk one further level. 916 */ 917 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 918 iova, MIN(iova_next, end), level - 1, 919 read_cur, write_cur, info); 920 } else { 921 /* 922 * This means we are either: 923 * 924 * (1) the real page entry (either 4K page, or huge page) 925 * (2) the whole range is invalid 926 * 927 * In either case, we send an IOTLB notification down. 
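             *
             * Worked example: for a 2M page entry at level 2,
             * vtd_slpt_level_shift(2) is 21, so entry.iova below is
             * 2M-aligned and entry.addr_mask is 0x1fffff.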
928 */ 929 entry.target_as = &address_space_memory; 930 entry.iova = iova & subpage_mask; 931 entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 932 entry.addr_mask = ~subpage_mask; 933 /* NOTE: this is only meaningful if entry_valid == true */ 934 entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 935 ret = vtd_page_walk_one(&entry, info); 936 } 937 938 if (ret < 0) { 939 return ret; 940 } 941 942 next: 943 iova = iova_next; 944 } 945 946 return 0; 947 } 948 949 /** 950 * vtd_page_walk - walk specific IOVA range, and call the hook 951 * 952 * @ce: context entry to walk upon 953 * @start: IOVA address to start the walk 954 * @end: IOVA range end address (start <= addr < end) 955 * @info: page walking information struct 956 */ 957 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, 958 vtd_page_walk_info *info) 959 { 960 dma_addr_t addr = vtd_ce_get_slpt_base(ce); 961 uint32_t level = vtd_ce_get_level(ce); 962 963 if (!vtd_iova_range_check(start, ce, info->aw)) { 964 return -VTD_FR_ADDR_BEYOND_MGAW; 965 } 966 967 if (!vtd_iova_range_check(end, ce, info->aw)) { 968 /* Fix end so that it reaches the maximum */ 969 end = vtd_iova_limit(ce, info->aw); 970 } 971 972 return vtd_page_walk_level(addr, start, end, level, true, true, info); 973 } 974 975 /* Map a device to its corresponding domain (context-entry) */ 976 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 977 uint8_t devfn, VTDContextEntry *ce) 978 { 979 VTDRootEntry re; 980 int ret_fr; 981 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 982 983 ret_fr = vtd_get_root_entry(s, bus_num, &re); 984 if (ret_fr) { 985 return ret_fr; 986 } 987 988 if (!vtd_root_entry_present(&re)) { 989 /* Not error - it's okay we don't have root entry. */ 990 trace_vtd_re_not_present(bus_num); 991 return -VTD_FR_ROOT_ENTRY_P; 992 } 993 994 if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) { 995 trace_vtd_re_invalid(re.rsvd, re.val); 996 return -VTD_FR_ROOT_ENTRY_RSVD; 997 } 998 999 ret_fr = vtd_get_context_entry_from_root(&re, devfn, ce); 1000 if (ret_fr) { 1001 return ret_fr; 1002 } 1003 1004 if (!vtd_ce_present(ce)) { 1005 /* Not error - it's okay we don't have context entry. */ 1006 trace_vtd_ce_not_present(bus_num, devfn); 1007 return -VTD_FR_CONTEXT_ENTRY_P; 1008 } 1009 1010 if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || 1011 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1012 trace_vtd_ce_invalid(ce->hi, ce->lo); 1013 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1014 } 1015 1016 /* Check if the programming of context-entry is valid */ 1017 if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1018 trace_vtd_ce_invalid(ce->hi, ce->lo); 1019 return -VTD_FR_CONTEXT_ENTRY_INV; 1020 } 1021 1022 /* Do translation type check */ 1023 if (!vtd_ce_type_check(x86_iommu, ce)) { 1024 trace_vtd_ce_invalid(ce->hi, ce->lo); 1025 return -VTD_FR_CONTEXT_ENTRY_INV; 1026 } 1027 1028 return 0; 1029 } 1030 1031 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, 1032 void *private) 1033 { 1034 memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); 1035 return 0; 1036 } 1037 1038 /* If context entry is NULL, we'll try to fetch it on our own. 
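 *
 * Callers that already looked the context entry up (e.g. the PSI path
 * in vtd_iotlb_page_invalidate_notify()) pass it in to avoid a second
 * walk of the root/context tables; vtd_sync_shadow_page_table() passes
 * NULL and lets this function fetch it.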
 */
static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
                                            VTDContextEntry *ce,
                                            hwaddr addr, hwaddr size)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    vtd_page_walk_info info = {
        .hook_fn = vtd_sync_shadow_page_hook,
        .private = (void *)&vtd_as->iommu,
        .notify_unmap = true,
        .aw = s->aw_bits,
        .as = vtd_as,
    };
    VTDContextEntry ce_cache;
    int ret;

    if (ce) {
        /* If the caller provided a context entry, use it */
        ce_cache = *ce;
    } else {
        /* If the caller didn't provide one, try to fetch it */
        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce_cache);
        if (ret) {
            /*
             * This should not really happen, but in case it happens,
             * we just skip the sync for this time.  After all we
             * don't even have the root table pointer!
             */
            error_report_once("%s: invalid context entry for bus 0x%x"
                              " devfn 0x%x",
                              __func__, pci_bus_num(vtd_as->bus),
                              vtd_as->devfn);
            return 0;
        }
    }

    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);

    return vtd_page_walk(&ce_cache, addr, addr + size, &info);
}

static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
{
    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
}

/*
 * Fetch the translation type for a specific device. Returns <0 if an
 * error happens, otherwise return the shifted type to check against
 * VTD_CONTEXT_TT_*.
 */
static int vtd_dev_get_trans_type(VTDAddressSpace *as)
{
    IntelIOMMUState *s;
    VTDContextEntry ce;
    int ret;

    s = as->iommu_state;

    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
                                   as->devfn, &ce);
    if (ret) {
        return ret;
    }

    return vtd_ce_get_type(&ce);
}

static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
{
    int ret;

    assert(as);

    ret = vtd_dev_get_trans_type(as);
    if (ret < 0) {
        /*
         * Possibly failed to parse the context entry for some reason
         * (e.g., during init, or any guest configuration errors on
         * context entries).  We should assume PT is not enabled for
         * safety.
         */
        return false;
    }

    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
}

/* Return whether the device is using IOMMU translation. */
static bool vtd_switch_address_space(VTDAddressSpace *as)
{
    bool use_iommu;
    /* Whether we need to take the BQL on our own */
    bool take_bql = !qemu_mutex_iothread_locked();

    assert(as);

    use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);

    trace_vtd_switch_address_space(pci_bus_num(as->bus),
                                   VTD_PCI_SLOT(as->devfn),
                                   VTD_PCI_FUNC(as->devfn),
                                   use_iommu);

    /*
     * It's possible that we reach here without the BQL, e.g., when
     * called from vtd_pt_enable_fast_path().  However the memory APIs
     * need it.  We'd better make sure we have had it already, or take it.
1147 */ 1148 if (take_bql) { 1149 qemu_mutex_lock_iothread(); 1150 } 1151 1152 /* Turn off first then on the other */ 1153 if (use_iommu) { 1154 memory_region_set_enabled(&as->sys_alias, false); 1155 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1156 } else { 1157 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1158 memory_region_set_enabled(&as->sys_alias, true); 1159 } 1160 1161 if (take_bql) { 1162 qemu_mutex_unlock_iothread(); 1163 } 1164 1165 return use_iommu; 1166 } 1167 1168 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1169 { 1170 GHashTableIter iter; 1171 VTDBus *vtd_bus; 1172 int i; 1173 1174 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1175 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1176 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1177 if (!vtd_bus->dev_as[i]) { 1178 continue; 1179 } 1180 vtd_switch_address_space(vtd_bus->dev_as[i]); 1181 } 1182 } 1183 } 1184 1185 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1186 { 1187 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1188 } 1189 1190 static const bool vtd_qualified_faults[] = { 1191 [VTD_FR_RESERVED] = false, 1192 [VTD_FR_ROOT_ENTRY_P] = false, 1193 [VTD_FR_CONTEXT_ENTRY_P] = true, 1194 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1195 [VTD_FR_ADDR_BEYOND_MGAW] = true, 1196 [VTD_FR_WRITE] = true, 1197 [VTD_FR_READ] = true, 1198 [VTD_FR_PAGING_ENTRY_INV] = true, 1199 [VTD_FR_ROOT_TABLE_INV] = false, 1200 [VTD_FR_CONTEXT_TABLE_INV] = false, 1201 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1202 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1203 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1204 [VTD_FR_RESERVED_ERR] = false, 1205 [VTD_FR_MAX] = false, 1206 }; 1207 1208 /* To see if a fault condition is "qualified", which is reported to software 1209 * only if the FPD field in the context-entry used to process the faulting 1210 * request is 0. 1211 */ 1212 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1213 { 1214 return vtd_qualified_faults[fault]; 1215 } 1216 1217 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1218 { 1219 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1220 } 1221 1222 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1223 { 1224 VTDBus *vtd_bus; 1225 VTDAddressSpace *vtd_as; 1226 bool success = false; 1227 1228 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1229 if (!vtd_bus) { 1230 goto out; 1231 } 1232 1233 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1234 if (!vtd_as) { 1235 goto out; 1236 } 1237 1238 if (vtd_switch_address_space(vtd_as) == false) { 1239 /* We switched off IOMMU region successfully. */ 1240 success = true; 1241 } 1242 1243 out: 1244 trace_vtd_pt_enable_fast_path(source_id, success); 1245 } 1246 1247 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1248 * translation. 1249 * 1250 * Called from RCU critical section. 1251 * 1252 * @bus_num: The bus number 1253 * @devfn: The devfn, which is the combined of device and function number 1254 * @is_write: The access is a write operation 1255 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1256 * 1257 * Returns true if translation is successful, otherwise false. 
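 *
 * The lookup order below is: per-source-id IOTLB first, then the cached
 * context entry, then a full vtd_iova_to_slpte() walk whose result is
 * inserted back into the IOTLB.  As an illustration, on a successful
 * 4K translation the caller gets back:
 *
 *     entry->iova            = addr & VTD_PAGE_MASK_4K;
 *     entry->translated_addr = vtd_get_slpte_addr(slpte, aw) &
 *                              VTD_PAGE_MASK_4K;
 *     entry->addr_mask       = ~VTD_PAGE_MASK_4K;
 *     entry->perm            = access_flags;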
 */
static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                                   uint8_t devfn, hwaddr addr, bool is_write,
                                   IOMMUTLBEntry *entry)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    VTDContextEntry ce;
    uint8_t bus_num = pci_bus_num(bus);
    VTDContextCacheEntry *cc_entry;
    uint64_t slpte, page_mask;
    uint32_t level;
    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
    int ret_fr;
    bool is_fpd_set = false;
    bool reads = true;
    bool writes = true;
    uint8_t access_flags;
    VTDIOTLBEntry *iotlb_entry;

    /*
     * We have a standalone memory region for interrupt addresses, we
     * should never receive translation requests in this region.
     */
    assert(!vtd_is_interrupt_addr(addr));

    vtd_iommu_lock(s);

    cc_entry = &vtd_as->context_cache_entry;

    /* Try to fetch slpte from IOTLB */
    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
    if (iotlb_entry) {
        trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
                                 iotlb_entry->domain_id);
        slpte = iotlb_entry->slpte;
        access_flags = iotlb_entry->access_flags;
        page_mask = iotlb_entry->mask;
        goto out;
    }

    /* Try to fetch context-entry from cache first */
    if (cc_entry->context_cache_gen == s->context_cache_gen) {
        trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
                               cc_entry->context_entry.lo,
                               cc_entry->context_cache_gen);
        ce = cc_entry->context_entry;
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
    } else {
        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
        if (ret_fr) {
            ret_fr = -ret_fr;
            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
                trace_vtd_fault_disabled();
            } else {
                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
            }
            goto error;
        }
        /* Update context-cache */
        trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
                                  cc_entry->context_cache_gen,
                                  s->context_cache_gen);
        cc_entry->context_entry = ce;
        cc_entry->context_cache_gen = s->context_cache_gen;
    }

    /*
     * We don't need to translate for pass-through context entries.
     * Also, let's ignore IOTLB caching as well for PT devices.
     */
    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
        entry->iova = addr & VTD_PAGE_MASK_4K;
        entry->translated_addr = entry->iova;
        entry->addr_mask = ~VTD_PAGE_MASK_4K;
        entry->perm = IOMMU_RW;
        trace_vtd_translate_pt(source_id, entry->iova);

        /*
         * When this happens, it means that caching-mode is not
         * enabled and that this is the first passthrough translation
         * for the device.  Let's enable the fast path for passthrough.
         *
         * When passthrough is disabled again for the device, we can
         * capture it via the context entry invalidation, then the
         * IOMMU region can be swapped back.
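         *
         * "Fast path" simply means vtd_switch_address_space() flips
         * this device from the IOMMU memory region to the sys_alias
         * region, so further DMA from it no longer goes through this
         * translate callback at all.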
         */
        vtd_pt_enable_fast_path(s, source_id);
        vtd_iommu_unlock(s);
        return true;
    }

    ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
                               &reads, &writes, s->aw_bits);
    if (ret_fr) {
        ret_fr = -ret_fr;
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
            trace_vtd_fault_disabled();
        } else {
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
        }
        goto error;
    }

    page_mask = vtd_slpt_level_page_mask(level);
    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
                     access_flags, level);
out:
    vtd_iommu_unlock(s);
    entry->iova = addr & page_mask;
    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
    entry->addr_mask = ~page_mask;
    entry->perm = access_flags;
    return true;

error:
    vtd_iommu_unlock(s);
    entry->iova = 0;
    entry->translated_addr = 0;
    entry->addr_mask = 0;
    entry->perm = IOMMU_NONE;
    return false;
}

static void vtd_root_table_setup(IntelIOMMUState *s)
{
    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
    s->root_extended = s->root & VTD_RTADDR_RTT;
    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);

    trace_vtd_reg_dmar_root(s->root, s->root_extended);
}

static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
                               uint32_t index, uint32_t mask)
{
    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
}

static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
{
    uint64_t value = 0;
    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
    s->intr_eime = value & VTD_IRTA_EIME;

    /* Notify global invalidation */
    vtd_iec_notify_all(s, true, 0, 0);

    trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
}

static void vtd_iommu_replay_all(IntelIOMMUState *s)
{
    VTDAddressSpace *vtd_as;

    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
        vtd_sync_shadow_page_table(vtd_as);
    }
}

static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
    trace_vtd_inv_desc_cc_global();
    /* Protects context cache */
    vtd_iommu_lock(s);
    s->context_cache_gen++;
    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
        vtd_reset_context_cache_locked(s);
    }
    vtd_iommu_unlock(s);
    vtd_switch_address_space_all(s);
    /*
     * From VT-d spec 6.5.2.1, a global context entry invalidation
     * should be followed by an IOTLB global invalidation, so we
     * should be safe even without this.  However, let's replay the
     * region as well to be safer, and go back here when we need
     * finer tuning for the VT-d emulation code.
     */
    vtd_iommu_replay_all(s);
}

/* Do a context-cache device-selective invalidation.
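 * Example: with source_id 02:05.0 and func_mask == 3, the low three
 * bits of the SID are ignored, so the cached context entries of
 * 02:05.0 through 02:05.7 are all invalidated.
 *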
 * @func_mask: FM field after shifting
 */
static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                          uint16_t source_id,
                                          uint16_t func_mask)
{
    uint16_t mask;
    VTDBus *vtd_bus;
    VTDAddressSpace *vtd_as;
    uint8_t bus_n, devfn;
    uint16_t devfn_it;

    trace_vtd_inv_desc_cc_devices(source_id, func_mask);

    switch (func_mask & 3) {
    case 0:
        mask = 0;   /* No bits in the SID field masked */
        break;
    case 1:
        mask = 4;   /* Mask bit 2 in the SID field */
        break;
    case 2:
        mask = 6;   /* Mask bit 2:1 in the SID field */
        break;
    case 3:
        mask = 7;   /* Mask bit 2:0 in the SID field */
        break;
    }
    mask = ~mask;

    bus_n = VTD_SID_TO_BUS(source_id);
    vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
    if (vtd_bus) {
        devfn = VTD_SID_TO_DEVFN(source_id);
        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
                                             VTD_PCI_FUNC(devfn_it));
                vtd_iommu_lock(s);
                vtd_as->context_cache_entry.context_cache_gen = 0;
                vtd_iommu_unlock(s);
                /*
                 * Do switch address space when needed, in case the
                 * device's passthrough bit has been switched.
                 */
                vtd_switch_address_space(vtd_as);
                /*
                 * A device may be moving out of (or into) a domain,
                 * so resync the shadow page table.  This is harmless
                 * even if we have no such notifier registered - the
                 * IOMMU notification framework will skip MAP
                 * notifications in that case.
                 */
                vtd_sync_shadow_page_table(vtd_as);
            }
        }
    }
}

/* Context-cache invalidation
 * Returns the Context Actual Invalidation Granularity.
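 * Note that a domain-selective request is deliberately handled as a
 * global one below and VTD_CCMD_GLOBAL_INVL_A is reported back; the
 * architecture allows the actual invalidation granularity to be
 * coarser than the one requested.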
1506 * @val: the content of the CCMD_REG 1507 */ 1508 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1509 { 1510 uint64_t caig; 1511 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1512 1513 switch (type) { 1514 case VTD_CCMD_DOMAIN_INVL: 1515 /* Fall through */ 1516 case VTD_CCMD_GLOBAL_INVL: 1517 caig = VTD_CCMD_GLOBAL_INVL_A; 1518 vtd_context_global_invalidate(s); 1519 break; 1520 1521 case VTD_CCMD_DEVICE_INVL: 1522 caig = VTD_CCMD_DEVICE_INVL_A; 1523 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1524 break; 1525 1526 default: 1527 error_report_once("%s: invalid context: 0x%" PRIx64, 1528 __func__, val); 1529 caig = 0; 1530 } 1531 return caig; 1532 } 1533 1534 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1535 { 1536 trace_vtd_inv_desc_iotlb_global(); 1537 vtd_reset_iotlb(s); 1538 vtd_iommu_replay_all(s); 1539 } 1540 1541 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1542 { 1543 VTDContextEntry ce; 1544 VTDAddressSpace *vtd_as; 1545 1546 trace_vtd_inv_desc_iotlb_domain(domain_id); 1547 1548 vtd_iommu_lock(s); 1549 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, 1550 &domain_id); 1551 vtd_iommu_unlock(s); 1552 1553 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1554 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1555 vtd_as->devfn, &ce) && 1556 domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { 1557 vtd_sync_shadow_page_table(vtd_as); 1558 } 1559 } 1560 } 1561 1562 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1563 uint16_t domain_id, hwaddr addr, 1564 uint8_t am) 1565 { 1566 VTDAddressSpace *vtd_as; 1567 VTDContextEntry ce; 1568 int ret; 1569 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1570 1571 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1572 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1573 vtd_as->devfn, &ce); 1574 if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { 1575 if (vtd_as_has_map_notifier(vtd_as)) { 1576 /* 1577 * As long as we have MAP notifications registered in 1578 * any of our IOMMU notifiers, we need to sync the 1579 * shadow page table. 1580 */ 1581 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 1582 } else { 1583 /* 1584 * For UNMAP-only notifiers, we don't need to walk the 1585 * page tables. We just deliver the PSI down to 1586 * invalidate caches. 1587 */ 1588 IOMMUTLBEntry entry = { 1589 .target_as = &address_space_memory, 1590 .iova = addr, 1591 .translated_addr = 0, 1592 .addr_mask = size - 1, 1593 .perm = IOMMU_NONE, 1594 }; 1595 memory_region_notify_iommu(&vtd_as->iommu, 0, entry); 1596 } 1597 } 1598 } 1599 } 1600 1601 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 1602 hwaddr addr, uint8_t am) 1603 { 1604 VTDIOTLBPageInvInfo info; 1605 1606 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 1607 1608 assert(am <= VTD_MAMV); 1609 info.domain_id = domain_id; 1610 info.addr = addr; 1611 info.mask = ~((1 << am) - 1); 1612 vtd_iommu_lock(s); 1613 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 1614 vtd_iommu_unlock(s); 1615 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 1616 } 1617 1618 /* Flush IOTLB 1619 * Returns the IOTLB Actual Invalidation Granularity. 
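 * For page-selective requests the target comes from DMAR_IVA_REG:
 * e.g. an address mask (AM) of 1 invalidates a naturally aligned
 * 2-page (8K) region starting at the encoded address.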
1620 * @val: the content of the IOTLB_REG 1621 */ 1622 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 1623 { 1624 uint64_t iaig; 1625 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 1626 uint16_t domain_id; 1627 hwaddr addr; 1628 uint8_t am; 1629 1630 switch (type) { 1631 case VTD_TLB_GLOBAL_FLUSH: 1632 iaig = VTD_TLB_GLOBAL_FLUSH_A; 1633 vtd_iotlb_global_invalidate(s); 1634 break; 1635 1636 case VTD_TLB_DSI_FLUSH: 1637 domain_id = VTD_TLB_DID(val); 1638 iaig = VTD_TLB_DSI_FLUSH_A; 1639 vtd_iotlb_domain_invalidate(s, domain_id); 1640 break; 1641 1642 case VTD_TLB_PSI_FLUSH: 1643 domain_id = VTD_TLB_DID(val); 1644 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 1645 am = VTD_IVA_AM(addr); 1646 addr = VTD_IVA_ADDR(addr); 1647 if (am > VTD_MAMV) { 1648 error_report_once("%s: address mask overflow: 0x%" PRIx64, 1649 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 1650 iaig = 0; 1651 break; 1652 } 1653 iaig = VTD_TLB_PSI_FLUSH_A; 1654 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 1655 break; 1656 1657 default: 1658 error_report_once("%s: invalid granularity: 0x%" PRIx64, 1659 __func__, val); 1660 iaig = 0; 1661 } 1662 return iaig; 1663 } 1664 1665 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 1666 1667 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 1668 { 1669 return s->qi_enabled && (s->iq_tail == s->iq_head) && 1670 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 1671 } 1672 1673 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 1674 { 1675 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 1676 1677 trace_vtd_inv_qi_enable(en); 1678 1679 if (en) { 1680 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 1681 /* 2^(x+8) entries */ 1682 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8); 1683 s->qi_enabled = true; 1684 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 1685 /* Ok - report back to driver */ 1686 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 1687 1688 if (s->iq_tail != 0) { 1689 /* 1690 * This is a spec violation but Windows guests are known to set up 1691 * Queued Invalidation this way so we allow the write and process 1692 * Invalidation Descriptors right away. 
1693 */ 1694 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 1695 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 1696 vtd_fetch_inv_desc(s); 1697 } 1698 } 1699 } else { 1700 if (vtd_queued_inv_disable_check(s)) { 1701 /* disable Queued Invalidation */ 1702 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 1703 s->iq_head = 0; 1704 s->qi_enabled = false; 1705 /* Ok - report back to driver */ 1706 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 1707 } else { 1708 error_report_once("%s: detected improper state when disable QI " 1709 "(head=0x%x, tail=0x%x, last_type=%d)", 1710 __func__, 1711 s->iq_head, s->iq_tail, s->iq_last_desc_type); 1712 } 1713 } 1714 } 1715 1716 /* Set Root Table Pointer */ 1717 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 1718 { 1719 vtd_root_table_setup(s); 1720 /* Ok - report back to driver */ 1721 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 1722 } 1723 1724 /* Set Interrupt Remap Table Pointer */ 1725 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 1726 { 1727 vtd_interrupt_remap_table_setup(s); 1728 /* Ok - report back to driver */ 1729 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 1730 } 1731 1732 /* Handle Translation Enable/Disable */ 1733 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 1734 { 1735 if (s->dmar_enabled == en) { 1736 return; 1737 } 1738 1739 trace_vtd_dmar_enable(en); 1740 1741 if (en) { 1742 s->dmar_enabled = true; 1743 /* Ok - report back to driver */ 1744 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 1745 } else { 1746 s->dmar_enabled = false; 1747 1748 /* Clear the index of Fault Recording Register */ 1749 s->next_frcd_reg = 0; 1750 /* Ok - report back to driver */ 1751 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 1752 } 1753 1754 vtd_switch_address_space_all(s); 1755 } 1756 1757 /* Handle Interrupt Remap Enable/Disable */ 1758 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 1759 { 1760 trace_vtd_ir_enable(en); 1761 1762 if (en) { 1763 s->intr_enabled = true; 1764 /* Ok - report back to driver */ 1765 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 1766 } else { 1767 s->intr_enabled = false; 1768 /* Ok - report back to driver */ 1769 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 1770 } 1771 } 1772 1773 /* Handle write to Global Command Register */ 1774 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 1775 { 1776 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 1777 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 1778 uint32_t changed = status ^ val; 1779 1780 trace_vtd_reg_write_gcmd(status, val); 1781 if (changed & VTD_GCMD_TE) { 1782 /* Translation enable/disable */ 1783 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 1784 } 1785 if (val & VTD_GCMD_SRTP) { 1786 /* Set/update the root-table pointer */ 1787 vtd_handle_gcmd_srtp(s); 1788 } 1789 if (changed & VTD_GCMD_QIE) { 1790 /* Queued Invalidation Enable */ 1791 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 1792 } 1793 if (val & VTD_GCMD_SIRTP) { 1794 /* Set/update the interrupt remapping root-table pointer */ 1795 vtd_handle_gcmd_sirtp(s); 1796 } 1797 if (changed & VTD_GCMD_IRE) { 1798 /* Interrupt remap enable/disable */ 1799 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 1800 } 1801 } 1802 1803 /* Handle write to Context Command Register */ 1804 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 1805 { 1806 uint64_t ret; 1807 uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); 1808 1809 /* Context-cache invalidation request */ 1810 if (val & 
VTD_CCMD_ICC) { 1811 if (s->qi_enabled) { 1812 error_report_once("Queued Invalidation enabled, " 1813 "should not use register-based invalidation"); 1814 return; 1815 } 1816 ret = vtd_context_cache_invalidate(s, val); 1817 /* Invalidation completed. Change something to show */ 1818 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 1819 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 1820 ret); 1821 } 1822 } 1823 1824 /* Handle write to IOTLB Invalidation Register */ 1825 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 1826 { 1827 uint64_t ret; 1828 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 1829 1830 /* IOTLB invalidation request */ 1831 if (val & VTD_TLB_IVT) { 1832 if (s->qi_enabled) { 1833 error_report_once("Queued Invalidation enabled, " 1834 "should not use register-based invalidation"); 1835 return; 1836 } 1837 ret = vtd_iotlb_flush(s, val); 1838 /* Invalidation completed. Change something to show */ 1839 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 1840 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 1841 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 1842 } 1843 } 1844 1845 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 1846 static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset, 1847 VTDInvDesc *inv_desc) 1848 { 1849 dma_addr_t addr = base_addr + offset * sizeof(*inv_desc); 1850 if (dma_memory_read(&address_space_memory, addr, inv_desc, 1851 sizeof(*inv_desc))) { 1852 error_report_once("Read INV DESC failed"); 1853 inv_desc->lo = 0; 1854 inv_desc->hi = 0; 1855 return false; 1856 } 1857 inv_desc->lo = le64_to_cpu(inv_desc->lo); 1858 inv_desc->hi = le64_to_cpu(inv_desc->hi); 1859 return true; 1860 } 1861 1862 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 1863 { 1864 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 1865 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 1866 trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); 1867 return false; 1868 } 1869 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 1870 /* Status Write */ 1871 uint32_t status_data = (uint32_t)(inv_desc->lo >> 1872 VTD_INV_DESC_WAIT_DATA_SHIFT); 1873 1874 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 1875 1876 /* FIXME: need to be masked with HAW? 
*/ 1877 dma_addr_t status_addr = inv_desc->hi; 1878 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 1879 status_data = cpu_to_le32(status_data); 1880 if (dma_memory_write(&address_space_memory, status_addr, &status_data, 1881 sizeof(status_data))) { 1882 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 1883 return false; 1884 } 1885 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 1886 /* Interrupt flag */ 1887 vtd_generate_completion_event(s); 1888 } else { 1889 trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo); 1890 return false; 1891 } 1892 return true; 1893 } 1894 1895 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 1896 VTDInvDesc *inv_desc) 1897 { 1898 uint16_t sid, fmask; 1899 1900 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 1901 trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); 1902 return false; 1903 } 1904 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 1905 case VTD_INV_DESC_CC_DOMAIN: 1906 trace_vtd_inv_desc_cc_domain( 1907 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 1908 /* Fall through */ 1909 case VTD_INV_DESC_CC_GLOBAL: 1910 vtd_context_global_invalidate(s); 1911 break; 1912 1913 case VTD_INV_DESC_CC_DEVICE: 1914 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 1915 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 1916 vtd_context_device_invalidate(s, sid, fmask); 1917 break; 1918 1919 default: 1920 trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo); 1921 return false; 1922 } 1923 return true; 1924 } 1925 1926 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 1927 { 1928 uint16_t domain_id; 1929 uint8_t am; 1930 hwaddr addr; 1931 1932 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 1933 (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 1934 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1935 return false; 1936 } 1937 1938 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 1939 case VTD_INV_DESC_IOTLB_GLOBAL: 1940 vtd_iotlb_global_invalidate(s); 1941 break; 1942 1943 case VTD_INV_DESC_IOTLB_DOMAIN: 1944 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 1945 vtd_iotlb_domain_invalidate(s, domain_id); 1946 break; 1947 1948 case VTD_INV_DESC_IOTLB_PAGE: 1949 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 1950 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 1951 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 1952 if (am > VTD_MAMV) { 1953 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1954 return false; 1955 } 1956 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 1957 break; 1958 1959 default: 1960 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 1961 return false; 1962 } 1963 return true; 1964 } 1965 1966 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 1967 VTDInvDesc *inv_desc) 1968 { 1969 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 1970 inv_desc->iec.index, 1971 inv_desc->iec.index_mask); 1972 1973 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 1974 inv_desc->iec.index, 1975 inv_desc->iec.index_mask); 1976 return true; 1977 } 1978 1979 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 1980 VTDInvDesc *inv_desc) 1981 { 1982 VTDAddressSpace *vtd_dev_as; 1983 IOMMUTLBEntry entry; 1984 struct VTDBus *vtd_bus; 1985 hwaddr addr; 1986 uint64_t sz; 1987 uint16_t sid; 1988 uint8_t devfn; 1989 bool size; 1990 uint8_t bus_num; 1991 1992 addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 1993 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 1994 devfn = sid & 0xff; 1995 bus_num = sid >> 8; 1996 size = 
VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 1997 1998 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 1999 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2000 trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); 2001 return false; 2002 } 2003 2004 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2005 if (!vtd_bus) { 2006 goto done; 2007 } 2008 2009 vtd_dev_as = vtd_bus->dev_as[devfn]; 2010 if (!vtd_dev_as) { 2011 goto done; 2012 } 2013 2014 /* According to ATS spec table 2.4: 2015 * S = 0, bits 15:12 = xxxx range size: 4K 2016 * S = 1, bits 15:12 = xxx0 range size: 8K 2017 * S = 1, bits 15:12 = xx01 range size: 16K 2018 * S = 1, bits 15:12 = x011 range size: 32K 2019 * S = 1, bits 15:12 = 0111 range size: 64K 2020 * ... 2021 */ 2022 if (size) { 2023 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2024 addr &= ~(sz - 1); 2025 } else { 2026 sz = VTD_PAGE_SIZE; 2027 } 2028 2029 entry.target_as = &vtd_dev_as->as; 2030 entry.addr_mask = sz - 1; 2031 entry.iova = addr; 2032 entry.perm = IOMMU_NONE; 2033 entry.translated_addr = 0; 2034 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); 2035 2036 done: 2037 return true; 2038 } 2039 2040 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2041 { 2042 VTDInvDesc inv_desc; 2043 uint8_t desc_type; 2044 2045 trace_vtd_inv_qi_head(s->iq_head); 2046 if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) { 2047 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2048 return false; 2049 } 2050 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2051 /* FIXME: should update at first or at last? */ 2052 s->iq_last_desc_type = desc_type; 2053 2054 switch (desc_type) { 2055 case VTD_INV_DESC_CC: 2056 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2057 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2058 return false; 2059 } 2060 break; 2061 2062 case VTD_INV_DESC_IOTLB: 2063 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2064 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2065 return false; 2066 } 2067 break; 2068 2069 case VTD_INV_DESC_WAIT: 2070 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2071 if (!vtd_process_wait_desc(s, &inv_desc)) { 2072 return false; 2073 } 2074 break; 2075 2076 case VTD_INV_DESC_IEC: 2077 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2078 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2079 return false; 2080 } 2081 break; 2082 2083 case VTD_INV_DESC_DEVICE: 2084 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2085 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2086 return false; 2087 } 2088 break; 2089 2090 default: 2091 trace_vtd_inv_desc_invalid(inv_desc.hi, inv_desc.lo); 2092 return false; 2093 } 2094 s->iq_head++; 2095 if (s->iq_head == s->iq_size) { 2096 s->iq_head = 0; 2097 } 2098 return true; 2099 } 2100 2101 /* Try to fetch and process more Invalidation Descriptors */ 2102 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2103 { 2104 trace_vtd_inv_qi_fetch(); 2105 2106 if (s->iq_tail >= s->iq_size) { 2107 /* Detects an invalid Tail pointer */ 2108 error_report_once("%s: detected invalid QI tail " 2109 "(tail=0x%x, size=0x%x)", 2110 __func__, s->iq_tail, s->iq_size); 2111 vtd_handle_inv_queue_error(s); 2112 return; 2113 } 2114 while (s->iq_head != s->iq_tail) { 2115 if (!vtd_process_inv_desc(s)) { 2116 /* Invalidation Queue Errors */ 2117 vtd_handle_inv_queue_error(s); 2118 break; 2119 } 2120 /* Must update the IQH_REG in time */ 2121 vtd_set_quad_raw(s, DMAR_IQH_REG, 2122 (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) & 2123 VTD_IQH_QH_MASK); 
2124 } 2125 } 2126 2127 /* Handle write to Invalidation Queue Tail Register */ 2128 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2129 { 2130 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2131 2132 s->iq_tail = VTD_IQT_QT(val); 2133 trace_vtd_inv_qi_tail(s->iq_tail); 2134 2135 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2136 /* Process Invalidation Queue here */ 2137 vtd_fetch_inv_desc(s); 2138 } 2139 } 2140 2141 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2142 { 2143 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2144 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2145 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2146 2147 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2148 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2149 trace_vtd_fsts_clear_ip(); 2150 } 2151 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2152 * Descriptors if there are any when Queued Invalidation is enabled? 2153 */ 2154 } 2155 2156 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2157 { 2158 uint32_t fectl_reg; 2159 /* FIXME: when software clears the IM field, check the IP field. But do we 2160 * need to compare the old value and the new value to conclude that 2161 * software clears the IM field? Or just check if the IM field is zero? 2162 */ 2163 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2164 2165 trace_vtd_reg_write_fectl(fectl_reg); 2166 2167 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2168 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2169 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2170 } 2171 } 2172 2173 static void vtd_handle_ics_write(IntelIOMMUState *s) 2174 { 2175 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2176 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2177 2178 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2179 trace_vtd_reg_ics_clear_ip(); 2180 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2181 } 2182 } 2183 2184 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2185 { 2186 uint32_t iectl_reg; 2187 /* FIXME: when software clears the IM field, check the IP field. But do we 2188 * need to compare the old value and the new value to conclude that 2189 * software clears the IM field? Or just check if the IM field is zero? 
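* For now the handler below simply requires IM to be clear while IP is set, the same policy as vtd_handle_fectl_write().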
2190 */ 2191 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2192 2193 trace_vtd_reg_write_iectl(iectl_reg); 2194 2195 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2196 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2197 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2198 } 2199 } 2200 2201 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2202 { 2203 IntelIOMMUState *s = opaque; 2204 uint64_t val; 2205 2206 trace_vtd_reg_read(addr, size); 2207 2208 if (addr + size > DMAR_REG_SIZE) { 2209 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2210 " size=0x%u", __func__, addr, size); 2211 return (uint64_t)-1; 2212 } 2213 2214 switch (addr) { 2215 /* Root Table Address Register, 64-bit */ 2216 case DMAR_RTADDR_REG: 2217 if (size == 4) { 2218 val = s->root & ((1ULL << 32) - 1); 2219 } else { 2220 val = s->root; 2221 } 2222 break; 2223 2224 case DMAR_RTADDR_REG_HI: 2225 assert(size == 4); 2226 val = s->root >> 32; 2227 break; 2228 2229 /* Invalidation Queue Address Register, 64-bit */ 2230 case DMAR_IQA_REG: 2231 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2232 if (size == 4) { 2233 val = val & ((1ULL << 32) - 1); 2234 } 2235 break; 2236 2237 case DMAR_IQA_REG_HI: 2238 assert(size == 4); 2239 val = s->iq >> 32; 2240 break; 2241 2242 default: 2243 if (size == 4) { 2244 val = vtd_get_long(s, addr); 2245 } else { 2246 val = vtd_get_quad(s, addr); 2247 } 2248 } 2249 2250 return val; 2251 } 2252 2253 static void vtd_mem_write(void *opaque, hwaddr addr, 2254 uint64_t val, unsigned size) 2255 { 2256 IntelIOMMUState *s = opaque; 2257 2258 trace_vtd_reg_write(addr, size, val); 2259 2260 if (addr + size > DMAR_REG_SIZE) { 2261 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2262 " size=0x%u", __func__, addr, size); 2263 return; 2264 } 2265 2266 switch (addr) { 2267 /* Global Command Register, 32-bit */ 2268 case DMAR_GCMD_REG: 2269 vtd_set_long(s, addr, val); 2270 vtd_handle_gcmd_write(s); 2271 break; 2272 2273 /* Context Command Register, 64-bit */ 2274 case DMAR_CCMD_REG: 2275 if (size == 4) { 2276 vtd_set_long(s, addr, val); 2277 } else { 2278 vtd_set_quad(s, addr, val); 2279 vtd_handle_ccmd_write(s); 2280 } 2281 break; 2282 2283 case DMAR_CCMD_REG_HI: 2284 assert(size == 4); 2285 vtd_set_long(s, addr, val); 2286 vtd_handle_ccmd_write(s); 2287 break; 2288 2289 /* IOTLB Invalidation Register, 64-bit */ 2290 case DMAR_IOTLB_REG: 2291 if (size == 4) { 2292 vtd_set_long(s, addr, val); 2293 } else { 2294 vtd_set_quad(s, addr, val); 2295 vtd_handle_iotlb_write(s); 2296 } 2297 break; 2298 2299 case DMAR_IOTLB_REG_HI: 2300 assert(size == 4); 2301 vtd_set_long(s, addr, val); 2302 vtd_handle_iotlb_write(s); 2303 break; 2304 2305 /* Invalidate Address Register, 64-bit */ 2306 case DMAR_IVA_REG: 2307 if (size == 4) { 2308 vtd_set_long(s, addr, val); 2309 } else { 2310 vtd_set_quad(s, addr, val); 2311 } 2312 break; 2313 2314 case DMAR_IVA_REG_HI: 2315 assert(size == 4); 2316 vtd_set_long(s, addr, val); 2317 break; 2318 2319 /* Fault Status Register, 32-bit */ 2320 case DMAR_FSTS_REG: 2321 assert(size == 4); 2322 vtd_set_long(s, addr, val); 2323 vtd_handle_fsts_write(s); 2324 break; 2325 2326 /* Fault Event Control Register, 32-bit */ 2327 case DMAR_FECTL_REG: 2328 assert(size == 4); 2329 vtd_set_long(s, addr, val); 2330 vtd_handle_fectl_write(s); 2331 break; 2332 2333 /* Fault Event Data Register, 32-bit */ 2334 case DMAR_FEDATA_REG: 2335 assert(size == 4); 2336 vtd_set_long(s, addr, val); 2337 break; 2338 2339 /* 
Fault Event Address Register, 32-bit */ 2340 case DMAR_FEADDR_REG: 2341 if (size == 4) { 2342 vtd_set_long(s, addr, val); 2343 } else { 2344 /* 2345 * While the register is 32-bit only, some guests (Xen...) write to 2346 * it with 64-bit. 2347 */ 2348 vtd_set_quad(s, addr, val); 2349 } 2350 break; 2351 2352 /* Fault Event Upper Address Register, 32-bit */ 2353 case DMAR_FEUADDR_REG: 2354 assert(size == 4); 2355 vtd_set_long(s, addr, val); 2356 break; 2357 2358 /* Protected Memory Enable Register, 32-bit */ 2359 case DMAR_PMEN_REG: 2360 assert(size == 4); 2361 vtd_set_long(s, addr, val); 2362 break; 2363 2364 /* Root Table Address Register, 64-bit */ 2365 case DMAR_RTADDR_REG: 2366 if (size == 4) { 2367 vtd_set_long(s, addr, val); 2368 } else { 2369 vtd_set_quad(s, addr, val); 2370 } 2371 break; 2372 2373 case DMAR_RTADDR_REG_HI: 2374 assert(size == 4); 2375 vtd_set_long(s, addr, val); 2376 break; 2377 2378 /* Invalidation Queue Tail Register, 64-bit */ 2379 case DMAR_IQT_REG: 2380 if (size == 4) { 2381 vtd_set_long(s, addr, val); 2382 } else { 2383 vtd_set_quad(s, addr, val); 2384 } 2385 vtd_handle_iqt_write(s); 2386 break; 2387 2388 case DMAR_IQT_REG_HI: 2389 assert(size == 4); 2390 vtd_set_long(s, addr, val); 2391 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2392 break; 2393 2394 /* Invalidation Queue Address Register, 64-bit */ 2395 case DMAR_IQA_REG: 2396 if (size == 4) { 2397 vtd_set_long(s, addr, val); 2398 } else { 2399 vtd_set_quad(s, addr, val); 2400 } 2401 break; 2402 2403 case DMAR_IQA_REG_HI: 2404 assert(size == 4); 2405 vtd_set_long(s, addr, val); 2406 break; 2407 2408 /* Invalidation Completion Status Register, 32-bit */ 2409 case DMAR_ICS_REG: 2410 assert(size == 4); 2411 vtd_set_long(s, addr, val); 2412 vtd_handle_ics_write(s); 2413 break; 2414 2415 /* Invalidation Event Control Register, 32-bit */ 2416 case DMAR_IECTL_REG: 2417 assert(size == 4); 2418 vtd_set_long(s, addr, val); 2419 vtd_handle_iectl_write(s); 2420 break; 2421 2422 /* Invalidation Event Data Register, 32-bit */ 2423 case DMAR_IEDATA_REG: 2424 assert(size == 4); 2425 vtd_set_long(s, addr, val); 2426 break; 2427 2428 /* Invalidation Event Address Register, 32-bit */ 2429 case DMAR_IEADDR_REG: 2430 assert(size == 4); 2431 vtd_set_long(s, addr, val); 2432 break; 2433 2434 /* Invalidation Event Upper Address Register, 32-bit */ 2435 case DMAR_IEUADDR_REG: 2436 assert(size == 4); 2437 vtd_set_long(s, addr, val); 2438 break; 2439 2440 /* Fault Recording Registers, 128-bit */ 2441 case DMAR_FRCD_REG_0_0: 2442 if (size == 4) { 2443 vtd_set_long(s, addr, val); 2444 } else { 2445 vtd_set_quad(s, addr, val); 2446 } 2447 break; 2448 2449 case DMAR_FRCD_REG_0_1: 2450 assert(size == 4); 2451 vtd_set_long(s, addr, val); 2452 break; 2453 2454 case DMAR_FRCD_REG_0_2: 2455 if (size == 4) { 2456 vtd_set_long(s, addr, val); 2457 } else { 2458 vtd_set_quad(s, addr, val); 2459 /* May clear bit 127 (Fault), update PPF */ 2460 vtd_update_fsts_ppf(s); 2461 } 2462 break; 2463 2464 case DMAR_FRCD_REG_0_3: 2465 assert(size == 4); 2466 vtd_set_long(s, addr, val); 2467 /* May clear bit 127 (Fault), update PPF */ 2468 vtd_update_fsts_ppf(s); 2469 break; 2470 2471 case DMAR_IRTA_REG: 2472 if (size == 4) { 2473 vtd_set_long(s, addr, val); 2474 } else { 2475 vtd_set_quad(s, addr, val); 2476 } 2477 break; 2478 2479 case DMAR_IRTA_REG_HI: 2480 assert(size == 4); 2481 vtd_set_long(s, addr, val); 2482 break; 2483 2484 default: 2485 if (size == 4) { 2486 vtd_set_long(s, addr, val); 2487 } else { 2488 vtd_set_quad(s, addr, val); 2489 } 2490 } 
2491 } 2492 2493 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2494 IOMMUAccessFlags flag, int iommu_idx) 2495 { 2496 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2497 IntelIOMMUState *s = vtd_as->iommu_state; 2498 IOMMUTLBEntry iotlb = { 2499 /* We'll fill in the rest later. */ 2500 .target_as = &address_space_memory, 2501 }; 2502 bool success; 2503 2504 if (likely(s->dmar_enabled)) { 2505 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2506 addr, flag & IOMMU_WO, &iotlb); 2507 } else { 2508 /* DMAR disabled, passthrough, use 4k-page*/ 2509 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2510 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2511 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2512 iotlb.perm = IOMMU_RW; 2513 success = true; 2514 } 2515 2516 if (likely(success)) { 2517 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 2518 VTD_PCI_SLOT(vtd_as->devfn), 2519 VTD_PCI_FUNC(vtd_as->devfn), 2520 iotlb.iova, iotlb.translated_addr, 2521 iotlb.addr_mask); 2522 } else { 2523 error_report_once("%s: detected translation failure " 2524 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 2525 __func__, pci_bus_num(vtd_as->bus), 2526 VTD_PCI_SLOT(vtd_as->devfn), 2527 VTD_PCI_FUNC(vtd_as->devfn), 2528 iotlb.iova); 2529 } 2530 2531 return iotlb; 2532 } 2533 2534 static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 2535 IOMMUNotifierFlag old, 2536 IOMMUNotifierFlag new) 2537 { 2538 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2539 IntelIOMMUState *s = vtd_as->iommu_state; 2540 2541 if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { 2542 error_report("We need to set caching-mode=1 for intel-iommu to enable " 2543 "device assignment with IOMMU protection."); 2544 exit(1); 2545 } 2546 2547 /* Update per-address-space notifier flags */ 2548 vtd_as->notifier_flags = new; 2549 2550 if (old == IOMMU_NOTIFIER_NONE) { 2551 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 2552 } else if (new == IOMMU_NOTIFIER_NONE) { 2553 QLIST_REMOVE(vtd_as, next); 2554 } 2555 } 2556 2557 static int vtd_post_load(void *opaque, int version_id) 2558 { 2559 IntelIOMMUState *iommu = opaque; 2560 2561 /* 2562 * Memory regions are dynamically turned on/off depending on 2563 * context entry configurations from the guest. After migration, 2564 * we need to make sure the memory regions are still correct. 
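* vtd_switch_address_space_all() below re-evaluates the DMAR on/off state for every device address space we have created.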
2565 */ 2566 vtd_switch_address_space_all(iommu); 2567 2568 return 0; 2569 } 2570 2571 static const VMStateDescription vtd_vmstate = { 2572 .name = "iommu-intel", 2573 .version_id = 1, 2574 .minimum_version_id = 1, 2575 .priority = MIG_PRI_IOMMU, 2576 .post_load = vtd_post_load, 2577 .fields = (VMStateField[]) { 2578 VMSTATE_UINT64(root, IntelIOMMUState), 2579 VMSTATE_UINT64(intr_root, IntelIOMMUState), 2580 VMSTATE_UINT64(iq, IntelIOMMUState), 2581 VMSTATE_UINT32(intr_size, IntelIOMMUState), 2582 VMSTATE_UINT16(iq_head, IntelIOMMUState), 2583 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 2584 VMSTATE_UINT16(iq_size, IntelIOMMUState), 2585 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 2586 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 2587 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 2588 VMSTATE_BOOL(root_extended, IntelIOMMUState), 2589 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 2590 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 2591 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 2592 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 2593 VMSTATE_END_OF_LIST() 2594 } 2595 }; 2596 2597 static const MemoryRegionOps vtd_mem_ops = { 2598 .read = vtd_mem_read, 2599 .write = vtd_mem_write, 2600 .endianness = DEVICE_LITTLE_ENDIAN, 2601 .impl = { 2602 .min_access_size = 4, 2603 .max_access_size = 8, 2604 }, 2605 .valid = { 2606 .min_access_size = 4, 2607 .max_access_size = 8, 2608 }, 2609 }; 2610 2611 static Property vtd_properties[] = { 2612 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 2613 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 2614 ON_OFF_AUTO_AUTO), 2615 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 2616 DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits, 2617 VTD_HOST_ADDRESS_WIDTH), 2618 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 2619 DEFINE_PROP_END_OF_LIST(), 2620 }; 2621 2622 /* Read IRTE entry with specific index */ 2623 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 2624 VTD_IR_TableEntry *entry, uint16_t sid) 2625 { 2626 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 2627 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 2628 dma_addr_t addr = 0x00; 2629 uint16_t mask, source_id; 2630 uint8_t bus, bus_max, bus_min; 2631 2632 addr = iommu->intr_root + index * sizeof(*entry); 2633 if (dma_memory_read(&address_space_memory, addr, entry, 2634 sizeof(*entry))) { 2635 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 2636 __func__, index, addr); 2637 return -VTD_FR_IR_ROOT_INVAL; 2638 } 2639 2640 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 2641 le64_to_cpu(entry->data[0])); 2642 2643 if (!entry->irte.present) { 2644 error_report_once("%s: detected non-present IRTE " 2645 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 2646 __func__, index, le64_to_cpu(entry->data[1]), 2647 le64_to_cpu(entry->data[0])); 2648 return -VTD_FR_IR_ENTRY_P; 2649 } 2650 2651 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 2652 entry->irte.__reserved_2) { 2653 error_report_once("%s: detected non-zero reserved IRTE " 2654 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 2655 __func__, index, le64_to_cpu(entry->data[1]), 2656 le64_to_cpu(entry->data[0])); 2657 return -VTD_FR_IR_IRTE_RSVD; 2658 } 2659 2660 if (sid != X86_IOMMU_SID_INVALID) { 2661 /* Validate IRTE SID */ 2662 source_id = le32_to_cpu(entry->irte.source_id); 2663 switch (entry->irte.sid_vtype) { 2664 case VTD_SVT_NONE: 2665 break; 2666 2667 case VTD_SVT_ALL: 2668 mask = vtd_svt_mask[entry->irte.sid_q]; 2669 
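/* With SVT_ALL, the SQ field selects which bits of the function number are ignored by the comparison below (hence the vtd_svt_mask[] table above): 0xffff compares the full SID, while 0xfff8 masks out the whole 3-bit function number, so for example all eight functions of the programmed device would match. */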
if ((source_id & mask) != (sid & mask)) { 2670 error_report_once("%s: invalid IRTE SID " 2671 "(index=%u, sid=%u, source_id=%u)", 2672 __func__, index, sid, source_id); 2673 return -VTD_FR_IR_SID_ERR; 2674 } 2675 break; 2676 2677 case VTD_SVT_BUS: 2678 bus_max = source_id >> 8; 2679 bus_min = source_id & 0xff; 2680 bus = sid >> 8; 2681 if (bus > bus_max || bus < bus_min) { 2682 error_report_once("%s: invalid SVT_BUS " 2683 "(index=%u, bus=%u, min=%u, max=%u)", 2684 __func__, index, bus, bus_min, bus_max); 2685 return -VTD_FR_IR_SID_ERR; 2686 } 2687 break; 2688 2689 default: 2690 error_report_once("%s: detected invalid IRTE SVT " 2691 "(index=%u, type=%d)", __func__, 2692 index, entry->irte.sid_vtype); 2693 /* Take this as verification failure. */ 2694 return -VTD_FR_IR_SID_ERR; 2695 break; 2696 } 2697 } 2698 2699 return 0; 2700 } 2701 2702 /* Fetch IRQ information of specific IR index */ 2703 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 2704 VTDIrq *irq, uint16_t sid) 2705 { 2706 VTD_IR_TableEntry irte = {}; 2707 int ret = 0; 2708 2709 ret = vtd_irte_get(iommu, index, &irte, sid); 2710 if (ret) { 2711 return ret; 2712 } 2713 2714 irq->trigger_mode = irte.irte.trigger_mode; 2715 irq->vector = irte.irte.vector; 2716 irq->delivery_mode = irte.irte.delivery_mode; 2717 irq->dest = le32_to_cpu(irte.irte.dest_id); 2718 if (!iommu->intr_eime) { 2719 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 2720 #define VTD_IR_APIC_DEST_SHIFT (8) 2721 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 2722 VTD_IR_APIC_DEST_SHIFT; 2723 } 2724 irq->dest_mode = irte.irte.dest_mode; 2725 irq->redir_hint = irte.irte.redir_hint; 2726 2727 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 2728 irq->delivery_mode, irq->dest, irq->dest_mode); 2729 2730 return 0; 2731 } 2732 2733 /* Generate one MSI message from VTDIrq info */ 2734 static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out) 2735 { 2736 VTD_MSIMessage msg = {}; 2737 2738 /* Generate address bits */ 2739 msg.dest_mode = irq->dest_mode; 2740 msg.redir_hint = irq->redir_hint; 2741 msg.dest = irq->dest; 2742 msg.__addr_hi = irq->dest & 0xffffff00; 2743 msg.__addr_head = cpu_to_le32(0xfee); 2744 /* Keep this from original MSI address bits */ 2745 msg.__not_used = irq->msi_addr_last_bits; 2746 2747 /* Generate data bits */ 2748 msg.vector = irq->vector; 2749 msg.delivery_mode = irq->delivery_mode; 2750 msg.level = 1; 2751 msg.trigger_mode = irq->trigger_mode; 2752 2753 msg_out->address = msg.msi_addr; 2754 msg_out->data = msg.msi_data; 2755 } 2756 2757 /* Interrupt remapping for MSI/MSI-X entry */ 2758 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 2759 MSIMessage *origin, 2760 MSIMessage *translated, 2761 uint16_t sid) 2762 { 2763 int ret = 0; 2764 VTD_IR_MSIAddress addr; 2765 uint16_t index; 2766 VTDIrq irq = {}; 2767 2768 assert(origin && translated); 2769 2770 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 2771 2772 if (!iommu || !iommu->intr_enabled) { 2773 memcpy(translated, origin, sizeof(*origin)); 2774 goto out; 2775 } 2776 2777 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 2778 error_report_once("%s: MSI address high 32 bits non-zero detected: " 2779 "address=0x%" PRIx64, __func__, origin->address); 2780 return -VTD_FR_IR_REQ_RSVD; 2781 } 2782 2783 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 2784 if (addr.addr.__head != 0xfee) { 2785 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 2786 __func__, addr.data); 2787 return -VTD_FR_IR_REQ_RSVD; 2788 } 2789 2790 /* 
This is compatible mode. */ 2791 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 2792 memcpy(translated, origin, sizeof(*origin)); 2793 goto out; 2794 } 2795 2796 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 2797 2798 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 2799 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 2800 2801 if (addr.addr.sub_valid) { 2802 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 2803 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 2804 } 2805 2806 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 2807 if (ret) { 2808 return ret; 2809 } 2810 2811 if (addr.addr.sub_valid) { 2812 trace_vtd_ir_remap_type("MSI"); 2813 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 2814 error_report_once("%s: invalid IR MSI " 2815 "(sid=%u, address=0x%" PRIx64 2816 ", data=0x%" PRIx32 ")", 2817 __func__, sid, origin->address, origin->data); 2818 return -VTD_FR_IR_REQ_RSVD; 2819 } 2820 } else { 2821 uint8_t vector = origin->data & 0xff; 2822 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 2823 2824 trace_vtd_ir_remap_type("IOAPIC"); 2825 /* IOAPIC entry vector should be aligned with IRTE vector 2826 * (see vt-d spec 5.1.5.1). */ 2827 if (vector != irq.vector) { 2828 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 2829 } 2830 2831 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 2832 * (see vt-d spec 5.1.5.1). */ 2833 if (trigger_mode != irq.trigger_mode) { 2834 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 2835 irq.trigger_mode); 2836 } 2837 } 2838 2839 /* 2840 * We'd better keep the last two bits, assuming that guest OS 2841 * might modify it. Keep it does not hurt after all. 2842 */ 2843 irq.msi_addr_last_bits = addr.addr.__not_care; 2844 2845 /* Translate VTDIrq to MSI message */ 2846 vtd_generate_msi_message(&irq, translated); 2847 2848 out: 2849 trace_vtd_ir_remap_msi(origin->address, origin->data, 2850 translated->address, translated->data); 2851 return 0; 2852 } 2853 2854 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 2855 MSIMessage *dst, uint16_t sid) 2856 { 2857 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 2858 src, dst, sid); 2859 } 2860 2861 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 2862 uint64_t *data, unsigned size, 2863 MemTxAttrs attrs) 2864 { 2865 return MEMTX_OK; 2866 } 2867 2868 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 2869 uint64_t value, unsigned size, 2870 MemTxAttrs attrs) 2871 { 2872 int ret = 0; 2873 MSIMessage from = {}, to = {}; 2874 uint16_t sid = X86_IOMMU_SID_INVALID; 2875 2876 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 2877 from.data = (uint32_t) value; 2878 2879 if (!attrs.unspecified) { 2880 /* We have explicit Source ID */ 2881 sid = attrs.requester_id; 2882 } 2883 2884 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 2885 if (ret) { 2886 /* TODO: report error */ 2887 /* Drop this interrupt */ 2888 return MEMTX_ERROR; 2889 } 2890 2891 apic_get_class()->send_msi(&to); 2892 2893 return MEMTX_OK; 2894 } 2895 2896 static const MemoryRegionOps vtd_mem_ir_ops = { 2897 .read_with_attrs = vtd_mem_ir_read, 2898 .write_with_attrs = vtd_mem_ir_write, 2899 .endianness = DEVICE_LITTLE_ENDIAN, 2900 .impl = { 2901 .min_access_size = 4, 2902 .max_access_size = 4, 2903 }, 2904 .valid = { 2905 .min_access_size = 4, 2906 .max_access_size = 4, 2907 }, 2908 }; 2909 2910 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 2911 { 2912 uintptr_t key = (uintptr_t)bus; 2913 
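/* Per-bus bookkeeping lives in a hash table keyed by the PCIBus pointer value; the VTDBus entry, and further down the per-devfn address space, are created lazily the first time they are looked up. */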
VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); 2914 VTDAddressSpace *vtd_dev_as; 2915 char name[128]; 2916 2917 if (!vtd_bus) { 2918 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 2919 *new_key = (uintptr_t)bus; 2920 /* No corresponding free() */ 2921 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 2922 PCI_DEVFN_MAX); 2923 vtd_bus->bus = bus; 2924 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 2925 } 2926 2927 vtd_dev_as = vtd_bus->dev_as[devfn]; 2928 2929 if (!vtd_dev_as) { 2930 snprintf(name, sizeof(name), "intel_iommu_devfn_%d", devfn); 2931 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 2932 2933 vtd_dev_as->bus = bus; 2934 vtd_dev_as->devfn = (uint8_t)devfn; 2935 vtd_dev_as->iommu_state = s; 2936 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 2937 vtd_dev_as->iova_tree = iova_tree_new(); 2938 2939 /* 2940 * The memory region relationships look like this (address ranges show 2941 * only the lower 32 bits for brevity): 2942 * 2943 * |-----------------+-------------------+----------| 2944 * | Name | Address range | Priority | 2945 * |-----------------+-------------------+----------| 2946 * | vtd_root | 00000000-ffffffff | 0 | 2947 * | intel_iommu | 00000000-ffffffff | 1 | 2948 * | vtd_sys_alias | 00000000-ffffffff | 1 | 2949 * | intel_iommu_ir | fee00000-feefffff | 64 | 2950 * |-----------------+-------------------+----------| 2951 * 2952 * We enable/disable DMAR by switching enablement of the 2953 * vtd_sys_alias and intel_iommu regions. The IR region is always 2954 * enabled. 2955 */ 2956 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 2957 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 2958 "intel_iommu_dmar", 2959 UINT64_MAX); 2960 memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s), 2961 "vtd_sys_alias", get_system_memory(), 2962 0, memory_region_size(get_system_memory())); 2963 memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s), 2964 &vtd_mem_ir_ops, s, "intel_iommu_ir", 2965 VTD_INTERRUPT_ADDR_SIZE); 2966 memory_region_init(&vtd_dev_as->root, OBJECT(s), 2967 "vtd_root", UINT64_MAX); 2968 memory_region_add_subregion_overlap(&vtd_dev_as->root, 2969 VTD_INTERRUPT_ADDR_FIRST, 2970 &vtd_dev_as->iommu_ir, 64); 2971 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name); 2972 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 2973 &vtd_dev_as->sys_alias, 1); 2974 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 2975 MEMORY_REGION(&vtd_dev_as->iommu), 2976 1); 2977 vtd_switch_address_space(vtd_dev_as); 2978 } 2979 return vtd_dev_as; 2980 } 2981 2982 /* Unmap the whole range in the notifier's scope. */ 2983 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 2984 { 2985 IOMMUTLBEntry entry; 2986 hwaddr size; 2987 hwaddr start = n->start; 2988 hwaddr end = n->end; 2989 IntelIOMMUState *s = as->iommu_state; 2990 DMAMap map; 2991 2992 /* 2993 * Note: all the code in this function assumes that IOVA bits are no 2994 * more than VTD_MGAW bits (as restricted by the VT-d spec); otherwise 2995 * we would need to consider 64-bit overflow. 2996 */ 2997 2998 if (end > VTD_ADDRESS_SIZE(s->aw_bits)) { 2999 /* 3000 * No need to unmap regions that are bigger than the whole 3001 * VT-d supported address space size 3002 */ 3003 end = VTD_ADDRESS_SIZE(s->aw_bits); 3004 } 3005 3006 assert(start <= end); 3007 size = end - start; 3008 3009 if (ctpop64(size) != 1) { 3010 /* 3011 * This size cannot form a correct mask. 
Let's enlarge it to 3012 * the next power of two so that it can be expressed as a mask. 3013 */ 3014 int n = 64 - clz64(size); 3015 if (n > s->aw_bits) { 3016 /* should not happen, but in case it happens, limit it */ 3017 n = s->aw_bits; 3018 } 3019 size = 1ULL << n; 3020 } 3021 3022 entry.target_as = &address_space_memory; 3023 /* Adjust iova for the size */ 3024 entry.iova = n->start & ~(size - 1); 3025 /* This field is meaningless for unmap */ 3026 entry.translated_addr = 0; 3027 entry.perm = IOMMU_NONE; 3028 entry.addr_mask = size - 1; 3029 3030 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3031 VTD_PCI_SLOT(as->devfn), 3032 VTD_PCI_FUNC(as->devfn), 3033 entry.iova, size); 3034 3035 map.iova = entry.iova; 3036 map.size = entry.addr_mask; 3037 iova_tree_remove(as->iova_tree, &map); 3038 3039 memory_region_notify_one(n, &entry); 3040 } 3041 3042 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3043 { 3044 VTDAddressSpace *vtd_as; 3045 IOMMUNotifier *n; 3046 3047 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3048 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3049 vtd_address_space_unmap(vtd_as, n); 3050 } 3051 } 3052 } 3053 3054 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) 3055 { 3056 memory_region_notify_one((IOMMUNotifier *)private, entry); 3057 return 0; 3058 } 3059 3060 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3061 { 3062 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3063 IntelIOMMUState *s = vtd_as->iommu_state; 3064 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3065 VTDContextEntry ce; 3066 3067 /* 3068 * The replay can be triggered by either an invalidation or a newly 3069 * created entry. No matter what, we release existing mappings 3070 * (which means flushing caches even for UNMAP-only notifiers). 3071 */ 3072 vtd_address_space_unmap(vtd_as, n); 3073 3074 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3075 trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn), 3076 PCI_FUNC(vtd_as->devfn), 3077 VTD_CONTEXT_ENTRY_DID(ce.hi), 3078 ce.hi, ce.lo); 3079 if (vtd_as_has_map_notifier(vtd_as)) { 3080 /* This is required only for MAP typed notifiers */ 3081 vtd_page_walk_info info = { 3082 .hook_fn = vtd_replay_hook, 3083 .private = (void *)n, 3084 .notify_unmap = false, 3085 .aw = s->aw_bits, 3086 .as = vtd_as, 3087 .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), 3088 }; 3089 3090 vtd_page_walk(&ce, 0, ~0ULL, &info); 3091 } 3092 } else { 3093 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3094 PCI_FUNC(vtd_as->devfn)); 3095 } 3096 3097 return; 3098 } 3099 3100 /* Do the initialization. It will also be called on reset, so pay 3101 * attention when adding new initialization code. 
3102 */ 3103 static void vtd_init(IntelIOMMUState *s) 3104 { 3105 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3106 3107 memset(s->csr, 0, DMAR_REG_SIZE); 3108 memset(s->wmask, 0, DMAR_REG_SIZE); 3109 memset(s->w1cmask, 0, DMAR_REG_SIZE); 3110 memset(s->womask, 0, DMAR_REG_SIZE); 3111 3112 s->root = 0; 3113 s->root_extended = false; 3114 s->dmar_enabled = false; 3115 s->iq_head = 0; 3116 s->iq_tail = 0; 3117 s->iq = 0; 3118 s->iq_size = 0; 3119 s->qi_enabled = false; 3120 s->iq_last_desc_type = VTD_INV_DESC_NONE; 3121 s->next_frcd_reg = 0; 3122 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | 3123 VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | 3124 VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); 3125 if (s->aw_bits == VTD_HOST_AW_48BIT) { 3126 s->cap |= VTD_CAP_SAGAW_48bit; 3127 } 3128 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; 3129 3130 /* 3131 * Rsvd field masks for spte 3132 */ 3133 vtd_paging_entry_rsvd_field[0] = ~0ULL; 3134 vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits); 3135 vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); 3136 vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); 3137 vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); 3138 vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits); 3139 vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); 3140 vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); 3141 vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits); 3142 3143 if (x86_iommu->intr_supported) { 3144 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; 3145 if (s->intr_eim == ON_OFF_AUTO_ON) { 3146 s->ecap |= VTD_ECAP_EIM; 3147 } 3148 assert(s->intr_eim != ON_OFF_AUTO_AUTO); 3149 } 3150 3151 if (x86_iommu->dt_supported) { 3152 s->ecap |= VTD_ECAP_DT; 3153 } 3154 3155 if (x86_iommu->pt_supported) { 3156 s->ecap |= VTD_ECAP_PT; 3157 } 3158 3159 if (s->caching_mode) { 3160 s->cap |= VTD_CAP_CM; 3161 } 3162 3163 vtd_iommu_lock(s); 3164 vtd_reset_context_cache_locked(s); 3165 vtd_reset_iotlb_locked(s); 3166 vtd_iommu_unlock(s); 3167 3168 /* Define registers with default values and bit semantics */ 3169 vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); 3170 vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); 3171 vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); 3172 vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); 3173 vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); 3174 vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); 3175 vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0); 3176 vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); 3177 vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); 3178 3179 /* Advanced Fault Logging not supported */ 3180 vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); 3181 vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3182 vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); 3183 vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); 3184 3185 /* Treated as RsvdZ when EIM in ECAP_REG is not supported 3186 * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); 3187 */ 3188 vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); 3189 3190 /* Treated as RO for implementations that PLMR and PHMR fields reported 3191 * as Clear in the CAP_REG. 
3192 * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); 3193 */ 3194 vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); 3195 3196 vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); 3197 vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); 3198 vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0); 3199 vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); 3200 vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3201 vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); 3202 vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); 3203 /* Treated as RsvdZ when EIM in ECAP_REG is not supported */ 3204 vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); 3205 3206 /* IOTLB registers */ 3207 vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0); 3208 vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); 3209 vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); 3210 3211 /* Fault Recording Registers, 128-bit */ 3212 vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); 3213 vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); 3214 3215 /* 3216 * Interrupt remapping registers. 3217 */ 3218 vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); 3219 } 3220 3221 /* Should not reset address_spaces on reset, because devices will still use 3222 * the address space they got at first (they won't ask the bus again). 3223 */ 3224 static void vtd_reset(DeviceState *dev) 3225 { 3226 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3227 3228 vtd_init(s); 3229 3230 /* 3231 * On device reset, throw away all mappings and external caches. 3232 */ 3233 vtd_address_space_unmap_all(s); 3234 } 3235 3236 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3237 { 3238 IntelIOMMUState *s = opaque; 3239 VTDAddressSpace *vtd_as; 3240 3241 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3242 3243 vtd_as = vtd_find_add_as(s, bus, devfn); 3244 return &vtd_as->as; 3245 } 3246 3247 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3248 { 3249 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3250 3251 /* Currently Intel IOMMU IR only supports "kernel-irqchip={off|split}" */ 3252 if (x86_iommu->intr_supported && kvm_irqchip_in_kernel() && 3253 !kvm_irqchip_is_split()) { 3254 error_setg(errp, "Intel Interrupt Remapping cannot work with " 3255 "kernel-irqchip=on, please use 'split|off'."); 3256 return false; 3257 } 3258 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu->intr_supported) { 3259 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3260 return false; 3261 } 3262 3263 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3264 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3265 && x86_iommu->intr_supported ? 
3266 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3267 } 3268 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3269 if (!kvm_irqchip_in_kernel()) { 3270 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3271 return false; 3272 } 3273 if (!kvm_enable_x2apic()) { 3274 error_setg(errp, "eim=on requires support on the KVM side " 3275 "(X2APIC_API, first shipped in v4.7)"); 3276 return false; 3277 } 3278 } 3279 3280 /* Currently the only supported address widths are 39 and 48 bits */ 3281 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3282 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3283 error_setg(errp, "Supported values for x-aw-bits are: %d, %d", 3284 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3285 return false; 3286 } 3287 3288 return true; 3289 } 3290 3291 static void vtd_realize(DeviceState *dev, Error **errp) 3292 { 3293 MachineState *ms = MACHINE(qdev_get_machine()); 3294 PCMachineState *pcms = PC_MACHINE(ms); 3295 PCIBus *bus = pcms->bus; 3296 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3297 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); 3298 3299 x86_iommu->type = TYPE_INTEL; 3300 3301 if (!vtd_decide_config(s, errp)) { 3302 return; 3303 } 3304 3305 QLIST_INIT(&s->vtd_as_with_notifiers); 3306 qemu_mutex_init(&s->iommu_lock); 3307 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3308 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3309 "intel_iommu", DMAR_REG_SIZE); 3310 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3311 /* No corresponding destroy */ 3312 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3313 g_free, g_free); 3314 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3315 g_free, g_free); 3316 vtd_init(s); 3317 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3318 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3319 /* Pseudo address space under root PCI bus. */ 3320 pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3321 } 3322 3323 static void vtd_class_init(ObjectClass *klass, void *data) 3324 { 3325 DeviceClass *dc = DEVICE_CLASS(klass); 3326 X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass); 3327 3328 dc->reset = vtd_reset; 3329 dc->vmsd = &vtd_vmstate; 3330 dc->props = vtd_properties; 3331 dc->hotpluggable = false; 3332 x86_class->realize = vtd_realize; 3333 x86_class->int_remap = vtd_int_remap; 3334 /* Supported by the pc-q35-* machine types */ 3335 dc->user_creatable = true; 3336 } 3337 3338 static const TypeInfo vtd_info = { 3339 .name = TYPE_INTEL_IOMMU_DEVICE, 3340 .parent = TYPE_X86_IOMMU_DEVICE, 3341 .instance_size = sizeof(IntelIOMMUState), 3342 .class_init = vtd_class_init, 3343 }; 3344 3345 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3346 void *data) 3347 { 3348 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3349 3350 imrc->translate = vtd_iommu_translate; 3351 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3352 imrc->replay = vtd_iommu_replay; 3353 } 3354 3355 static const TypeInfo vtd_iommu_memory_region_info = { 3356 .parent = TYPE_IOMMU_MEMORY_REGION, 3357 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3358 .class_init = vtd_iommu_memory_region_class_init, 3359 }; 3360 3361 static void vtd_register_types(void) 3362 { 3363 type_register_static(&vtd_info); 3364 type_register_static(&vtd_iommu_memory_region_info); 3365 } 3366 3367 type_init(vtd_register_types) 3368
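/*
 * Typical command-line usage, as an illustrative sketch only (the exact
 * option spelling is not defined in this file): the device is meant for
 * the q35 machine types, interrupt remapping refuses kernel-irqchip=on
 * (see vtd_decide_config()), and caching-mode is required for MAP
 * notifiers, i.e. device assignment (see vtd_iommu_notify_flag_changed()):
 *
 *   qemu-system-x86_64 -machine q35,accel=kvm,kernel-irqchip=split \
 *       -device intel-iommu,intremap=on,caching-mode=on ...
 *
 * "caching-mode" and "x-aw-bits" come from vtd_properties[] above, while
 * "intremap" is provided by the x86-iommu base class.
 */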