1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qemu/main-loop.h" 25 #include "qapi/error.h" 26 #include "hw/sysbus.h" 27 #include "intel_iommu_internal.h" 28 #include "hw/pci/pci.h" 29 #include "hw/pci/pci_bus.h" 30 #include "hw/qdev-properties.h" 31 #include "hw/i386/pc.h" 32 #include "hw/i386/apic-msidef.h" 33 #include "hw/i386/x86-iommu.h" 34 #include "hw/pci-host/q35.h" 35 #include "sysemu/kvm.h" 36 #include "sysemu/dma.h" 37 #include "sysemu/sysemu.h" 38 #include "hw/i386/apic_internal.h" 39 #include "kvm/kvm_i386.h" 40 #include "migration/vmstate.h" 41 #include "trace.h" 42 43 /* context entry operations */ 44 #define VTD_CE_GET_RID2PASID(ce) \ 45 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) 46 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ 47 ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) 48 49 /* pe operations */ 50 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) 51 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) 52 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ 53 if (ret_fr) { \ 54 
ret_fr = -ret_fr; \ 55 if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ 56 trace_vtd_fault_disabled(); \ 57 } else { \ 58 vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ 59 } \ 60 goto error; \ 61 } \ 62 } 63 64 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 65 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 66 67 static void vtd_panic_require_caching_mode(void) 68 { 69 error_report("We need to set caching-mode=on for intel-iommu to enable " 70 "device assignment with IOMMU protection."); 71 exit(1); 72 } 73 74 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 75 uint64_t wmask, uint64_t w1cmask) 76 { 77 stq_le_p(&s->csr[addr], val); 78 stq_le_p(&s->wmask[addr], wmask); 79 stq_le_p(&s->w1cmask[addr], w1cmask); 80 } 81 82 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 83 { 84 stq_le_p(&s->womask[addr], mask); 85 } 86 87 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 88 uint32_t wmask, uint32_t w1cmask) 89 { 90 stl_le_p(&s->csr[addr], val); 91 stl_le_p(&s->wmask[addr], wmask); 92 stl_le_p(&s->w1cmask[addr], w1cmask); 93 } 94 95 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 96 { 97 stl_le_p(&s->womask[addr], mask); 98 } 99 100 /* "External" get/set operations */ 101 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 102 { 103 uint64_t oldval = ldq_le_p(&s->csr[addr]); 104 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 105 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 106 stq_le_p(&s->csr[addr], 107 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 108 } 109 110 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 111 { 112 uint32_t oldval = ldl_le_p(&s->csr[addr]); 113 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 114 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 115 stl_le_p(&s->csr[addr], 116 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & 
val)); 117 } 118 119 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 120 { 121 uint64_t val = ldq_le_p(&s->csr[addr]); 122 uint64_t womask = ldq_le_p(&s->womask[addr]); 123 return val & ~womask; 124 } 125 126 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 127 { 128 uint32_t val = ldl_le_p(&s->csr[addr]); 129 uint32_t womask = ldl_le_p(&s->womask[addr]); 130 return val & ~womask; 131 } 132 133 /* "Internal" get/set operations */ 134 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 135 { 136 return ldq_le_p(&s->csr[addr]); 137 } 138 139 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 140 { 141 return ldl_le_p(&s->csr[addr]); 142 } 143 144 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 145 { 146 stq_le_p(&s->csr[addr], val); 147 } 148 149 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 150 uint32_t clear, uint32_t mask) 151 { 152 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 153 stl_le_p(&s->csr[addr], new_val); 154 return new_val; 155 } 156 157 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 158 uint64_t clear, uint64_t mask) 159 { 160 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 161 stq_le_p(&s->csr[addr], new_val); 162 return new_val; 163 } 164 165 static inline void vtd_iommu_lock(IntelIOMMUState *s) 166 { 167 qemu_mutex_lock(&s->iommu_lock); 168 } 169 170 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 171 { 172 qemu_mutex_unlock(&s->iommu_lock); 173 } 174 175 static void vtd_update_scalable_state(IntelIOMMUState *s) 176 { 177 uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 178 179 if (s->scalable_mode) { 180 s->root_scalable = val & VTD_RTADDR_SMT; 181 } 182 } 183 184 /* Whether the address space needs to notify new mappings */ 185 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 186 { 187 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 188 } 189 190 /* 
GHashTable functions */ 191 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 192 { 193 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 194 } 195 196 static guint vtd_uint64_hash(gconstpointer v) 197 { 198 return (guint)*(const uint64_t *)v; 199 } 200 201 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 202 gpointer user_data) 203 { 204 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 205 uint16_t domain_id = *(uint16_t *)user_data; 206 return entry->domain_id == domain_id; 207 } 208 209 /* The shift of an addr for a certain level of paging structure */ 210 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 211 { 212 assert(level != 0); 213 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 214 } 215 216 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 217 { 218 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 219 } 220 221 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 222 gpointer user_data) 223 { 224 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 225 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 226 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 227 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 228 return (entry->domain_id == info->domain_id) && 229 (((entry->gfn & info->mask) == gfn) || 230 (entry->gfn == gfn_tlb)); 231 } 232 233 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 234 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 
235 */ 236 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 237 { 238 VTDAddressSpace *vtd_as; 239 VTDBus *vtd_bus; 240 GHashTableIter bus_it; 241 uint32_t devfn_it; 242 243 trace_vtd_context_cache_reset(); 244 245 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 246 247 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 248 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 249 vtd_as = vtd_bus->dev_as[devfn_it]; 250 if (!vtd_as) { 251 continue; 252 } 253 vtd_as->context_cache_entry.context_cache_gen = 0; 254 } 255 } 256 s->context_cache_gen = 1; 257 } 258 259 /* Must be called with IOMMU lock held. */ 260 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 261 { 262 assert(s->iotlb); 263 g_hash_table_remove_all(s->iotlb); 264 } 265 266 static void vtd_reset_iotlb(IntelIOMMUState *s) 267 { 268 vtd_iommu_lock(s); 269 vtd_reset_iotlb_locked(s); 270 vtd_iommu_unlock(s); 271 } 272 273 static void vtd_reset_caches(IntelIOMMUState *s) 274 { 275 vtd_iommu_lock(s); 276 vtd_reset_iotlb_locked(s); 277 vtd_reset_context_cache_locked(s); 278 vtd_iommu_unlock(s); 279 } 280 281 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 282 uint32_t level) 283 { 284 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 285 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 286 } 287 288 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 289 { 290 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 291 } 292 293 /* Must be called with IOMMU lock held */ 294 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 295 hwaddr addr) 296 { 297 VTDIOTLBEntry *entry; 298 uint64_t key; 299 int level; 300 301 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 302 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 303 source_id, level); 304 entry = g_hash_table_lookup(s->iotlb, &key); 305 if (entry) { 306 goto out; 307 } 308 } 309 310 out: 311 return entry; 312 
} 313 314 /* Must be with IOMMU lock held */ 315 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 316 uint16_t domain_id, hwaddr addr, uint64_t slpte, 317 uint8_t access_flags, uint32_t level) 318 { 319 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 320 uint64_t *key = g_malloc(sizeof(*key)); 321 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 322 323 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 324 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 325 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 326 vtd_reset_iotlb_locked(s); 327 } 328 329 entry->gfn = gfn; 330 entry->domain_id = domain_id; 331 entry->slpte = slpte; 332 entry->access_flags = access_flags; 333 entry->mask = vtd_slpt_level_page_mask(level); 334 *key = vtd_get_iotlb_key(gfn, source_id, level); 335 g_hash_table_replace(s->iotlb, key, entry); 336 } 337 338 /* Given the reg addr of both the message data and address, generate an 339 * interrupt via MSI. 340 */ 341 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 342 hwaddr mesg_data_reg) 343 { 344 MSIMessage msi; 345 346 assert(mesg_data_reg < DMAR_REG_SIZE); 347 assert(mesg_addr_reg < DMAR_REG_SIZE); 348 349 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 350 msi.data = vtd_get_long_raw(s, mesg_data_reg); 351 352 trace_vtd_irq_generate(msi.address, msi.data); 353 354 apic_get_class()->send_msi(&msi); 355 } 356 357 /* Generate a fault event to software via MSI if conditions are met. 358 * Notice that the value of FSTS_REG being passed to it should be the one 359 * before any update. 
360 */ 361 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) 362 { 363 if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || 364 pre_fsts & VTD_FSTS_IQE) { 365 error_report_once("There are previous interrupt conditions " 366 "to be serviced by software, fault event " 367 "is not generated"); 368 return; 369 } 370 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); 371 if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { 372 error_report_once("Interrupt Mask set, irq is not generated"); 373 } else { 374 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 375 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 376 } 377 } 378 379 /* Check if the Fault (F) field of the Fault Recording Register referenced by 380 * @index is Set. 381 */ 382 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) 383 { 384 /* Each reg is 128-bit */ 385 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 386 addr += 8; /* Access the high 64-bit half */ 387 388 assert(index < DMAR_FRCD_REG_NR); 389 390 return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; 391 } 392 393 /* Update the PPF field of Fault Status Register. 394 * Should be called whenever change the F field of any fault recording 395 * registers. 
396 */ 397 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 398 { 399 uint32_t i; 400 uint32_t ppf_mask = 0; 401 402 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 403 if (vtd_is_frcd_set(s, i)) { 404 ppf_mask = VTD_FSTS_PPF; 405 break; 406 } 407 } 408 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 409 trace_vtd_fsts_ppf(!!ppf_mask); 410 } 411 412 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 413 { 414 /* Each reg is 128-bit */ 415 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 416 addr += 8; /* Access the high 64-bit half */ 417 418 assert(index < DMAR_FRCD_REG_NR); 419 420 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 421 vtd_update_fsts_ppf(s); 422 } 423 424 /* Must not update F field now, should be done later */ 425 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 426 uint16_t source_id, hwaddr addr, 427 VTDFaultReason fault, bool is_write) 428 { 429 uint64_t hi = 0, lo; 430 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 431 432 assert(index < DMAR_FRCD_REG_NR); 433 434 lo = VTD_FRCD_FI(addr); 435 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 436 if (!is_write) { 437 hi |= VTD_FRCD_T; 438 } 439 vtd_set_quad_raw(s, frcd_reg_addr, lo); 440 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 441 442 trace_vtd_frr_new(index, hi, lo); 443 } 444 445 /* Try to collapse multiple pending faults from the same requester */ 446 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 447 { 448 uint32_t i; 449 uint64_t frcd_reg; 450 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 451 452 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 453 frcd_reg = vtd_get_quad_raw(s, addr); 454 if ((frcd_reg & VTD_FRCD_F) && 455 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 456 return true; 457 } 458 addr += 16; /* 128-bit for each */ 459 } 460 return false; 461 } 462 463 /* Log and report an DMAR (address translation) fault to software */ 464 static void 
vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 465 hwaddr addr, VTDFaultReason fault, 466 bool is_write) 467 { 468 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 469 470 assert(fault < VTD_FR_MAX); 471 472 if (fault == VTD_FR_RESERVED_ERR) { 473 /* This is not a normal fault reason case. Drop it. */ 474 return; 475 } 476 477 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 478 479 if (fsts_reg & VTD_FSTS_PFO) { 480 error_report_once("New fault is not recorded due to " 481 "Primary Fault Overflow"); 482 return; 483 } 484 485 if (vtd_try_collapse_fault(s, source_id)) { 486 error_report_once("New fault is not recorded due to " 487 "compression of faults"); 488 return; 489 } 490 491 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 492 error_report_once("Next Fault Recording Reg is used, " 493 "new fault is not recorded, set PFO field"); 494 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 495 return; 496 } 497 498 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 499 500 if (fsts_reg & VTD_FSTS_PPF) { 501 error_report_once("There are pending faults already, " 502 "fault event is not generated"); 503 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 504 s->next_frcd_reg++; 505 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 506 s->next_frcd_reg = 0; 507 } 508 } else { 509 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 510 VTD_FSTS_FRI(s->next_frcd_reg)); 511 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 512 s->next_frcd_reg++; 513 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 514 s->next_frcd_reg = 0; 515 } 516 /* This case actually cause the PPF to be Set. 517 * So generate fault event (interrupt). 518 */ 519 vtd_generate_fault_event(s, fsts_reg); 520 } 521 } 522 523 /* Handle Invalidation Queue Errors of queued invalidation interface error 524 * conditions. 
525 */ 526 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 527 { 528 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 529 530 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 531 vtd_generate_fault_event(s, fsts_reg); 532 } 533 534 /* Set the IWC field and try to generate an invalidation completion interrupt */ 535 static void vtd_generate_completion_event(IntelIOMMUState *s) 536 { 537 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 538 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 539 return; 540 } 541 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 542 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 543 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 544 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 545 "new event not generated"); 546 return; 547 } else { 548 /* Generate the interrupt event */ 549 trace_vtd_inv_desc_wait_irq("Generating complete event"); 550 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 551 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 552 } 553 } 554 555 static inline bool vtd_root_entry_present(IntelIOMMUState *s, 556 VTDRootEntry *re, 557 uint8_t devfn) 558 { 559 if (s->root_scalable && devfn > UINT8_MAX / 2) { 560 return re->hi & VTD_ROOT_ENTRY_P; 561 } 562 563 return re->lo & VTD_ROOT_ENTRY_P; 564 } 565 566 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 567 VTDRootEntry *re) 568 { 569 dma_addr_t addr; 570 571 addr = s->root + index * sizeof(*re); 572 if (dma_memory_read(&address_space_memory, addr, 573 re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) { 574 re->lo = 0; 575 return -VTD_FR_ROOT_TABLE_INV; 576 } 577 re->lo = le64_to_cpu(re->lo); 578 re->hi = le64_to_cpu(re->hi); 579 return 0; 580 } 581 582 static inline bool vtd_ce_present(VTDContextEntry *context) 583 { 584 return context->lo & VTD_CONTEXT_ENTRY_P; 585 } 586 587 static int vtd_get_context_entry_from_root(IntelIOMMUState *s, 588 VTDRootEntry *re, 589 
uint8_t index, 590 VTDContextEntry *ce) 591 { 592 dma_addr_t addr, ce_size; 593 594 /* we have checked that root entry is present */ 595 ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE : 596 VTD_CTX_ENTRY_LEGACY_SIZE; 597 598 if (s->root_scalable && index > UINT8_MAX / 2) { 599 index = index & (~VTD_DEVFN_CHECK_MASK); 600 addr = re->hi & VTD_ROOT_ENTRY_CTP; 601 } else { 602 addr = re->lo & VTD_ROOT_ENTRY_CTP; 603 } 604 605 addr = addr + index * ce_size; 606 if (dma_memory_read(&address_space_memory, addr, 607 ce, ce_size, MEMTXATTRS_UNSPECIFIED)) { 608 return -VTD_FR_CONTEXT_TABLE_INV; 609 } 610 611 ce->lo = le64_to_cpu(ce->lo); 612 ce->hi = le64_to_cpu(ce->hi); 613 if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { 614 ce->val[2] = le64_to_cpu(ce->val[2]); 615 ce->val[3] = le64_to_cpu(ce->val[3]); 616 } 617 return 0; 618 } 619 620 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) 621 { 622 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; 623 } 624 625 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) 626 { 627 return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); 628 } 629 630 /* Whether the pte indicates the address of the page frame */ 631 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) 632 { 633 return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); 634 } 635 636 /* Get the content of a spte located in @base_addr[@index] */ 637 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) 638 { 639 uint64_t slpte; 640 641 assert(index < VTD_SL_PT_ENTRY_NR); 642 643 if (dma_memory_read(&address_space_memory, 644 base_addr + index * sizeof(slpte), 645 &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) { 646 slpte = (uint64_t)-1; 647 return slpte; 648 } 649 slpte = le64_to_cpu(slpte); 650 return slpte; 651 } 652 653 /* Given an iova and the level of paging structure, return the offset 654 * of current level. 
655 */ 656 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) 657 { 658 return (iova >> vtd_slpt_level_shift(level)) & 659 ((1ULL << VTD_SL_LEVEL_BITS) - 1); 660 } 661 662 /* Check Capability Register to see if the @level of page-table is supported */ 663 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) 664 { 665 return VTD_CAP_SAGAW_MASK & s->cap & 666 (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); 667 } 668 669 /* Return true if check passed, otherwise false */ 670 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, 671 VTDPASIDEntry *pe) 672 { 673 switch (VTD_PE_GET_TYPE(pe)) { 674 case VTD_SM_PASID_ENTRY_FLT: 675 case VTD_SM_PASID_ENTRY_SLT: 676 case VTD_SM_PASID_ENTRY_NESTED: 677 break; 678 case VTD_SM_PASID_ENTRY_PT: 679 if (!x86_iommu->pt_supported) { 680 return false; 681 } 682 break; 683 default: 684 /* Unknown type */ 685 return false; 686 } 687 return true; 688 } 689 690 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) 691 { 692 return pdire->val & 1; 693 } 694 695 /** 696 * Caller of this function should check present bit if wants 697 * to use pdir entry for further usage except for fpd bit check. 
698 */ 699 static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, 700 uint32_t pasid, 701 VTDPASIDDirEntry *pdire) 702 { 703 uint32_t index; 704 dma_addr_t addr, entry_size; 705 706 index = VTD_PASID_DIR_INDEX(pasid); 707 entry_size = VTD_PASID_DIR_ENTRY_SIZE; 708 addr = pasid_dir_base + index * entry_size; 709 if (dma_memory_read(&address_space_memory, addr, 710 pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) { 711 return -VTD_FR_PASID_TABLE_INV; 712 } 713 714 return 0; 715 } 716 717 static inline bool vtd_pe_present(VTDPASIDEntry *pe) 718 { 719 return pe->val[0] & VTD_PASID_ENTRY_P; 720 } 721 722 static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, 723 uint32_t pasid, 724 dma_addr_t addr, 725 VTDPASIDEntry *pe) 726 { 727 uint32_t index; 728 dma_addr_t entry_size; 729 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 730 731 index = VTD_PASID_TABLE_INDEX(pasid); 732 entry_size = VTD_PASID_ENTRY_SIZE; 733 addr = addr + index * entry_size; 734 if (dma_memory_read(&address_space_memory, addr, 735 pe, entry_size, MEMTXATTRS_UNSPECIFIED)) { 736 return -VTD_FR_PASID_TABLE_INV; 737 } 738 739 /* Do translation type check */ 740 if (!vtd_pe_type_check(x86_iommu, pe)) { 741 return -VTD_FR_PASID_TABLE_INV; 742 } 743 744 if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { 745 return -VTD_FR_PASID_TABLE_INV; 746 } 747 748 return 0; 749 } 750 751 /** 752 * Caller of this function should check present bit if wants 753 * to use pasid entry for further usage except for fpd bit check. 754 */ 755 static int vtd_get_pe_from_pdire(IntelIOMMUState *s, 756 uint32_t pasid, 757 VTDPASIDDirEntry *pdire, 758 VTDPASIDEntry *pe) 759 { 760 dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; 761 762 return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); 763 } 764 765 /** 766 * This function gets a pasid entry from a specified pasid 767 * table (includes dir and leaf table) with a specified pasid. 
768 * Sanity check should be done to ensure return a present 769 * pasid entry to caller. 770 */ 771 static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, 772 dma_addr_t pasid_dir_base, 773 uint32_t pasid, 774 VTDPASIDEntry *pe) 775 { 776 int ret; 777 VTDPASIDDirEntry pdire; 778 779 ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, 780 pasid, &pdire); 781 if (ret) { 782 return ret; 783 } 784 785 if (!vtd_pdire_present(&pdire)) { 786 return -VTD_FR_PASID_TABLE_INV; 787 } 788 789 ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); 790 if (ret) { 791 return ret; 792 } 793 794 if (!vtd_pe_present(pe)) { 795 return -VTD_FR_PASID_TABLE_INV; 796 } 797 798 return 0; 799 } 800 801 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, 802 VTDContextEntry *ce, 803 VTDPASIDEntry *pe) 804 { 805 uint32_t pasid; 806 dma_addr_t pasid_dir_base; 807 int ret = 0; 808 809 pasid = VTD_CE_GET_RID2PASID(ce); 810 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 811 ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); 812 813 return ret; 814 } 815 816 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, 817 VTDContextEntry *ce, 818 bool *pe_fpd_set) 819 { 820 int ret; 821 uint32_t pasid; 822 dma_addr_t pasid_dir_base; 823 VTDPASIDDirEntry pdire; 824 VTDPASIDEntry pe; 825 826 pasid = VTD_CE_GET_RID2PASID(ce); 827 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 828 829 /* 830 * No present bit check since fpd is meaningful even 831 * if the present bit is clear. 832 */ 833 ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); 834 if (ret) { 835 return ret; 836 } 837 838 if (pdire.val & VTD_PASID_DIR_FPD) { 839 *pe_fpd_set = true; 840 return 0; 841 } 842 843 if (!vtd_pdire_present(&pdire)) { 844 return -VTD_FR_PASID_TABLE_INV; 845 } 846 847 /* 848 * No present bit check since fpd is meaningful even 849 * if the present bit is clear. 
850 */ 851 ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); 852 if (ret) { 853 return ret; 854 } 855 856 if (pe.val[0] & VTD_PASID_ENTRY_FPD) { 857 *pe_fpd_set = true; 858 } 859 860 return 0; 861 } 862 863 /* Get the page-table level that hardware should use for the second-level 864 * page-table walk from the Address Width field of context-entry. 865 */ 866 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) 867 { 868 return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); 869 } 870 871 static uint32_t vtd_get_iova_level(IntelIOMMUState *s, 872 VTDContextEntry *ce) 873 { 874 VTDPASIDEntry pe; 875 876 if (s->root_scalable) { 877 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 878 return VTD_PE_GET_LEVEL(&pe); 879 } 880 881 return vtd_ce_get_level(ce); 882 } 883 884 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) 885 { 886 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; 887 } 888 889 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, 890 VTDContextEntry *ce) 891 { 892 VTDPASIDEntry pe; 893 894 if (s->root_scalable) { 895 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 896 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; 897 } 898 899 return vtd_ce_get_agaw(ce); 900 } 901 902 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) 903 { 904 return ce->lo & VTD_CONTEXT_ENTRY_TT; 905 } 906 907 /* Only for Legacy Mode. 
Return true if check passed, otherwise false */ 908 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, 909 VTDContextEntry *ce) 910 { 911 switch (vtd_ce_get_type(ce)) { 912 case VTD_CONTEXT_TT_MULTI_LEVEL: 913 /* Always supported */ 914 break; 915 case VTD_CONTEXT_TT_DEV_IOTLB: 916 if (!x86_iommu->dt_supported) { 917 error_report_once("%s: DT specified but not supported", __func__); 918 return false; 919 } 920 break; 921 case VTD_CONTEXT_TT_PASS_THROUGH: 922 if (!x86_iommu->pt_supported) { 923 error_report_once("%s: PT specified but not supported", __func__); 924 return false; 925 } 926 break; 927 default: 928 /* Unknown type */ 929 error_report_once("%s: unknown ce type: %"PRIu32, __func__, 930 vtd_ce_get_type(ce)); 931 return false; 932 } 933 return true; 934 } 935 936 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, 937 VTDContextEntry *ce, uint8_t aw) 938 { 939 uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); 940 return 1ULL << MIN(ce_agaw, aw); 941 } 942 943 /* Return true if IOVA passes range check, otherwise false. */ 944 static inline bool vtd_iova_range_check(IntelIOMMUState *s, 945 uint64_t iova, VTDContextEntry *ce, 946 uint8_t aw) 947 { 948 /* 949 * Check if @iova is above 2^X-1, where X is the minimum of MGAW 950 * in CAP_REG and AW in context-entry. 
951 */ 952 return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); 953 } 954 955 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, 956 VTDContextEntry *ce) 957 { 958 VTDPASIDEntry pe; 959 960 if (s->root_scalable) { 961 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 962 return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; 963 } 964 965 return vtd_ce_get_slpt_base(ce); 966 } 967 968 /* 969 * Rsvd field masks for spte: 970 * vtd_spte_rsvd 4k pages 971 * vtd_spte_rsvd_large large pages 972 */ 973 static uint64_t vtd_spte_rsvd[5]; 974 static uint64_t vtd_spte_rsvd_large[5]; 975 976 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 977 { 978 uint64_t rsvd_mask = vtd_spte_rsvd[level]; 979 980 if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && 981 (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { 982 /* large page */ 983 rsvd_mask = vtd_spte_rsvd_large[level]; 984 } 985 986 return slpte & rsvd_mask; 987 } 988 989 /* Find the VTD address space associated with a given bus number */ 990 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 991 { 992 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 993 GHashTableIter iter; 994 995 if (vtd_bus) { 996 return vtd_bus; 997 } 998 999 /* 1000 * Iterate over the registered buses to find the one which 1001 * currently holds this bus number and update the bus_num 1002 * lookup table. 1003 */ 1004 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1005 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1006 if (pci_bus_num(vtd_bus->bus) == bus_num) { 1007 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 1008 return vtd_bus; 1009 } 1010 } 1011 1012 return NULL; 1013 } 1014 1015 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 1016 * of the translation, can be used for deciding the size of large page. 
1017 */ 1018 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, 1019 uint64_t iova, bool is_write, 1020 uint64_t *slptep, uint32_t *slpte_level, 1021 bool *reads, bool *writes, uint8_t aw_bits) 1022 { 1023 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1024 uint32_t level = vtd_get_iova_level(s, ce); 1025 uint32_t offset; 1026 uint64_t slpte; 1027 uint64_t access_right_check; 1028 1029 if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { 1030 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 1031 __func__, iova); 1032 return -VTD_FR_ADDR_BEYOND_MGAW; 1033 } 1034 1035 /* FIXME: what is the Atomics request here? */ 1036 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 1037 1038 while (true) { 1039 offset = vtd_iova_level_offset(iova, level); 1040 slpte = vtd_get_slpte(addr, offset); 1041 1042 if (slpte == (uint64_t)-1) { 1043 error_report_once("%s: detected read error on DMAR slpte " 1044 "(iova=0x%" PRIx64 ")", __func__, iova); 1045 if (level == vtd_get_iova_level(s, ce)) { 1046 /* Invalid programming of context-entry */ 1047 return -VTD_FR_CONTEXT_ENTRY_INV; 1048 } else { 1049 return -VTD_FR_PAGING_ENTRY_INV; 1050 } 1051 } 1052 *reads = (*reads) && (slpte & VTD_SL_R); 1053 *writes = (*writes) && (slpte & VTD_SL_W); 1054 if (!(slpte & access_right_check)) { 1055 error_report_once("%s: detected slpte permission error " 1056 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 1057 "slpte=0x%" PRIx64 ", write=%d)", __func__, 1058 iova, level, slpte, is_write); 1059 return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; 1060 } 1061 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1062 error_report_once("%s: detected splte reserve non-zero " 1063 "iova=0x%" PRIx64 ", level=0x%" PRIx32 1064 "slpte=0x%" PRIx64 ")", __func__, iova, 1065 level, slpte); 1066 return -VTD_FR_PAGING_ENTRY_RSVD; 1067 } 1068 1069 if (vtd_is_last_slpte(slpte, level)) { 1070 *slptep = slpte; 1071 *slpte_level = level; 1072 return 0; 1073 } 1074 addr = vtd_get_slpte_addr(slpte, aw_bits); 1075 level--; 1076 } 1077 } 1078 1079 typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); 1080 1081 /** 1082 * Constant information used during page walking 1083 * 1084 * @hook_fn: hook func to be called when detected page 1085 * @private: private data to be passed into hook func 1086 * @notify_unmap: whether we should notify invalid entries 1087 * @as: VT-d address space of the device 1088 * @aw: maximum address width 1089 * @domain: domain ID of the page walk 1090 */ 1091 typedef struct { 1092 VTDAddressSpace *as; 1093 vtd_page_walk_hook hook_fn; 1094 void *private; 1095 bool notify_unmap; 1096 uint8_t aw; 1097 uint16_t domain_id; 1098 } vtd_page_walk_info; 1099 1100 static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) 1101 { 1102 VTDAddressSpace *as = info->as; 1103 vtd_page_walk_hook hook_fn = info->hook_fn; 1104 void *private = info->private; 1105 IOMMUTLBEntry *entry = &event->entry; 1106 DMAMap target = { 1107 .iova = entry->iova, 1108 .size = entry->addr_mask, 1109 .translated_addr = entry->translated_addr, 1110 .perm = entry->perm, 1111 }; 1112 const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); 1113 1114 if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { 1115 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1116 return 0; 1117 } 1118 1119 assert(hook_fn); 1120 1121 /* Update local IOVA mapped ranges */ 1122 if (event->type == IOMMU_NOTIFIER_MAP) { 1123 if (mapped) { 1124 /* If it's exactly the same 
translation, skip */ 1125 if (!memcmp(mapped, &target, sizeof(target))) { 1126 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, 1127 entry->translated_addr); 1128 return 0; 1129 } else { 1130 /* 1131 * Translation changed. Normally this should not 1132 * happen, but it can happen when with buggy guest 1133 * OSes. Note that there will be a small window that 1134 * we don't have map at all. But that's the best 1135 * effort we can do. The ideal way to emulate this is 1136 * atomically modify the PTE to follow what has 1137 * changed, but we can't. One example is that vfio 1138 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no 1139 * interface to modify a mapping (meanwhile it seems 1140 * meaningless to even provide one). Anyway, let's 1141 * mark this as a TODO in case one day we'll have 1142 * a better solution. 1143 */ 1144 IOMMUAccessFlags cache_perm = entry->perm; 1145 int ret; 1146 1147 /* Emulate an UNMAP */ 1148 event->type = IOMMU_NOTIFIER_UNMAP; 1149 entry->perm = IOMMU_NONE; 1150 trace_vtd_page_walk_one(info->domain_id, 1151 entry->iova, 1152 entry->translated_addr, 1153 entry->addr_mask, 1154 entry->perm); 1155 ret = hook_fn(event, private); 1156 if (ret) { 1157 return ret; 1158 } 1159 /* Drop any existing mapping */ 1160 iova_tree_remove(as->iova_tree, &target); 1161 /* Recover the correct type */ 1162 event->type = IOMMU_NOTIFIER_MAP; 1163 entry->perm = cache_perm; 1164 } 1165 } 1166 iova_tree_insert(as->iova_tree, &target); 1167 } else { 1168 if (!mapped) { 1169 /* Skip since we didn't map this range at all */ 1170 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1171 return 0; 1172 } 1173 iova_tree_remove(as->iova_tree, &target); 1174 } 1175 1176 trace_vtd_page_walk_one(info->domain_id, entry->iova, 1177 entry->translated_addr, entry->addr_mask, 1178 entry->perm); 1179 return hook_fn(event, private); 1180 } 1181 1182 /** 1183 * vtd_page_walk_level - walk over specific level for IOVA range 1184 * 1185 * @addr: base GPA 
addr to start the walk 1186 * @start: IOVA range start address 1187 * @end: IOVA range end address (start <= addr < end) 1188 * @read: whether parent level has read permission 1189 * @write: whether parent level has write permission 1190 * @info: constant information for the page walk 1191 */ 1192 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 1193 uint64_t end, uint32_t level, bool read, 1194 bool write, vtd_page_walk_info *info) 1195 { 1196 bool read_cur, write_cur, entry_valid; 1197 uint32_t offset; 1198 uint64_t slpte; 1199 uint64_t subpage_size, subpage_mask; 1200 IOMMUTLBEvent event; 1201 uint64_t iova = start; 1202 uint64_t iova_next; 1203 int ret = 0; 1204 1205 trace_vtd_page_walk_level(addr, level, start, end); 1206 1207 subpage_size = 1ULL << vtd_slpt_level_shift(level); 1208 subpage_mask = vtd_slpt_level_page_mask(level); 1209 1210 while (iova < end) { 1211 iova_next = (iova & subpage_mask) + subpage_size; 1212 1213 offset = vtd_iova_level_offset(iova, level); 1214 slpte = vtd_get_slpte(addr, offset); 1215 1216 if (slpte == (uint64_t)-1) { 1217 trace_vtd_page_walk_skip_read(iova, iova_next); 1218 goto next; 1219 } 1220 1221 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1222 trace_vtd_page_walk_skip_reserve(iova, iova_next); 1223 goto next; 1224 } 1225 1226 /* Permissions are stacked with parents' */ 1227 read_cur = read && (slpte & VTD_SL_R); 1228 write_cur = write && (slpte & VTD_SL_W); 1229 1230 /* 1231 * As long as we have either read/write permission, this is a 1232 * valid entry. The rule works for both page entries and page 1233 * table entries. 1234 */ 1235 entry_valid = read_cur | write_cur; 1236 1237 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 1238 /* 1239 * This is a valid PDE (or even bigger than PDE). We need 1240 * to walk one further level. 
1241 */ 1242 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 1243 iova, MIN(iova_next, end), level - 1, 1244 read_cur, write_cur, info); 1245 } else { 1246 /* 1247 * This means we are either: 1248 * 1249 * (1) the real page entry (either 4K page, or huge page) 1250 * (2) the whole range is invalid 1251 * 1252 * In either case, we send an IOTLB notification down. 1253 */ 1254 event.entry.target_as = &address_space_memory; 1255 event.entry.iova = iova & subpage_mask; 1256 event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 1257 event.entry.addr_mask = ~subpage_mask; 1258 /* NOTE: this is only meaningful if entry_valid == true */ 1259 event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 1260 event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : 1261 IOMMU_NOTIFIER_UNMAP; 1262 ret = vtd_page_walk_one(&event, info); 1263 } 1264 1265 if (ret < 0) { 1266 return ret; 1267 } 1268 1269 next: 1270 iova = iova_next; 1271 } 1272 1273 return 0; 1274 } 1275 1276 /** 1277 * vtd_page_walk - walk specific IOVA range, and call the hook 1278 * 1279 * @s: intel iommu state 1280 * @ce: context entry to walk upon 1281 * @start: IOVA address to start the walk 1282 * @end: IOVA range end address (start <= addr < end) 1283 * @info: page walking information struct 1284 */ 1285 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, 1286 uint64_t start, uint64_t end, 1287 vtd_page_walk_info *info) 1288 { 1289 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1290 uint32_t level = vtd_get_iova_level(s, ce); 1291 1292 if (!vtd_iova_range_check(s, start, ce, info->aw)) { 1293 return -VTD_FR_ADDR_BEYOND_MGAW; 1294 } 1295 1296 if (!vtd_iova_range_check(s, end, ce, info->aw)) { 1297 /* Fix end so that it reaches the maximum */ 1298 end = vtd_iova_limit(s, ce, info->aw); 1299 } 1300 1301 return vtd_page_walk_level(addr, start, end, level, true, true, info); 1302 } 1303 1304 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, 1305 VTDRootEntry 
*re) 1306 { 1307 /* Legacy Mode reserved bits check */ 1308 if (!s->root_scalable && 1309 (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1310 goto rsvd_err; 1311 1312 /* Scalable Mode reserved bits check */ 1313 if (s->root_scalable && 1314 ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || 1315 (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1316 goto rsvd_err; 1317 1318 return 0; 1319 1320 rsvd_err: 1321 error_report_once("%s: invalid root entry: hi=0x%"PRIx64 1322 ", lo=0x%"PRIx64, 1323 __func__, re->hi, re->lo); 1324 return -VTD_FR_ROOT_ENTRY_RSVD; 1325 } 1326 1327 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, 1328 VTDContextEntry *ce) 1329 { 1330 if (!s->root_scalable && 1331 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || 1332 ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1333 error_report_once("%s: invalid context entry: hi=%"PRIx64 1334 ", lo=%"PRIx64" (reserved nonzero)", 1335 __func__, ce->hi, ce->lo); 1336 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1337 } 1338 1339 if (s->root_scalable && 1340 (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || 1341 ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || 1342 ce->val[2] || 1343 ce->val[3])) { 1344 error_report_once("%s: invalid context entry: val[3]=%"PRIx64 1345 ", val[2]=%"PRIx64 1346 ", val[1]=%"PRIx64 1347 ", val[0]=%"PRIx64" (reserved nonzero)", 1348 __func__, ce->val[3], ce->val[2], 1349 ce->val[1], ce->val[0]); 1350 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1351 } 1352 1353 return 0; 1354 } 1355 1356 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, 1357 VTDContextEntry *ce) 1358 { 1359 VTDPASIDEntry pe; 1360 1361 /* 1362 * Make sure in Scalable Mode, a present context entry 1363 * has valid rid2pasid setting, which includes valid 1364 * rid2pasid field and corresponding pasid entry setting 1365 */ 1366 return vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1367 } 1368 1369 /* Map a device to its corresponding domain (context-entry) */ 1370 static int vtd_dev_to_context_entry(IntelIOMMUState 
*s, uint8_t bus_num, 1371 uint8_t devfn, VTDContextEntry *ce) 1372 { 1373 VTDRootEntry re; 1374 int ret_fr; 1375 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 1376 1377 ret_fr = vtd_get_root_entry(s, bus_num, &re); 1378 if (ret_fr) { 1379 return ret_fr; 1380 } 1381 1382 if (!vtd_root_entry_present(s, &re, devfn)) { 1383 /* Not error - it's okay we don't have root entry. */ 1384 trace_vtd_re_not_present(bus_num); 1385 return -VTD_FR_ROOT_ENTRY_P; 1386 } 1387 1388 ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); 1389 if (ret_fr) { 1390 return ret_fr; 1391 } 1392 1393 ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); 1394 if (ret_fr) { 1395 return ret_fr; 1396 } 1397 1398 if (!vtd_ce_present(ce)) { 1399 /* Not error - it's okay we don't have context entry. */ 1400 trace_vtd_ce_not_present(bus_num, devfn); 1401 return -VTD_FR_CONTEXT_ENTRY_P; 1402 } 1403 1404 ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); 1405 if (ret_fr) { 1406 return ret_fr; 1407 } 1408 1409 /* Check if the programming of context-entry is valid */ 1410 if (!s->root_scalable && 1411 !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1412 error_report_once("%s: invalid context entry: hi=%"PRIx64 1413 ", lo=%"PRIx64" (level %d not supported)", 1414 __func__, ce->hi, ce->lo, 1415 vtd_ce_get_level(ce)); 1416 return -VTD_FR_CONTEXT_ENTRY_INV; 1417 } 1418 1419 if (!s->root_scalable) { 1420 /* Do translation type check */ 1421 if (!vtd_ce_type_check(x86_iommu, ce)) { 1422 /* Errors dumped in vtd_ce_type_check() */ 1423 return -VTD_FR_CONTEXT_ENTRY_INV; 1424 } 1425 } else { 1426 /* 1427 * Check if the programming of context-entry.rid2pasid 1428 * and corresponding pasid setting is valid, and thus 1429 * avoids to check pasid entry fetching result in future 1430 * helper function calling. 
1431 */ 1432 ret_fr = vtd_ce_rid2pasid_check(s, ce); 1433 if (ret_fr) { 1434 return ret_fr; 1435 } 1436 } 1437 1438 return 0; 1439 } 1440 1441 static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, 1442 void *private) 1443 { 1444 memory_region_notify_iommu(private, 0, *event); 1445 return 0; 1446 } 1447 1448 static uint16_t vtd_get_domain_id(IntelIOMMUState *s, 1449 VTDContextEntry *ce) 1450 { 1451 VTDPASIDEntry pe; 1452 1453 if (s->root_scalable) { 1454 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1455 return VTD_SM_PASID_ENTRY_DID(pe.val[1]); 1456 } 1457 1458 return VTD_CONTEXT_ENTRY_DID(ce->hi); 1459 } 1460 1461 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1462 VTDContextEntry *ce, 1463 hwaddr addr, hwaddr size) 1464 { 1465 IntelIOMMUState *s = vtd_as->iommu_state; 1466 vtd_page_walk_info info = { 1467 .hook_fn = vtd_sync_shadow_page_hook, 1468 .private = (void *)&vtd_as->iommu, 1469 .notify_unmap = true, 1470 .aw = s->aw_bits, 1471 .as = vtd_as, 1472 .domain_id = vtd_get_domain_id(s, ce), 1473 }; 1474 1475 return vtd_page_walk(s, ce, addr, addr + size, &info); 1476 } 1477 1478 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1479 { 1480 int ret; 1481 VTDContextEntry ce; 1482 IOMMUNotifier *n; 1483 1484 if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { 1485 return 0; 1486 } 1487 1488 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1489 pci_bus_num(vtd_as->bus), 1490 vtd_as->devfn, &ce); 1491 if (ret) { 1492 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1493 /* 1494 * It's a valid scenario to have a context entry that is 1495 * not present. For example, when a device is removed 1496 * from an existing domain then the context entry will be 1497 * zeroed by the guest before it was put into another 1498 * domain. When this happens, instead of synchronizing 1499 * the shadow pages we should invalidate all existing 1500 * mappings and notify the backends. 
1501 */ 1502 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 1503 vtd_address_space_unmap(vtd_as, n); 1504 } 1505 ret = 0; 1506 } 1507 return ret; 1508 } 1509 1510 return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); 1511 } 1512 1513 /* 1514 * Check if specific device is configured to bypass address 1515 * translation for DMA requests. In Scalable Mode, bypass 1516 * 1st-level translation or 2nd-level translation, it depends 1517 * on PGTT setting. 1518 */ 1519 static bool vtd_dev_pt_enabled(VTDAddressSpace *as) 1520 { 1521 IntelIOMMUState *s; 1522 VTDContextEntry ce; 1523 VTDPASIDEntry pe; 1524 int ret; 1525 1526 assert(as); 1527 1528 s = as->iommu_state; 1529 ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), 1530 as->devfn, &ce); 1531 if (ret) { 1532 /* 1533 * Possibly failed to parse the context entry for some reason 1534 * (e.g., during init, or any guest configuration errors on 1535 * context entries). We should assume PT not enabled for 1536 * safety. 1537 */ 1538 return false; 1539 } 1540 1541 if (s->root_scalable) { 1542 ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); 1543 if (ret) { 1544 error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, 1545 __func__, ret); 1546 return false; 1547 } 1548 return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); 1549 } 1550 1551 return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); 1552 } 1553 1554 /* Return whether the device is using IOMMU translation. 
*/ 1555 static bool vtd_switch_address_space(VTDAddressSpace *as) 1556 { 1557 bool use_iommu; 1558 /* Whether we need to take the BQL on our own */ 1559 bool take_bql = !qemu_mutex_iothread_locked(); 1560 1561 assert(as); 1562 1563 use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); 1564 1565 trace_vtd_switch_address_space(pci_bus_num(as->bus), 1566 VTD_PCI_SLOT(as->devfn), 1567 VTD_PCI_FUNC(as->devfn), 1568 use_iommu); 1569 1570 /* 1571 * It's possible that we reach here without BQL, e.g., when called 1572 * from vtd_pt_enable_fast_path(). However the memory APIs need 1573 * it. We'd better make sure we have had it already, or, take it. 1574 */ 1575 if (take_bql) { 1576 qemu_mutex_lock_iothread(); 1577 } 1578 1579 /* Turn off first then on the other */ 1580 if (use_iommu) { 1581 memory_region_set_enabled(&as->nodmar, false); 1582 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1583 } else { 1584 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1585 memory_region_set_enabled(&as->nodmar, true); 1586 } 1587 1588 if (take_bql) { 1589 qemu_mutex_unlock_iothread(); 1590 } 1591 1592 return use_iommu; 1593 } 1594 1595 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1596 { 1597 GHashTableIter iter; 1598 VTDBus *vtd_bus; 1599 int i; 1600 1601 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1602 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1603 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1604 if (!vtd_bus->dev_as[i]) { 1605 continue; 1606 } 1607 vtd_switch_address_space(vtd_bus->dev_as[i]); 1608 } 1609 } 1610 } 1611 1612 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1613 { 1614 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1615 } 1616 1617 static const bool vtd_qualified_faults[] = { 1618 [VTD_FR_RESERVED] = false, 1619 [VTD_FR_ROOT_ENTRY_P] = false, 1620 [VTD_FR_CONTEXT_ENTRY_P] = true, 1621 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1622 [VTD_FR_ADDR_BEYOND_MGAW] = true, 
1623 [VTD_FR_WRITE] = true, 1624 [VTD_FR_READ] = true, 1625 [VTD_FR_PAGING_ENTRY_INV] = true, 1626 [VTD_FR_ROOT_TABLE_INV] = false, 1627 [VTD_FR_CONTEXT_TABLE_INV] = false, 1628 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1629 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1630 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1631 [VTD_FR_PASID_TABLE_INV] = false, 1632 [VTD_FR_RESERVED_ERR] = false, 1633 [VTD_FR_MAX] = false, 1634 }; 1635 1636 /* To see if a fault condition is "qualified", which is reported to software 1637 * only if the FPD field in the context-entry used to process the faulting 1638 * request is 0. 1639 */ 1640 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1641 { 1642 return vtd_qualified_faults[fault]; 1643 } 1644 1645 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1646 { 1647 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1648 } 1649 1650 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1651 { 1652 VTDBus *vtd_bus; 1653 VTDAddressSpace *vtd_as; 1654 bool success = false; 1655 1656 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1657 if (!vtd_bus) { 1658 goto out; 1659 } 1660 1661 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1662 if (!vtd_as) { 1663 goto out; 1664 } 1665 1666 if (vtd_switch_address_space(vtd_as) == false) { 1667 /* We switched off IOMMU region successfully. */ 1668 success = true; 1669 } 1670 1671 out: 1672 trace_vtd_pt_enable_fast_path(source_id, success); 1673 } 1674 1675 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1676 * translation. 1677 * 1678 * Called from RCU critical section. 1679 * 1680 * @bus_num: The bus number 1681 * @devfn: The devfn, which is the combined of device and function number 1682 * @is_write: The access is a write operation 1683 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1684 * 1685 * Returns true if translation is successful, otherwise false. 
1686 */ 1687 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, 1688 uint8_t devfn, hwaddr addr, bool is_write, 1689 IOMMUTLBEntry *entry) 1690 { 1691 IntelIOMMUState *s = vtd_as->iommu_state; 1692 VTDContextEntry ce; 1693 uint8_t bus_num = pci_bus_num(bus); 1694 VTDContextCacheEntry *cc_entry; 1695 uint64_t slpte, page_mask; 1696 uint32_t level; 1697 uint16_t source_id = vtd_make_source_id(bus_num, devfn); 1698 int ret_fr; 1699 bool is_fpd_set = false; 1700 bool reads = true; 1701 bool writes = true; 1702 uint8_t access_flags; 1703 VTDIOTLBEntry *iotlb_entry; 1704 1705 /* 1706 * We have standalone memory region for interrupt addresses, we 1707 * should never receive translation requests in this region. 1708 */ 1709 assert(!vtd_is_interrupt_addr(addr)); 1710 1711 vtd_iommu_lock(s); 1712 1713 cc_entry = &vtd_as->context_cache_entry; 1714 1715 /* Try to fetch slpte form IOTLB */ 1716 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); 1717 if (iotlb_entry) { 1718 trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, 1719 iotlb_entry->domain_id); 1720 slpte = iotlb_entry->slpte; 1721 access_flags = iotlb_entry->access_flags; 1722 page_mask = iotlb_entry->mask; 1723 goto out; 1724 } 1725 1726 /* Try to fetch context-entry from cache first */ 1727 if (cc_entry->context_cache_gen == s->context_cache_gen) { 1728 trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, 1729 cc_entry->context_entry.lo, 1730 cc_entry->context_cache_gen); 1731 ce = cc_entry->context_entry; 1732 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1733 if (!is_fpd_set && s->root_scalable) { 1734 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1735 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1736 } 1737 } else { 1738 ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); 1739 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1740 if (!ret_fr && !is_fpd_set && s->root_scalable) { 1741 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 
1742 } 1743 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1744 /* Update context-cache */ 1745 trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, 1746 cc_entry->context_cache_gen, 1747 s->context_cache_gen); 1748 cc_entry->context_entry = ce; 1749 cc_entry->context_cache_gen = s->context_cache_gen; 1750 } 1751 1752 /* 1753 * We don't need to translate for pass-through context entries. 1754 * Also, let's ignore IOTLB caching as well for PT devices. 1755 */ 1756 if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { 1757 entry->iova = addr & VTD_PAGE_MASK_4K; 1758 entry->translated_addr = entry->iova; 1759 entry->addr_mask = ~VTD_PAGE_MASK_4K; 1760 entry->perm = IOMMU_RW; 1761 trace_vtd_translate_pt(source_id, entry->iova); 1762 1763 /* 1764 * When this happens, it means firstly caching-mode is not 1765 * enabled, and this is the first passthrough translation for 1766 * the device. Let's enable the fast path for passthrough. 1767 * 1768 * When passthrough is disabled again for the device, we can 1769 * capture it via the context entry invalidation, then the 1770 * IOMMU region can be swapped back. 
1771 */ 1772 vtd_pt_enable_fast_path(s, source_id); 1773 vtd_iommu_unlock(s); 1774 return true; 1775 } 1776 1777 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, 1778 &reads, &writes, s->aw_bits); 1779 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1780 1781 page_mask = vtd_slpt_level_page_mask(level); 1782 access_flags = IOMMU_ACCESS_FLAG(reads, writes); 1783 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, 1784 access_flags, level); 1785 out: 1786 vtd_iommu_unlock(s); 1787 entry->iova = addr & page_mask; 1788 entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; 1789 entry->addr_mask = ~page_mask; 1790 entry->perm = access_flags; 1791 return true; 1792 1793 error: 1794 vtd_iommu_unlock(s); 1795 entry->iova = 0; 1796 entry->translated_addr = 0; 1797 entry->addr_mask = 0; 1798 entry->perm = IOMMU_NONE; 1799 return false; 1800 } 1801 1802 static void vtd_root_table_setup(IntelIOMMUState *s) 1803 { 1804 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 1805 s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); 1806 1807 vtd_update_scalable_state(s); 1808 1809 trace_vtd_reg_dmar_root(s->root, s->root_scalable); 1810 } 1811 1812 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, 1813 uint32_t index, uint32_t mask) 1814 { 1815 x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); 1816 } 1817 1818 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) 1819 { 1820 uint64_t value = 0; 1821 value = vtd_get_quad_raw(s, DMAR_IRTA_REG); 1822 s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); 1823 s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); 1824 s->intr_eime = value & VTD_IRTA_EIME; 1825 1826 /* Notify global invalidation */ 1827 vtd_iec_notify_all(s, true, 0, 0); 1828 1829 trace_vtd_reg_ir_root(s->intr_root, s->intr_size); 1830 } 1831 1832 static void vtd_iommu_replay_all(IntelIOMMUState *s) 1833 { 1834 VTDAddressSpace *vtd_as; 1835 1836 
QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1837 vtd_sync_shadow_page_table(vtd_as); 1838 } 1839 } 1840 1841 static void vtd_context_global_invalidate(IntelIOMMUState *s) 1842 { 1843 trace_vtd_inv_desc_cc_global(); 1844 /* Protects context cache */ 1845 vtd_iommu_lock(s); 1846 s->context_cache_gen++; 1847 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { 1848 vtd_reset_context_cache_locked(s); 1849 } 1850 vtd_iommu_unlock(s); 1851 vtd_address_space_refresh_all(s); 1852 /* 1853 * From VT-d spec 6.5.2.1, a global context entry invalidation 1854 * should be followed by a IOTLB global invalidation, so we should 1855 * be safe even without this. Hoewever, let's replay the region as 1856 * well to be safer, and go back here when we need finer tunes for 1857 * VT-d emulation codes. 1858 */ 1859 vtd_iommu_replay_all(s); 1860 } 1861 1862 /* Do a context-cache device-selective invalidation. 1863 * @func_mask: FM field after shifting 1864 */ 1865 static void vtd_context_device_invalidate(IntelIOMMUState *s, 1866 uint16_t source_id, 1867 uint16_t func_mask) 1868 { 1869 uint16_t mask; 1870 VTDBus *vtd_bus; 1871 VTDAddressSpace *vtd_as; 1872 uint8_t bus_n, devfn; 1873 uint16_t devfn_it; 1874 1875 trace_vtd_inv_desc_cc_devices(source_id, func_mask); 1876 1877 switch (func_mask & 3) { 1878 case 0: 1879 mask = 0; /* No bits in the SID field masked */ 1880 break; 1881 case 1: 1882 mask = 4; /* Mask bit 2 in the SID field */ 1883 break; 1884 case 2: 1885 mask = 6; /* Mask bit 2:1 in the SID field */ 1886 break; 1887 case 3: 1888 mask = 7; /* Mask bit 2:0 in the SID field */ 1889 break; 1890 default: 1891 g_assert_not_reached(); 1892 } 1893 mask = ~mask; 1894 1895 bus_n = VTD_SID_TO_BUS(source_id); 1896 vtd_bus = vtd_find_as_from_bus_num(s, bus_n); 1897 if (vtd_bus) { 1898 devfn = VTD_SID_TO_DEVFN(source_id); 1899 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 1900 vtd_as = vtd_bus->dev_as[devfn_it]; 1901 if (vtd_as && ((devfn_it & mask) == (devfn & 
mask))) { 1902 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), 1903 VTD_PCI_FUNC(devfn_it)); 1904 vtd_iommu_lock(s); 1905 vtd_as->context_cache_entry.context_cache_gen = 0; 1906 vtd_iommu_unlock(s); 1907 /* 1908 * Do switch address space when needed, in case if the 1909 * device passthrough bit is switched. 1910 */ 1911 vtd_switch_address_space(vtd_as); 1912 /* 1913 * So a device is moving out of (or moving into) a 1914 * domain, resync the shadow page table. 1915 * This won't bring bad even if we have no such 1916 * notifier registered - the IOMMU notification 1917 * framework will skip MAP notifications if that 1918 * happened. 1919 */ 1920 vtd_sync_shadow_page_table(vtd_as); 1921 } 1922 } 1923 } 1924 } 1925 1926 /* Context-cache invalidation 1927 * Returns the Context Actual Invalidation Granularity. 1928 * @val: the content of the CCMD_REG 1929 */ 1930 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1931 { 1932 uint64_t caig; 1933 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1934 1935 switch (type) { 1936 case VTD_CCMD_DOMAIN_INVL: 1937 /* Fall through */ 1938 case VTD_CCMD_GLOBAL_INVL: 1939 caig = VTD_CCMD_GLOBAL_INVL_A; 1940 vtd_context_global_invalidate(s); 1941 break; 1942 1943 case VTD_CCMD_DEVICE_INVL: 1944 caig = VTD_CCMD_DEVICE_INVL_A; 1945 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1946 break; 1947 1948 default: 1949 error_report_once("%s: invalid context: 0x%" PRIx64, 1950 __func__, val); 1951 caig = 0; 1952 } 1953 return caig; 1954 } 1955 1956 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1957 { 1958 trace_vtd_inv_desc_iotlb_global(); 1959 vtd_reset_iotlb(s); 1960 vtd_iommu_replay_all(s); 1961 } 1962 1963 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1964 { 1965 VTDContextEntry ce; 1966 VTDAddressSpace *vtd_as; 1967 1968 trace_vtd_inv_desc_iotlb_domain(domain_id); 1969 1970 vtd_iommu_lock(s); 1971 g_hash_table_foreach_remove(s->iotlb, 
vtd_hash_remove_by_domain, 1972 &domain_id); 1973 vtd_iommu_unlock(s); 1974 1975 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1976 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1977 vtd_as->devfn, &ce) && 1978 domain_id == vtd_get_domain_id(s, &ce)) { 1979 vtd_sync_shadow_page_table(vtd_as); 1980 } 1981 } 1982 } 1983 1984 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1985 uint16_t domain_id, hwaddr addr, 1986 uint8_t am) 1987 { 1988 VTDAddressSpace *vtd_as; 1989 VTDContextEntry ce; 1990 int ret; 1991 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1992 1993 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1994 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1995 vtd_as->devfn, &ce); 1996 if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { 1997 if (vtd_as_has_map_notifier(vtd_as)) { 1998 /* 1999 * As long as we have MAP notifications registered in 2000 * any of our IOMMU notifiers, we need to sync the 2001 * shadow page table. 2002 */ 2003 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 2004 } else { 2005 /* 2006 * For UNMAP-only notifiers, we don't need to walk the 2007 * page tables. We just deliver the PSI down to 2008 * invalidate caches. 
2009 */ 2010 IOMMUTLBEvent event = { 2011 .type = IOMMU_NOTIFIER_UNMAP, 2012 .entry = { 2013 .target_as = &address_space_memory, 2014 .iova = addr, 2015 .translated_addr = 0, 2016 .addr_mask = size - 1, 2017 .perm = IOMMU_NONE, 2018 }, 2019 }; 2020 memory_region_notify_iommu(&vtd_as->iommu, 0, event); 2021 } 2022 } 2023 } 2024 } 2025 2026 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 2027 hwaddr addr, uint8_t am) 2028 { 2029 VTDIOTLBPageInvInfo info; 2030 2031 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 2032 2033 assert(am <= VTD_MAMV); 2034 info.domain_id = domain_id; 2035 info.addr = addr; 2036 info.mask = ~((1 << am) - 1); 2037 vtd_iommu_lock(s); 2038 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 2039 vtd_iommu_unlock(s); 2040 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 2041 } 2042 2043 /* Flush IOTLB 2044 * Returns the IOTLB Actual Invalidation Granularity. 2045 * @val: the content of the IOTLB_REG 2046 */ 2047 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 2048 { 2049 uint64_t iaig; 2050 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 2051 uint16_t domain_id; 2052 hwaddr addr; 2053 uint8_t am; 2054 2055 switch (type) { 2056 case VTD_TLB_GLOBAL_FLUSH: 2057 iaig = VTD_TLB_GLOBAL_FLUSH_A; 2058 vtd_iotlb_global_invalidate(s); 2059 break; 2060 2061 case VTD_TLB_DSI_FLUSH: 2062 domain_id = VTD_TLB_DID(val); 2063 iaig = VTD_TLB_DSI_FLUSH_A; 2064 vtd_iotlb_domain_invalidate(s, domain_id); 2065 break; 2066 2067 case VTD_TLB_PSI_FLUSH: 2068 domain_id = VTD_TLB_DID(val); 2069 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 2070 am = VTD_IVA_AM(addr); 2071 addr = VTD_IVA_ADDR(addr); 2072 if (am > VTD_MAMV) { 2073 error_report_once("%s: address mask overflow: 0x%" PRIx64, 2074 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 2075 iaig = 0; 2076 break; 2077 } 2078 iaig = VTD_TLB_PSI_FLUSH_A; 2079 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2080 break; 2081 2082 default: 2083 
error_report_once("%s: invalid granularity: 0x%" PRIx64, 2084 __func__, val); 2085 iaig = 0; 2086 } 2087 return iaig; 2088 } 2089 2090 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 2091 2092 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 2093 { 2094 return s->qi_enabled && (s->iq_tail == s->iq_head) && 2095 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 2096 } 2097 2098 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 2099 { 2100 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 2101 2102 trace_vtd_inv_qi_enable(en); 2103 2104 if (en) { 2105 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 2106 /* 2^(x+8) entries */ 2107 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); 2108 s->qi_enabled = true; 2109 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 2110 /* Ok - report back to driver */ 2111 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 2112 2113 if (s->iq_tail != 0) { 2114 /* 2115 * This is a spec violation but Windows guests are known to set up 2116 * Queued Invalidation this way so we allow the write and process 2117 * Invalidation Descriptors right away. 
2118 */ 2119 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 2120 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2121 vtd_fetch_inv_desc(s); 2122 } 2123 } 2124 } else { 2125 if (vtd_queued_inv_disable_check(s)) { 2126 /* disable Queued Invalidation */ 2127 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 2128 s->iq_head = 0; 2129 s->qi_enabled = false; 2130 /* Ok - report back to driver */ 2131 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 2132 } else { 2133 error_report_once("%s: detected improper state when disable QI " 2134 "(head=0x%x, tail=0x%x, last_type=%d)", 2135 __func__, 2136 s->iq_head, s->iq_tail, s->iq_last_desc_type); 2137 } 2138 } 2139 } 2140 2141 /* Set Root Table Pointer */ 2142 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 2143 { 2144 vtd_root_table_setup(s); 2145 /* Ok - report back to driver */ 2146 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 2147 vtd_reset_caches(s); 2148 vtd_address_space_refresh_all(s); 2149 } 2150 2151 /* Set Interrupt Remap Table Pointer */ 2152 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 2153 { 2154 vtd_interrupt_remap_table_setup(s); 2155 /* Ok - report back to driver */ 2156 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 2157 } 2158 2159 /* Handle Translation Enable/Disable */ 2160 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 2161 { 2162 if (s->dmar_enabled == en) { 2163 return; 2164 } 2165 2166 trace_vtd_dmar_enable(en); 2167 2168 if (en) { 2169 s->dmar_enabled = true; 2170 /* Ok - report back to driver */ 2171 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 2172 } else { 2173 s->dmar_enabled = false; 2174 2175 /* Clear the index of Fault Recording Register */ 2176 s->next_frcd_reg = 0; 2177 /* Ok - report back to driver */ 2178 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 2179 } 2180 2181 vtd_reset_caches(s); 2182 vtd_address_space_refresh_all(s); 2183 } 2184 2185 /* Handle Interrupt Remap Enable/Disable */ 2186 
static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 2187 { 2188 trace_vtd_ir_enable(en); 2189 2190 if (en) { 2191 s->intr_enabled = true; 2192 /* Ok - report back to driver */ 2193 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 2194 } else { 2195 s->intr_enabled = false; 2196 /* Ok - report back to driver */ 2197 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 2198 } 2199 } 2200 2201 /* Handle write to Global Command Register */ 2202 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 2203 { 2204 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 2205 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 2206 uint32_t changed = status ^ val; 2207 2208 trace_vtd_reg_write_gcmd(status, val); 2209 if (changed & VTD_GCMD_TE) { 2210 /* Translation enable/disable */ 2211 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 2212 } 2213 if (val & VTD_GCMD_SRTP) { 2214 /* Set/update the root-table pointer */ 2215 vtd_handle_gcmd_srtp(s); 2216 } 2217 if (changed & VTD_GCMD_QIE) { 2218 /* Queued Invalidation Enable */ 2219 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 2220 } 2221 if (val & VTD_GCMD_SIRTP) { 2222 /* Set/update the interrupt remapping root-table pointer */ 2223 vtd_handle_gcmd_sirtp(s); 2224 } 2225 if (changed & VTD_GCMD_IRE) { 2226 /* Interrupt remap enable/disable */ 2227 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 2228 } 2229 } 2230 2231 /* Handle write to Context Command Register */ 2232 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 2233 { 2234 uint64_t ret; 2235 uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); 2236 2237 /* Context-cache invalidation request */ 2238 if (val & VTD_CCMD_ICC) { 2239 if (s->qi_enabled) { 2240 error_report_once("Queued Invalidation enabled, " 2241 "should not use register-based invalidation"); 2242 return; 2243 } 2244 ret = vtd_context_cache_invalidate(s, val); 2245 /* Invalidation completed. 
Change something to show */ 2246 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 2247 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 2248 ret); 2249 } 2250 } 2251 2252 /* Handle write to IOTLB Invalidation Register */ 2253 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 2254 { 2255 uint64_t ret; 2256 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 2257 2258 /* IOTLB invalidation request */ 2259 if (val & VTD_TLB_IVT) { 2260 if (s->qi_enabled) { 2261 error_report_once("Queued Invalidation enabled, " 2262 "should not use register-based invalidation"); 2263 return; 2264 } 2265 ret = vtd_iotlb_flush(s, val); 2266 /* Invalidation completed. Change something to show */ 2267 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 2268 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 2269 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 2270 } 2271 } 2272 2273 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 2274 static bool vtd_get_inv_desc(IntelIOMMUState *s, 2275 VTDInvDesc *inv_desc) 2276 { 2277 dma_addr_t base_addr = s->iq; 2278 uint32_t offset = s->iq_head; 2279 uint32_t dw = s->iq_dw ? 
32 : 16; 2280 dma_addr_t addr = base_addr + offset * dw; 2281 2282 if (dma_memory_read(&address_space_memory, addr, 2283 inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) { 2284 error_report_once("Read INV DESC failed."); 2285 return false; 2286 } 2287 inv_desc->lo = le64_to_cpu(inv_desc->lo); 2288 inv_desc->hi = le64_to_cpu(inv_desc->hi); 2289 if (dw == 32) { 2290 inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); 2291 inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); 2292 } 2293 return true; 2294 } 2295 2296 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2297 { 2298 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 2299 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 2300 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2301 " (reserved nonzero)", __func__, inv_desc->hi, 2302 inv_desc->lo); 2303 return false; 2304 } 2305 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 2306 /* Status Write */ 2307 uint32_t status_data = (uint32_t)(inv_desc->lo >> 2308 VTD_INV_DESC_WAIT_DATA_SHIFT); 2309 2310 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 2311 2312 /* FIXME: need to be masked with HAW? 
*/ 2313 dma_addr_t status_addr = inv_desc->hi; 2314 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 2315 status_data = cpu_to_le32(status_data); 2316 if (dma_memory_write(&address_space_memory, status_addr, 2317 &status_data, sizeof(status_data), 2318 MEMTXATTRS_UNSPECIFIED)) { 2319 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 2320 return false; 2321 } 2322 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 2323 /* Interrupt flag */ 2324 vtd_generate_completion_event(s); 2325 } else { 2326 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2327 " (unknown type)", __func__, inv_desc->hi, 2328 inv_desc->lo); 2329 return false; 2330 } 2331 return true; 2332 } 2333 2334 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 2335 VTDInvDesc *inv_desc) 2336 { 2337 uint16_t sid, fmask; 2338 2339 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 2340 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2341 " (reserved nonzero)", __func__, inv_desc->hi, 2342 inv_desc->lo); 2343 return false; 2344 } 2345 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 2346 case VTD_INV_DESC_CC_DOMAIN: 2347 trace_vtd_inv_desc_cc_domain( 2348 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 2349 /* Fall through */ 2350 case VTD_INV_DESC_CC_GLOBAL: 2351 vtd_context_global_invalidate(s); 2352 break; 2353 2354 case VTD_INV_DESC_CC_DEVICE: 2355 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 2356 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 2357 vtd_context_device_invalidate(s, sid, fmask); 2358 break; 2359 2360 default: 2361 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2362 " (invalid type)", __func__, inv_desc->hi, 2363 inv_desc->lo); 2364 return false; 2365 } 2366 return true; 2367 } 2368 2369 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2370 { 2371 uint16_t domain_id; 2372 uint8_t am; 2373 hwaddr addr; 2374 2375 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 2376 
(inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 2377 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2378 ", lo=0x%"PRIx64" (reserved bits unzero)", 2379 __func__, inv_desc->hi, inv_desc->lo); 2380 return false; 2381 } 2382 2383 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 2384 case VTD_INV_DESC_IOTLB_GLOBAL: 2385 vtd_iotlb_global_invalidate(s); 2386 break; 2387 2388 case VTD_INV_DESC_IOTLB_DOMAIN: 2389 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2390 vtd_iotlb_domain_invalidate(s, domain_id); 2391 break; 2392 2393 case VTD_INV_DESC_IOTLB_PAGE: 2394 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2395 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 2396 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 2397 if (am > VTD_MAMV) { 2398 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2399 ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)", 2400 __func__, inv_desc->hi, inv_desc->lo, 2401 am, (unsigned)VTD_MAMV); 2402 return false; 2403 } 2404 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2405 break; 2406 2407 default: 2408 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2409 ", lo=0x%"PRIx64" (type mismatch: 0x%llx)", 2410 __func__, inv_desc->hi, inv_desc->lo, 2411 inv_desc->lo & VTD_INV_DESC_IOTLB_G); 2412 return false; 2413 } 2414 return true; 2415 } 2416 2417 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 2418 VTDInvDesc *inv_desc) 2419 { 2420 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 2421 inv_desc->iec.index, 2422 inv_desc->iec.index_mask); 2423 2424 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 2425 inv_desc->iec.index, 2426 inv_desc->iec.index_mask); 2427 return true; 2428 } 2429 2430 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 2431 VTDInvDesc *inv_desc) 2432 { 2433 VTDAddressSpace *vtd_dev_as; 2434 IOMMUTLBEvent event; 2435 struct VTDBus *vtd_bus; 2436 hwaddr addr; 2437 uint64_t sz; 2438 uint16_t sid; 2439 uint8_t devfn; 2440 bool size; 2441 uint8_t bus_num; 2442 2443 addr = 
VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2444 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2445 devfn = sid & 0xff; 2446 bus_num = sid >> 8; 2447 size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2448 2449 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2450 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2451 error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 2452 ", lo=%"PRIx64" (reserved nonzero)", __func__, 2453 inv_desc->hi, inv_desc->lo); 2454 return false; 2455 } 2456 2457 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2458 if (!vtd_bus) { 2459 goto done; 2460 } 2461 2462 vtd_dev_as = vtd_bus->dev_as[devfn]; 2463 if (!vtd_dev_as) { 2464 goto done; 2465 } 2466 2467 /* According to ATS spec table 2.4: 2468 * S = 0, bits 15:12 = xxxx range size: 4K 2469 * S = 1, bits 15:12 = xxx0 range size: 8K 2470 * S = 1, bits 15:12 = xx01 range size: 16K 2471 * S = 1, bits 15:12 = x011 range size: 32K 2472 * S = 1, bits 15:12 = 0111 range size: 64K 2473 * ... 2474 */ 2475 if (size) { 2476 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2477 addr &= ~(sz - 1); 2478 } else { 2479 sz = VTD_PAGE_SIZE; 2480 } 2481 2482 event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; 2483 event.entry.target_as = &vtd_dev_as->as; 2484 event.entry.addr_mask = sz - 1; 2485 event.entry.iova = addr; 2486 event.entry.perm = IOMMU_NONE; 2487 event.entry.translated_addr = 0; 2488 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); 2489 2490 done: 2491 return true; 2492 } 2493 2494 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2495 { 2496 VTDInvDesc inv_desc; 2497 uint8_t desc_type; 2498 2499 trace_vtd_inv_qi_head(s->iq_head); 2500 if (!vtd_get_inv_desc(s, &inv_desc)) { 2501 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2502 return false; 2503 } 2504 2505 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2506 /* FIXME: should update at first or at last? 
*/ 2507 s->iq_last_desc_type = desc_type; 2508 2509 switch (desc_type) { 2510 case VTD_INV_DESC_CC: 2511 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2512 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2513 return false; 2514 } 2515 break; 2516 2517 case VTD_INV_DESC_IOTLB: 2518 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2519 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2520 return false; 2521 } 2522 break; 2523 2524 /* 2525 * TODO: the entity of below two cases will be implemented in future series. 2526 * To make guest (which integrates scalable mode support patch set in 2527 * iommu driver) work, just return true is enough so far. 2528 */ 2529 case VTD_INV_DESC_PC: 2530 break; 2531 2532 case VTD_INV_DESC_PIOTLB: 2533 break; 2534 2535 case VTD_INV_DESC_WAIT: 2536 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2537 if (!vtd_process_wait_desc(s, &inv_desc)) { 2538 return false; 2539 } 2540 break; 2541 2542 case VTD_INV_DESC_IEC: 2543 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2544 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2545 return false; 2546 } 2547 break; 2548 2549 case VTD_INV_DESC_DEVICE: 2550 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2551 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2552 return false; 2553 } 2554 break; 2555 2556 default: 2557 error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 2558 " (unknown type)", __func__, inv_desc.hi, 2559 inv_desc.lo); 2560 return false; 2561 } 2562 s->iq_head++; 2563 if (s->iq_head == s->iq_size) { 2564 s->iq_head = 0; 2565 } 2566 return true; 2567 } 2568 2569 /* Try to fetch and process more Invalidation Descriptors */ 2570 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2571 { 2572 int qi_shift; 2573 2574 /* Refer to 10.4.23 of VT-d spec 3.0 */ 2575 qi_shift = s->iq_dw ? 
VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4; 2576 2577 trace_vtd_inv_qi_fetch(); 2578 2579 if (s->iq_tail >= s->iq_size) { 2580 /* Detects an invalid Tail pointer */ 2581 error_report_once("%s: detected invalid QI tail " 2582 "(tail=0x%x, size=0x%x)", 2583 __func__, s->iq_tail, s->iq_size); 2584 vtd_handle_inv_queue_error(s); 2585 return; 2586 } 2587 while (s->iq_head != s->iq_tail) { 2588 if (!vtd_process_inv_desc(s)) { 2589 /* Invalidation Queue Errors */ 2590 vtd_handle_inv_queue_error(s); 2591 break; 2592 } 2593 /* Must update the IQH_REG in time */ 2594 vtd_set_quad_raw(s, DMAR_IQH_REG, 2595 (((uint64_t)(s->iq_head)) << qi_shift) & 2596 VTD_IQH_QH_MASK); 2597 } 2598 } 2599 2600 /* Handle write to Invalidation Queue Tail Register */ 2601 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2602 { 2603 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2604 2605 if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { 2606 error_report_once("%s: RSV bit is set: val=0x%"PRIx64, 2607 __func__, val); 2608 return; 2609 } 2610 s->iq_tail = VTD_IQT_QT(s->iq_dw, val); 2611 trace_vtd_inv_qi_tail(s->iq_tail); 2612 2613 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2614 /* Process Invalidation Queue here */ 2615 vtd_fetch_inv_desc(s); 2616 } 2617 } 2618 2619 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2620 { 2621 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2622 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2623 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2624 2625 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2626 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2627 trace_vtd_fsts_clear_ip(); 2628 } 2629 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2630 * Descriptors if there are any when Queued Invalidation is enabled? 
2631 */ 2632 } 2633 2634 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2635 { 2636 uint32_t fectl_reg; 2637 /* FIXME: when software clears the IM field, check the IP field. But do we 2638 * need to compare the old value and the new value to conclude that 2639 * software clears the IM field? Or just check if the IM field is zero? 2640 */ 2641 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2642 2643 trace_vtd_reg_write_fectl(fectl_reg); 2644 2645 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2646 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2647 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2648 } 2649 } 2650 2651 static void vtd_handle_ics_write(IntelIOMMUState *s) 2652 { 2653 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2654 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2655 2656 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2657 trace_vtd_reg_ics_clear_ip(); 2658 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2659 } 2660 } 2661 2662 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2663 { 2664 uint32_t iectl_reg; 2665 /* FIXME: when software clears the IM field, check the IP field. But do we 2666 * need to compare the old value and the new value to conclude that 2667 * software clears the IM field? Or just check if the IM field is zero? 
2668 */ 2669 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2670 2671 trace_vtd_reg_write_iectl(iectl_reg); 2672 2673 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2674 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2675 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2676 } 2677 } 2678 2679 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2680 { 2681 IntelIOMMUState *s = opaque; 2682 uint64_t val; 2683 2684 trace_vtd_reg_read(addr, size); 2685 2686 if (addr + size > DMAR_REG_SIZE) { 2687 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2688 " size=0x%x", __func__, addr, size); 2689 return (uint64_t)-1; 2690 } 2691 2692 switch (addr) { 2693 /* Root Table Address Register, 64-bit */ 2694 case DMAR_RTADDR_REG: 2695 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 2696 if (size == 4) { 2697 val = val & ((1ULL << 32) - 1); 2698 } 2699 break; 2700 2701 case DMAR_RTADDR_REG_HI: 2702 assert(size == 4); 2703 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; 2704 break; 2705 2706 /* Invalidation Queue Address Register, 64-bit */ 2707 case DMAR_IQA_REG: 2708 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2709 if (size == 4) { 2710 val = val & ((1ULL << 32) - 1); 2711 } 2712 break; 2713 2714 case DMAR_IQA_REG_HI: 2715 assert(size == 4); 2716 val = s->iq >> 32; 2717 break; 2718 2719 default: 2720 if (size == 4) { 2721 val = vtd_get_long(s, addr); 2722 } else { 2723 val = vtd_get_quad(s, addr); 2724 } 2725 } 2726 2727 return val; 2728 } 2729 2730 static void vtd_mem_write(void *opaque, hwaddr addr, 2731 uint64_t val, unsigned size) 2732 { 2733 IntelIOMMUState *s = opaque; 2734 2735 trace_vtd_reg_write(addr, size, val); 2736 2737 if (addr + size > DMAR_REG_SIZE) { 2738 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2739 " size=0x%x", __func__, addr, size); 2740 return; 2741 } 2742 2743 switch (addr) { 2744 /* Global Command Register, 32-bit */ 2745 case DMAR_GCMD_REG: 2746 
vtd_set_long(s, addr, val); 2747 vtd_handle_gcmd_write(s); 2748 break; 2749 2750 /* Context Command Register, 64-bit */ 2751 case DMAR_CCMD_REG: 2752 if (size == 4) { 2753 vtd_set_long(s, addr, val); 2754 } else { 2755 vtd_set_quad(s, addr, val); 2756 vtd_handle_ccmd_write(s); 2757 } 2758 break; 2759 2760 case DMAR_CCMD_REG_HI: 2761 assert(size == 4); 2762 vtd_set_long(s, addr, val); 2763 vtd_handle_ccmd_write(s); 2764 break; 2765 2766 /* IOTLB Invalidation Register, 64-bit */ 2767 case DMAR_IOTLB_REG: 2768 if (size == 4) { 2769 vtd_set_long(s, addr, val); 2770 } else { 2771 vtd_set_quad(s, addr, val); 2772 vtd_handle_iotlb_write(s); 2773 } 2774 break; 2775 2776 case DMAR_IOTLB_REG_HI: 2777 assert(size == 4); 2778 vtd_set_long(s, addr, val); 2779 vtd_handle_iotlb_write(s); 2780 break; 2781 2782 /* Invalidate Address Register, 64-bit */ 2783 case DMAR_IVA_REG: 2784 if (size == 4) { 2785 vtd_set_long(s, addr, val); 2786 } else { 2787 vtd_set_quad(s, addr, val); 2788 } 2789 break; 2790 2791 case DMAR_IVA_REG_HI: 2792 assert(size == 4); 2793 vtd_set_long(s, addr, val); 2794 break; 2795 2796 /* Fault Status Register, 32-bit */ 2797 case DMAR_FSTS_REG: 2798 assert(size == 4); 2799 vtd_set_long(s, addr, val); 2800 vtd_handle_fsts_write(s); 2801 break; 2802 2803 /* Fault Event Control Register, 32-bit */ 2804 case DMAR_FECTL_REG: 2805 assert(size == 4); 2806 vtd_set_long(s, addr, val); 2807 vtd_handle_fectl_write(s); 2808 break; 2809 2810 /* Fault Event Data Register, 32-bit */ 2811 case DMAR_FEDATA_REG: 2812 assert(size == 4); 2813 vtd_set_long(s, addr, val); 2814 break; 2815 2816 /* Fault Event Address Register, 32-bit */ 2817 case DMAR_FEADDR_REG: 2818 if (size == 4) { 2819 vtd_set_long(s, addr, val); 2820 } else { 2821 /* 2822 * While the register is 32-bit only, some guests (Xen...) write to 2823 * it with 64-bit. 
2824 */ 2825 vtd_set_quad(s, addr, val); 2826 } 2827 break; 2828 2829 /* Fault Event Upper Address Register, 32-bit */ 2830 case DMAR_FEUADDR_REG: 2831 assert(size == 4); 2832 vtd_set_long(s, addr, val); 2833 break; 2834 2835 /* Protected Memory Enable Register, 32-bit */ 2836 case DMAR_PMEN_REG: 2837 assert(size == 4); 2838 vtd_set_long(s, addr, val); 2839 break; 2840 2841 /* Root Table Address Register, 64-bit */ 2842 case DMAR_RTADDR_REG: 2843 if (size == 4) { 2844 vtd_set_long(s, addr, val); 2845 } else { 2846 vtd_set_quad(s, addr, val); 2847 } 2848 break; 2849 2850 case DMAR_RTADDR_REG_HI: 2851 assert(size == 4); 2852 vtd_set_long(s, addr, val); 2853 break; 2854 2855 /* Invalidation Queue Tail Register, 64-bit */ 2856 case DMAR_IQT_REG: 2857 if (size == 4) { 2858 vtd_set_long(s, addr, val); 2859 } else { 2860 vtd_set_quad(s, addr, val); 2861 } 2862 vtd_handle_iqt_write(s); 2863 break; 2864 2865 case DMAR_IQT_REG_HI: 2866 assert(size == 4); 2867 vtd_set_long(s, addr, val); 2868 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2869 break; 2870 2871 /* Invalidation Queue Address Register, 64-bit */ 2872 case DMAR_IQA_REG: 2873 if (size == 4) { 2874 vtd_set_long(s, addr, val); 2875 } else { 2876 vtd_set_quad(s, addr, val); 2877 } 2878 if (s->ecap & VTD_ECAP_SMTS && 2879 val & VTD_IQA_DW_MASK) { 2880 s->iq_dw = true; 2881 } else { 2882 s->iq_dw = false; 2883 } 2884 break; 2885 2886 case DMAR_IQA_REG_HI: 2887 assert(size == 4); 2888 vtd_set_long(s, addr, val); 2889 break; 2890 2891 /* Invalidation Completion Status Register, 32-bit */ 2892 case DMAR_ICS_REG: 2893 assert(size == 4); 2894 vtd_set_long(s, addr, val); 2895 vtd_handle_ics_write(s); 2896 break; 2897 2898 /* Invalidation Event Control Register, 32-bit */ 2899 case DMAR_IECTL_REG: 2900 assert(size == 4); 2901 vtd_set_long(s, addr, val); 2902 vtd_handle_iectl_write(s); 2903 break; 2904 2905 /* Invalidation Event Data Register, 32-bit */ 2906 case DMAR_IEDATA_REG: 2907 assert(size == 4); 2908 vtd_set_long(s, 
addr, val); 2909 break; 2910 2911 /* Invalidation Event Address Register, 32-bit */ 2912 case DMAR_IEADDR_REG: 2913 assert(size == 4); 2914 vtd_set_long(s, addr, val); 2915 break; 2916 2917 /* Invalidation Event Upper Address Register, 32-bit */ 2918 case DMAR_IEUADDR_REG: 2919 assert(size == 4); 2920 vtd_set_long(s, addr, val); 2921 break; 2922 2923 /* Fault Recording Registers, 128-bit */ 2924 case DMAR_FRCD_REG_0_0: 2925 if (size == 4) { 2926 vtd_set_long(s, addr, val); 2927 } else { 2928 vtd_set_quad(s, addr, val); 2929 } 2930 break; 2931 2932 case DMAR_FRCD_REG_0_1: 2933 assert(size == 4); 2934 vtd_set_long(s, addr, val); 2935 break; 2936 2937 case DMAR_FRCD_REG_0_2: 2938 if (size == 4) { 2939 vtd_set_long(s, addr, val); 2940 } else { 2941 vtd_set_quad(s, addr, val); 2942 /* May clear bit 127 (Fault), update PPF */ 2943 vtd_update_fsts_ppf(s); 2944 } 2945 break; 2946 2947 case DMAR_FRCD_REG_0_3: 2948 assert(size == 4); 2949 vtd_set_long(s, addr, val); 2950 /* May clear bit 127 (Fault), update PPF */ 2951 vtd_update_fsts_ppf(s); 2952 break; 2953 2954 case DMAR_IRTA_REG: 2955 if (size == 4) { 2956 vtd_set_long(s, addr, val); 2957 } else { 2958 vtd_set_quad(s, addr, val); 2959 } 2960 break; 2961 2962 case DMAR_IRTA_REG_HI: 2963 assert(size == 4); 2964 vtd_set_long(s, addr, val); 2965 break; 2966 2967 default: 2968 if (size == 4) { 2969 vtd_set_long(s, addr, val); 2970 } else { 2971 vtd_set_quad(s, addr, val); 2972 } 2973 } 2974 } 2975 2976 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2977 IOMMUAccessFlags flag, int iommu_idx) 2978 { 2979 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2980 IntelIOMMUState *s = vtd_as->iommu_state; 2981 IOMMUTLBEntry iotlb = { 2982 /* We'll fill in the rest later. 
*/ 2983 .target_as = &address_space_memory, 2984 }; 2985 bool success; 2986 2987 if (likely(s->dmar_enabled)) { 2988 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2989 addr, flag & IOMMU_WO, &iotlb); 2990 } else { 2991 /* DMAR disabled, passthrough, use 4k-page*/ 2992 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2993 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2994 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2995 iotlb.perm = IOMMU_RW; 2996 success = true; 2997 } 2998 2999 if (likely(success)) { 3000 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 3001 VTD_PCI_SLOT(vtd_as->devfn), 3002 VTD_PCI_FUNC(vtd_as->devfn), 3003 iotlb.iova, iotlb.translated_addr, 3004 iotlb.addr_mask); 3005 } else { 3006 error_report_once("%s: detected translation failure " 3007 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 3008 __func__, pci_bus_num(vtd_as->bus), 3009 VTD_PCI_SLOT(vtd_as->devfn), 3010 VTD_PCI_FUNC(vtd_as->devfn), 3011 addr); 3012 } 3013 3014 return iotlb; 3015 } 3016 3017 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 3018 IOMMUNotifierFlag old, 3019 IOMMUNotifierFlag new, 3020 Error **errp) 3021 { 3022 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 3023 IntelIOMMUState *s = vtd_as->iommu_state; 3024 3025 /* Update per-address-space notifier flags */ 3026 vtd_as->notifier_flags = new; 3027 3028 if (old == IOMMU_NOTIFIER_NONE) { 3029 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 3030 } else if (new == IOMMU_NOTIFIER_NONE) { 3031 QLIST_REMOVE(vtd_as, next); 3032 } 3033 return 0; 3034 } 3035 3036 static int vtd_post_load(void *opaque, int version_id) 3037 { 3038 IntelIOMMUState *iommu = opaque; 3039 3040 /* 3041 * Memory regions are dynamically turned on/off depending on 3042 * context entry configurations from the guest. After migration, 3043 * we need to make sure the memory regions are still correct. 
3044 */ 3045 vtd_switch_address_space_all(iommu); 3046 3047 /* 3048 * We don't need to migrate the root_scalable because we can 3049 * simply do the calculation after the loading is complete. We 3050 * can actually do similar things with root, dmar_enabled, etc. 3051 * however since we've had them already so we'd better keep them 3052 * for compatibility of migration. 3053 */ 3054 vtd_update_scalable_state(iommu); 3055 3056 return 0; 3057 } 3058 3059 static const VMStateDescription vtd_vmstate = { 3060 .name = "iommu-intel", 3061 .version_id = 1, 3062 .minimum_version_id = 1, 3063 .priority = MIG_PRI_IOMMU, 3064 .post_load = vtd_post_load, 3065 .fields = (VMStateField[]) { 3066 VMSTATE_UINT64(root, IntelIOMMUState), 3067 VMSTATE_UINT64(intr_root, IntelIOMMUState), 3068 VMSTATE_UINT64(iq, IntelIOMMUState), 3069 VMSTATE_UINT32(intr_size, IntelIOMMUState), 3070 VMSTATE_UINT16(iq_head, IntelIOMMUState), 3071 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 3072 VMSTATE_UINT16(iq_size, IntelIOMMUState), 3073 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 3074 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 3075 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 3076 VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ 3077 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 3078 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 3079 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 3080 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 3081 VMSTATE_END_OF_LIST() 3082 } 3083 }; 3084 3085 static const MemoryRegionOps vtd_mem_ops = { 3086 .read = vtd_mem_read, 3087 .write = vtd_mem_write, 3088 .endianness = DEVICE_LITTLE_ENDIAN, 3089 .impl = { 3090 .min_access_size = 4, 3091 .max_access_size = 8, 3092 }, 3093 .valid = { 3094 .min_access_size = 4, 3095 .max_access_size = 8, 3096 }, 3097 }; 3098 3099 static Property vtd_properties[] = { 3100 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 3101 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 3102 ON_OFF_AUTO_AUTO), 
3103 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 3104 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, 3105 VTD_HOST_ADDRESS_WIDTH), 3106 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 3107 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), 3108 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), 3109 DEFINE_PROP_END_OF_LIST(), 3110 }; 3111 3112 /* Read IRTE entry with specific index */ 3113 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 3114 VTD_IR_TableEntry *entry, uint16_t sid) 3115 { 3116 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 3117 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 3118 dma_addr_t addr = 0x00; 3119 uint16_t mask, source_id; 3120 uint8_t bus, bus_max, bus_min; 3121 3122 if (index >= iommu->intr_size) { 3123 error_report_once("%s: index too large: ind=0x%x", 3124 __func__, index); 3125 return -VTD_FR_IR_INDEX_OVER; 3126 } 3127 3128 addr = iommu->intr_root + index * sizeof(*entry); 3129 if (dma_memory_read(&address_space_memory, addr, 3130 entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) { 3131 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 3132 __func__, index, addr); 3133 return -VTD_FR_IR_ROOT_INVAL; 3134 } 3135 3136 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 3137 le64_to_cpu(entry->data[0])); 3138 3139 if (!entry->irte.present) { 3140 error_report_once("%s: detected non-present IRTE " 3141 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3142 __func__, index, le64_to_cpu(entry->data[1]), 3143 le64_to_cpu(entry->data[0])); 3144 return -VTD_FR_IR_ENTRY_P; 3145 } 3146 3147 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 3148 entry->irte.__reserved_2) { 3149 error_report_once("%s: detected non-zero reserved IRTE " 3150 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3151 __func__, index, le64_to_cpu(entry->data[1]), 3152 le64_to_cpu(entry->data[0])); 3153 return -VTD_FR_IR_IRTE_RSVD; 3154 } 
3155 3156 if (sid != X86_IOMMU_SID_INVALID) { 3157 /* Validate IRTE SID */ 3158 source_id = le32_to_cpu(entry->irte.source_id); 3159 switch (entry->irte.sid_vtype) { 3160 case VTD_SVT_NONE: 3161 break; 3162 3163 case VTD_SVT_ALL: 3164 mask = vtd_svt_mask[entry->irte.sid_q]; 3165 if ((source_id & mask) != (sid & mask)) { 3166 error_report_once("%s: invalid IRTE SID " 3167 "(index=%u, sid=%u, source_id=%u)", 3168 __func__, index, sid, source_id); 3169 return -VTD_FR_IR_SID_ERR; 3170 } 3171 break; 3172 3173 case VTD_SVT_BUS: 3174 bus_max = source_id >> 8; 3175 bus_min = source_id & 0xff; 3176 bus = sid >> 8; 3177 if (bus > bus_max || bus < bus_min) { 3178 error_report_once("%s: invalid SVT_BUS " 3179 "(index=%u, bus=%u, min=%u, max=%u)", 3180 __func__, index, bus, bus_min, bus_max); 3181 return -VTD_FR_IR_SID_ERR; 3182 } 3183 break; 3184 3185 default: 3186 error_report_once("%s: detected invalid IRTE SVT " 3187 "(index=%u, type=%d)", __func__, 3188 index, entry->irte.sid_vtype); 3189 /* Take this as verification failure. 
*/ 3190 return -VTD_FR_IR_SID_ERR; 3191 } 3192 } 3193 3194 return 0; 3195 } 3196 3197 /* Fetch IRQ information of specific IR index */ 3198 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 3199 X86IOMMUIrq *irq, uint16_t sid) 3200 { 3201 VTD_IR_TableEntry irte = {}; 3202 int ret = 0; 3203 3204 ret = vtd_irte_get(iommu, index, &irte, sid); 3205 if (ret) { 3206 return ret; 3207 } 3208 3209 irq->trigger_mode = irte.irte.trigger_mode; 3210 irq->vector = irte.irte.vector; 3211 irq->delivery_mode = irte.irte.delivery_mode; 3212 irq->dest = le32_to_cpu(irte.irte.dest_id); 3213 if (!iommu->intr_eime) { 3214 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 3215 #define VTD_IR_APIC_DEST_SHIFT (8) 3216 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 3217 VTD_IR_APIC_DEST_SHIFT; 3218 } 3219 irq->dest_mode = irte.irte.dest_mode; 3220 irq->redir_hint = irte.irte.redir_hint; 3221 3222 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 3223 irq->delivery_mode, irq->dest, irq->dest_mode); 3224 3225 return 0; 3226 } 3227 3228 /* Interrupt remapping for MSI/MSI-X entry */ 3229 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 3230 MSIMessage *origin, 3231 MSIMessage *translated, 3232 uint16_t sid) 3233 { 3234 int ret = 0; 3235 VTD_IR_MSIAddress addr; 3236 uint16_t index; 3237 X86IOMMUIrq irq = {}; 3238 3239 assert(origin && translated); 3240 3241 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 3242 3243 if (!iommu || !iommu->intr_enabled) { 3244 memcpy(translated, origin, sizeof(*origin)); 3245 goto out; 3246 } 3247 3248 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 3249 error_report_once("%s: MSI address high 32 bits non-zero detected: " 3250 "address=0x%" PRIx64, __func__, origin->address); 3251 return -VTD_FR_IR_REQ_RSVD; 3252 } 3253 3254 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 3255 if (addr.addr.__head != 0xfee) { 3256 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 3257 __func__, addr.data); 3258 return 
-VTD_FR_IR_REQ_RSVD; 3259 } 3260 3261 /* This is compatible mode. */ 3262 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 3263 memcpy(translated, origin, sizeof(*origin)); 3264 goto out; 3265 } 3266 3267 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 3268 3269 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 3270 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 3271 3272 if (addr.addr.sub_valid) { 3273 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 3274 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 3275 } 3276 3277 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 3278 if (ret) { 3279 return ret; 3280 } 3281 3282 if (addr.addr.sub_valid) { 3283 trace_vtd_ir_remap_type("MSI"); 3284 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 3285 error_report_once("%s: invalid IR MSI " 3286 "(sid=%u, address=0x%" PRIx64 3287 ", data=0x%" PRIx32 ")", 3288 __func__, sid, origin->address, origin->data); 3289 return -VTD_FR_IR_REQ_RSVD; 3290 } 3291 } else { 3292 uint8_t vector = origin->data & 0xff; 3293 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 3294 3295 trace_vtd_ir_remap_type("IOAPIC"); 3296 /* IOAPIC entry vector should be aligned with IRTE vector 3297 * (see vt-d spec 5.1.5.1). */ 3298 if (vector != irq.vector) { 3299 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 3300 } 3301 3302 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 3303 * (see vt-d spec 5.1.5.1). */ 3304 if (trigger_mode != irq.trigger_mode) { 3305 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 3306 irq.trigger_mode); 3307 } 3308 } 3309 3310 /* 3311 * We'd better keep the last two bits, assuming that guest OS 3312 * might modify it. Keep it does not hurt after all. 
3313 */ 3314 irq.msi_addr_last_bits = addr.addr.__not_care; 3315 3316 /* Translate X86IOMMUIrq to MSI message */ 3317 x86_iommu_irq_to_msi_message(&irq, translated); 3318 3319 out: 3320 trace_vtd_ir_remap_msi(origin->address, origin->data, 3321 translated->address, translated->data); 3322 return 0; 3323 } 3324 3325 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 3326 MSIMessage *dst, uint16_t sid) 3327 { 3328 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 3329 src, dst, sid); 3330 } 3331 3332 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 3333 uint64_t *data, unsigned size, 3334 MemTxAttrs attrs) 3335 { 3336 return MEMTX_OK; 3337 } 3338 3339 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 3340 uint64_t value, unsigned size, 3341 MemTxAttrs attrs) 3342 { 3343 int ret = 0; 3344 MSIMessage from = {}, to = {}; 3345 uint16_t sid = X86_IOMMU_SID_INVALID; 3346 3347 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 3348 from.data = (uint32_t) value; 3349 3350 if (!attrs.unspecified) { 3351 /* We have explicit Source ID */ 3352 sid = attrs.requester_id; 3353 } 3354 3355 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 3356 if (ret) { 3357 /* TODO: report error */ 3358 /* Drop this interrupt */ 3359 return MEMTX_ERROR; 3360 } 3361 3362 apic_get_class()->send_msi(&to); 3363 3364 return MEMTX_OK; 3365 } 3366 3367 static const MemoryRegionOps vtd_mem_ir_ops = { 3368 .read_with_attrs = vtd_mem_ir_read, 3369 .write_with_attrs = vtd_mem_ir_write, 3370 .endianness = DEVICE_LITTLE_ENDIAN, 3371 .impl = { 3372 .min_access_size = 4, 3373 .max_access_size = 4, 3374 }, 3375 .valid = { 3376 .min_access_size = 4, 3377 .max_access_size = 4, 3378 }, 3379 }; 3380 3381 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 3382 { 3383 uintptr_t key = (uintptr_t)bus; 3384 VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); 3385 VTDAddressSpace *vtd_dev_as; 3386 char name[128]; 3387 3388 
if (!vtd_bus) { 3389 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 3390 *new_key = (uintptr_t)bus; 3391 /* No corresponding free() */ 3392 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 3393 PCI_DEVFN_MAX); 3394 vtd_bus->bus = bus; 3395 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 3396 } 3397 3398 vtd_dev_as = vtd_bus->dev_as[devfn]; 3399 3400 if (!vtd_dev_as) { 3401 snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), 3402 PCI_FUNC(devfn)); 3403 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 3404 3405 vtd_dev_as->bus = bus; 3406 vtd_dev_as->devfn = (uint8_t)devfn; 3407 vtd_dev_as->iommu_state = s; 3408 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 3409 vtd_dev_as->iova_tree = iova_tree_new(); 3410 3411 memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); 3412 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); 3413 3414 /* 3415 * Build the DMAR-disabled container with aliases to the 3416 * shared MRs. Note that aliasing to a shared memory region 3417 * could help the memory API to detect same FlatViews so we 3418 * can have devices to share the same FlatView when DMAR is 3419 * disabled (either by not providing "intel_iommu=on" or with 3420 * "iommu=pt"). It will greatly reduce the total number of 3421 * FlatViews of the system hence VM runs faster. 3422 */ 3423 memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), 3424 "vtd-nodmar", &s->mr_nodmar, 0, 3425 memory_region_size(&s->mr_nodmar)); 3426 3427 /* 3428 * Build the per-device DMAR-enabled container. 3429 * 3430 * TODO: currently we have per-device IOMMU memory region only 3431 * because we have per-device IOMMU notifiers for devices. If 3432 * one day we can abstract the IOMMU notifiers out of the 3433 * memory regions then we can also share the same memory 3434 * region here just like what we've done above with the nodmar 3435 * region. 
3436 */ 3437 strcat(name, "-dmar"); 3438 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 3439 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 3440 name, UINT64_MAX); 3441 memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", 3442 &s->mr_ir, 0, memory_region_size(&s->mr_ir)); 3443 memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), 3444 VTD_INTERRUPT_ADDR_FIRST, 3445 &vtd_dev_as->iommu_ir, 1); 3446 3447 /* 3448 * Hook both the containers under the root container, we 3449 * switch between DMAR & noDMAR by enable/disable 3450 * corresponding sub-containers 3451 */ 3452 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3453 MEMORY_REGION(&vtd_dev_as->iommu), 3454 0); 3455 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3456 &vtd_dev_as->nodmar, 0); 3457 3458 vtd_switch_address_space(vtd_dev_as); 3459 } 3460 return vtd_dev_as; 3461 } 3462 3463 /* Unmap the whole range in the notifier's scope. */ 3464 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 3465 { 3466 hwaddr size, remain; 3467 hwaddr start = n->start; 3468 hwaddr end = n->end; 3469 IntelIOMMUState *s = as->iommu_state; 3470 DMAMap map; 3471 3472 /* 3473 * Note: all the codes in this function has a assumption that IOVA 3474 * bits are no more than VTD_MGAW bits (which is restricted by 3475 * VT-d spec), otherwise we need to consider overflow of 64 bits. 
3476 */ 3477 3478 if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { 3479 /* 3480 * Don't need to unmap regions that is bigger than the whole 3481 * VT-d supported address space size 3482 */ 3483 end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; 3484 } 3485 3486 assert(start <= end); 3487 size = remain = end - start + 1; 3488 3489 while (remain >= VTD_PAGE_SIZE) { 3490 IOMMUTLBEvent event; 3491 uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); 3492 uint64_t size = mask + 1; 3493 3494 assert(size); 3495 3496 event.type = IOMMU_NOTIFIER_UNMAP; 3497 event.entry.iova = start; 3498 event.entry.addr_mask = mask; 3499 event.entry.target_as = &address_space_memory; 3500 event.entry.perm = IOMMU_NONE; 3501 /* This field is meaningless for unmap */ 3502 event.entry.translated_addr = 0; 3503 3504 memory_region_notify_iommu_one(n, &event); 3505 3506 start += size; 3507 remain -= size; 3508 } 3509 3510 assert(!remain); 3511 3512 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3513 VTD_PCI_SLOT(as->devfn), 3514 VTD_PCI_FUNC(as->devfn), 3515 n->start, size); 3516 3517 map.iova = n->start; 3518 map.size = size; 3519 iova_tree_remove(as->iova_tree, &map); 3520 } 3521 3522 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3523 { 3524 VTDAddressSpace *vtd_as; 3525 IOMMUNotifier *n; 3526 3527 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3528 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3529 vtd_address_space_unmap(vtd_as, n); 3530 } 3531 } 3532 } 3533 3534 static void vtd_address_space_refresh_all(IntelIOMMUState *s) 3535 { 3536 vtd_address_space_unmap_all(s); 3537 vtd_switch_address_space_all(s); 3538 } 3539 3540 static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) 3541 { 3542 memory_region_notify_iommu_one(private, event); 3543 return 0; 3544 } 3545 3546 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3547 { 3548 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3549 IntelIOMMUState *s = 
vtd_as->iommu_state; 3550 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3551 VTDContextEntry ce; 3552 3553 /* 3554 * The replay can be triggered by either a invalidation or a newly 3555 * created entry. No matter what, we release existing mappings 3556 * (it means flushing caches for UNMAP-only registers). 3557 */ 3558 vtd_address_space_unmap(vtd_as, n); 3559 3560 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3561 trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : 3562 "legacy mode", 3563 bus_n, PCI_SLOT(vtd_as->devfn), 3564 PCI_FUNC(vtd_as->devfn), 3565 vtd_get_domain_id(s, &ce), 3566 ce.hi, ce.lo); 3567 if (vtd_as_has_map_notifier(vtd_as)) { 3568 /* This is required only for MAP typed notifiers */ 3569 vtd_page_walk_info info = { 3570 .hook_fn = vtd_replay_hook, 3571 .private = (void *)n, 3572 .notify_unmap = false, 3573 .aw = s->aw_bits, 3574 .as = vtd_as, 3575 .domain_id = vtd_get_domain_id(s, &ce), 3576 }; 3577 3578 vtd_page_walk(s, &ce, 0, ~0ULL, &info); 3579 } 3580 } else { 3581 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3582 PCI_FUNC(vtd_as->devfn)); 3583 } 3584 3585 return; 3586 } 3587 3588 /* Do the initialization. It will also be called when reset, so pay 3589 * attention when adding new initialization stuff. 
*/
static void vtd_init(IntelIOMMUState *s)
{
    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);

    /* Wipe register values and every per-register access mask */
    memset(s->csr, 0, DMAR_REG_SIZE);
    memset(s->wmask, 0, DMAR_REG_SIZE);
    memset(s->w1cmask, 0, DMAR_REG_SIZE);
    memset(s->womask, 0, DMAR_REG_SIZE);

    /* Clear the runtime state kept outside of the register file */
    s->root = 0;
    s->root_scalable = false;
    s->dmar_enabled = false;
    s->intr_enabled = false;
    s->iq_head = 0;
    s->iq_tail = 0;
    s->iq = 0;
    s->iq_size = 0;
    s->qi_enabled = false;
    s->iq_last_desc_type = VTD_INV_DESC_NONE;
    s->iq_dw = false;
    s->next_frcd_reg = 0;

    /*
     * Build the capability words from the device configuration; they
     * become the values of DMAR_CAP_REG/DMAR_ECAP_REG further down.
     */
    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
             VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
    if (s->dma_drain) {
        s->cap |= VTD_CAP_DRAIN;
    }
    if (s->aw_bits == VTD_HOST_AW_48BIT) {
        s->cap |= VTD_CAP_SAGAW_48bit;
    }
    s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;

    /*
     * Rsvd field masks for spte
     */
    vtd_spte_rsvd[0] = ~0ULL;
    vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
                                                  x86_iommu->dt_supported);
    vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
    vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
    vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);

    vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
                                                         x86_iommu->dt_supported);
    vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
                                                         x86_iommu->dt_supported);

    /* In scalable mode the SNP bit is taken out of the reserved masks */
    if (s->scalable_mode) {
        vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
        vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
        vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
    }

    if (x86_iommu_ir_supported(x86_iommu)) {
        s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
        if (s->intr_eim == ON_OFF_AUTO_ON) {
            s->ecap |= VTD_ECAP_EIM;
        }
        /* "auto" must have been resolved to on/off before we get here */
        assert(s->intr_eim != ON_OFF_AUTO_AUTO);
    }

    if (x86_iommu->dt_supported) {
        s->ecap |= VTD_ECAP_DT;
    }

    if (x86_iommu->pt_supported) {
        s->ecap |= VTD_ECAP_PT;
    }

    if (s->caching_mode) {
        s->cap |= VTD_CAP_CM;
    }

    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
    if (s->scalable_mode) {
        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
    }

    vtd_reset_caches(s);

    /*
     * Define registers with default values and bit semantics.
     *
     * vtd_define_long()/vtd_define_quad() take the register offset, its
     * initial value, the mask of writable bits and the mask of
     * write-1-to-clear bits; a zero writable mask leaves the register
     * unchanged by vtd_set_long()/vtd_set_quad().  The *_wo() variants
     * fill the separate womask array (presumably write-only bits).
     */
    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);

    /* Advanced Fault Logging not supported */
    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);

    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
     */
    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);

    /* Treated as RO for implementations that PLMR and PHMR fields reported
     * as Clear in the CAP_REG.
     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
     */
    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);

    /* Invalidation queue and invalidation event registers */
    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);

    /* IOTLB registers */
    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0);
    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);

    /* Fault Recording Registers, 128-bit */
    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);

    /*
     * Interrupt remapping registers.
     */
    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
}

/* Should not reset address_spaces when reset because devices will still use
 * the address space they got at first (won't ask the bus again).
3726 */ 3727 static void vtd_reset(DeviceState *dev) 3728 { 3729 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3730 3731 vtd_init(s); 3732 vtd_address_space_refresh_all(s); 3733 } 3734 3735 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3736 { 3737 IntelIOMMUState *s = opaque; 3738 VTDAddressSpace *vtd_as; 3739 3740 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3741 3742 vtd_as = vtd_find_add_as(s, bus, devfn); 3743 return &vtd_as->as; 3744 } 3745 3746 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3747 { 3748 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3749 3750 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { 3751 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3752 return false; 3753 } 3754 3755 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3756 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3757 && x86_iommu_ir_supported(x86_iommu) ? 3758 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3759 } 3760 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3761 if (!kvm_irqchip_in_kernel()) { 3762 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3763 return false; 3764 } 3765 if (!kvm_enable_x2apic()) { 3766 error_setg(errp, "eim=on requires support on the KVM side" 3767 "(X2APIC_API, first shipped in v4.7)"); 3768 return false; 3769 } 3770 } 3771 3772 /* Currently only address widths supported are 39 and 48 bits */ 3773 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3774 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3775 error_setg(errp, "Supported values for aw-bits are: %d, %d", 3776 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3777 return false; 3778 } 3779 3780 if (s->scalable_mode && !s->dma_drain) { 3781 error_setg(errp, "Need to set dma_drain for scalable mode"); 3782 return false; 3783 } 3784 3785 return true; 3786 } 3787 3788 static int vtd_machine_done_notify_one(Object *child, void *unused) 3789 { 3790 IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); 
3791 3792 /* 3793 * We hard-coded here because vfio-pci is the only special case 3794 * here. Let's be more elegant in the future when we can, but so 3795 * far there seems to be no better way. 3796 */ 3797 if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { 3798 vtd_panic_require_caching_mode(); 3799 } 3800 3801 return 0; 3802 } 3803 3804 static void vtd_machine_done_hook(Notifier *notifier, void *unused) 3805 { 3806 object_child_foreach_recursive(object_get_root(), 3807 vtd_machine_done_notify_one, NULL); 3808 } 3809 3810 static Notifier vtd_machine_done_notify = { 3811 .notify = vtd_machine_done_hook, 3812 }; 3813 3814 static void vtd_realize(DeviceState *dev, Error **errp) 3815 { 3816 MachineState *ms = MACHINE(qdev_get_machine()); 3817 PCMachineState *pcms = PC_MACHINE(ms); 3818 X86MachineState *x86ms = X86_MACHINE(ms); 3819 PCIBus *bus = pcms->bus; 3820 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3821 3822 if (!vtd_decide_config(s, errp)) { 3823 return; 3824 } 3825 3826 QLIST_INIT(&s->vtd_as_with_notifiers); 3827 qemu_mutex_init(&s->iommu_lock); 3828 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3829 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3830 "intel_iommu", DMAR_REG_SIZE); 3831 3832 /* Create the shared memory regions by all devices */ 3833 memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", 3834 UINT64_MAX); 3835 memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, 3836 s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); 3837 memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), 3838 "vtd-sys-alias", get_system_memory(), 0, 3839 memory_region_size(get_system_memory())); 3840 memory_region_add_subregion_overlap(&s->mr_nodmar, 0, 3841 &s->mr_sys_alias, 0); 3842 memory_region_add_subregion_overlap(&s->mr_nodmar, 3843 VTD_INTERRUPT_ADDR_FIRST, 3844 &s->mr_ir, 1); 3845 3846 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3847 /* No corresponding destroy */ 3848 s->iotlb = 
g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3849 g_free, g_free); 3850 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3851 g_free, g_free); 3852 vtd_init(s); 3853 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3854 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3855 /* Pseudo address space under root PCI bus. */ 3856 x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3857 qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); 3858 } 3859 3860 static void vtd_class_init(ObjectClass *klass, void *data) 3861 { 3862 DeviceClass *dc = DEVICE_CLASS(klass); 3863 X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); 3864 3865 dc->reset = vtd_reset; 3866 dc->vmsd = &vtd_vmstate; 3867 device_class_set_props(dc, vtd_properties); 3868 dc->hotpluggable = false; 3869 x86_class->realize = vtd_realize; 3870 x86_class->int_remap = vtd_int_remap; 3871 /* Supported by the pc-q35-* machine types */ 3872 dc->user_creatable = true; 3873 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3874 dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; 3875 } 3876 3877 static const TypeInfo vtd_info = { 3878 .name = TYPE_INTEL_IOMMU_DEVICE, 3879 .parent = TYPE_X86_IOMMU_DEVICE, 3880 .instance_size = sizeof(IntelIOMMUState), 3881 .class_init = vtd_class_init, 3882 }; 3883 3884 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3885 void *data) 3886 { 3887 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3888 3889 imrc->translate = vtd_iommu_translate; 3890 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3891 imrc->replay = vtd_iommu_replay; 3892 } 3893 3894 static const TypeInfo vtd_iommu_memory_region_info = { 3895 .parent = TYPE_IOMMU_MEMORY_REGION, 3896 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3897 .class_init = vtd_iommu_memory_region_class_init, 3898 }; 3899 3900 static void vtd_register_types(void) 3901 { 3902 type_register_static(&vtd_info); 3903 
type_register_static(&vtd_iommu_memory_region_info); 3904 } 3905 3906 type_init(vtd_register_types) 3907