1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qemu/main-loop.h" 25 #include "qapi/error.h" 26 #include "hw/sysbus.h" 27 #include "intel_iommu_internal.h" 28 #include "hw/pci/pci.h" 29 #include "hw/pci/pci_bus.h" 30 #include "hw/qdev-properties.h" 31 #include "hw/i386/pc.h" 32 #include "hw/i386/apic-msidef.h" 33 #include "hw/i386/x86-iommu.h" 34 #include "hw/pci-host/q35.h" 35 #include "sysemu/kvm.h" 36 #include "sysemu/dma.h" 37 #include "sysemu/sysemu.h" 38 #include "hw/i386/apic_internal.h" 39 #include "kvm/kvm_i386.h" 40 #include "migration/vmstate.h" 41 #include "trace.h" 42 43 /* context entry operations */ 44 #define VTD_CE_GET_RID2PASID(ce) \ 45 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) 46 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ 47 ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) 48 49 /* pe operations */ 50 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) 51 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) 52 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ 53 if (ret_fr) { \ 54 ret_fr = -ret_fr; \ 55 if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ 56 trace_vtd_fault_disabled(); \ 57 } else { \ 58 vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ 59 } \ 60 goto error; \ 61 } \ 62 } 63 64 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 65 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 66 67 static void vtd_panic_require_caching_mode(void) 68 { 69 error_report("We need to set caching-mode=on for intel-iommu to enable " 70 "device assignment with IOMMU protection."); 71 exit(1); 72 } 73 74 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 75 uint64_t wmask, uint64_t w1cmask) 76 { 77 stq_le_p(&s->csr[addr], val); 78 stq_le_p(&s->wmask[addr], wmask); 79 stq_le_p(&s->w1cmask[addr], w1cmask); 80 } 81 82 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 83 { 84 stq_le_p(&s->womask[addr], mask); 85 } 86 87 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 88 uint32_t wmask, uint32_t w1cmask) 89 { 90 stl_le_p(&s->csr[addr], val); 91 stl_le_p(&s->wmask[addr], wmask); 92 stl_le_p(&s->w1cmask[addr], w1cmask); 93 } 94 95 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 96 { 97 stl_le_p(&s->womask[addr], mask); 98 } 99 100 /* "External" get/set operations */ 101 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 102 { 103 uint64_t oldval = ldq_le_p(&s->csr[addr]); 104 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 105 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 106 stq_le_p(&s->csr[addr], 107 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 108 } 109 110 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 111 { 112 uint32_t oldval = ldl_le_p(&s->csr[addr]); 113 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 114 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 115 stl_le_p(&s->csr[addr], 116 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 117 } 118 119 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 120 { 121 uint64_t val = ldq_le_p(&s->csr[addr]); 122 uint64_t womask = ldq_le_p(&s->womask[addr]); 123 return val & ~womask; 124 } 125 126 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 127 { 128 uint32_t val = ldl_le_p(&s->csr[addr]); 129 uint32_t womask = ldl_le_p(&s->womask[addr]); 130 return val & ~womask; 131 } 132 133 /* "Internal" get/set operations */ 134 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 135 { 136 return ldq_le_p(&s->csr[addr]); 137 } 138 139 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 140 { 141 return ldl_le_p(&s->csr[addr]); 142 } 143 144 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 145 { 146 stq_le_p(&s->csr[addr], val); 147 } 148 149 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 150 uint32_t clear, uint32_t mask) 151 { 152 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 153 stl_le_p(&s->csr[addr], new_val); 154 return new_val; 155 } 156 157 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 158 uint64_t clear, uint64_t mask) 159 { 160 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 161 stq_le_p(&s->csr[addr], new_val); 162 return new_val; 163 } 164 165 static inline void vtd_iommu_lock(IntelIOMMUState *s) 166 { 167 qemu_mutex_lock(&s->iommu_lock); 168 } 169 170 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 171 { 172 qemu_mutex_unlock(&s->iommu_lock); 173 } 174 175 static void vtd_update_scalable_state(IntelIOMMUState *s) 176 { 177 uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 178 179 if (s->scalable_mode) { 180 s->root_scalable = val & VTD_RTADDR_SMT; 181 } 182 } 183 184 /* Whether the address space needs to notify new mappings */ 185 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 186 { 187 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 188 } 189 190 /* GHashTable functions */ 191 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 192 { 193 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 194 } 195 196 static guint vtd_uint64_hash(gconstpointer v) 197 { 198 return (guint)*(const uint64_t *)v; 199 } 200 201 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 202 gpointer user_data) 203 { 204 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 205 uint16_t domain_id = *(uint16_t *)user_data; 206 return entry->domain_id == domain_id; 207 } 208 209 /* The shift of an addr for a certain level of paging structure */ 210 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 211 { 212 assert(level != 0); 213 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 214 } 215 216 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 217 { 218 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 219 } 220 221 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 222 gpointer user_data) 223 { 224 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 225 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 226 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 227 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 228 return (entry->domain_id == info->domain_id) && 229 (((entry->gfn & info->mask) == gfn) || 230 (entry->gfn == gfn_tlb)); 231 } 232 233 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 234 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 235 */ 236 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 237 { 238 VTDAddressSpace *vtd_as; 239 VTDBus *vtd_bus; 240 GHashTableIter bus_it; 241 uint32_t devfn_it; 242 243 trace_vtd_context_cache_reset(); 244 245 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 246 247 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 248 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 249 vtd_as = vtd_bus->dev_as[devfn_it]; 250 if (!vtd_as) { 251 continue; 252 } 253 vtd_as->context_cache_entry.context_cache_gen = 0; 254 } 255 } 256 s->context_cache_gen = 1; 257 } 258 259 /* Must be called with IOMMU lock held. */ 260 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 261 { 262 assert(s->iotlb); 263 g_hash_table_remove_all(s->iotlb); 264 } 265 266 static void vtd_reset_iotlb(IntelIOMMUState *s) 267 { 268 vtd_iommu_lock(s); 269 vtd_reset_iotlb_locked(s); 270 vtd_iommu_unlock(s); 271 } 272 273 static void vtd_reset_caches(IntelIOMMUState *s) 274 { 275 vtd_iommu_lock(s); 276 vtd_reset_iotlb_locked(s); 277 vtd_reset_context_cache_locked(s); 278 vtd_iommu_unlock(s); 279 } 280 281 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 282 uint32_t level) 283 { 284 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 285 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 286 } 287 288 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 289 { 290 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 291 } 292 293 /* Must be called with IOMMU lock held */ 294 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 295 hwaddr addr) 296 { 297 VTDIOTLBEntry *entry; 298 uint64_t key; 299 int level; 300 301 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 302 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 303 source_id, level); 304 entry = g_hash_table_lookup(s->iotlb, &key); 305 if (entry) { 306 goto out; 307 } 308 } 309 310 out: 311 return entry; 312 } 313 314 /* Must be with IOMMU lock held */ 315 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 316 uint16_t domain_id, hwaddr addr, uint64_t slpte, 317 uint8_t access_flags, uint32_t level) 318 { 319 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 320 uint64_t *key = g_malloc(sizeof(*key)); 321 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 322 323 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 324 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 325 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 326 vtd_reset_iotlb_locked(s); 327 } 328 329 entry->gfn = gfn; 330 entry->domain_id = domain_id; 331 entry->slpte = slpte; 332 entry->access_flags = access_flags; 333 entry->mask = vtd_slpt_level_page_mask(level); 334 *key = vtd_get_iotlb_key(gfn, source_id, level); 335 g_hash_table_replace(s->iotlb, key, entry); 336 } 337 338 /* Given the reg addr of both the message data and address, generate an 339 * interrupt via MSI. 340 */ 341 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 342 hwaddr mesg_data_reg) 343 { 344 MSIMessage msi; 345 346 assert(mesg_data_reg < DMAR_REG_SIZE); 347 assert(mesg_addr_reg < DMAR_REG_SIZE); 348 349 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 350 msi.data = vtd_get_long_raw(s, mesg_data_reg); 351 352 trace_vtd_irq_generate(msi.address, msi.data); 353 354 apic_get_class()->send_msi(&msi); 355 } 356 357 /* Generate a fault event to software via MSI if conditions are met. 358 * Notice that the value of FSTS_REG being passed to it should be the one 359 * before any update. 360 */ 361 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) 362 { 363 if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || 364 pre_fsts & VTD_FSTS_IQE) { 365 error_report_once("There are previous interrupt conditions " 366 "to be serviced by software, fault event " 367 "is not generated"); 368 return; 369 } 370 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); 371 if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { 372 error_report_once("Interrupt Mask set, irq is not generated"); 373 } else { 374 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 375 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 376 } 377 } 378 379 /* Check if the Fault (F) field of the Fault Recording Register referenced by 380 * @index is Set. 381 */ 382 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) 383 { 384 /* Each reg is 128-bit */ 385 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 386 addr += 8; /* Access the high 64-bit half */ 387 388 assert(index < DMAR_FRCD_REG_NR); 389 390 return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; 391 } 392 393 /* Update the PPF field of Fault Status Register. 394 * Should be called whenever change the F field of any fault recording 395 * registers. 396 */ 397 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 398 { 399 uint32_t i; 400 uint32_t ppf_mask = 0; 401 402 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 403 if (vtd_is_frcd_set(s, i)) { 404 ppf_mask = VTD_FSTS_PPF; 405 break; 406 } 407 } 408 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 409 trace_vtd_fsts_ppf(!!ppf_mask); 410 } 411 412 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 413 { 414 /* Each reg is 128-bit */ 415 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 416 addr += 8; /* Access the high 64-bit half */ 417 418 assert(index < DMAR_FRCD_REG_NR); 419 420 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 421 vtd_update_fsts_ppf(s); 422 } 423 424 /* Must not update F field now, should be done later */ 425 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 426 uint16_t source_id, hwaddr addr, 427 VTDFaultReason fault, bool is_write) 428 { 429 uint64_t hi = 0, lo; 430 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 431 432 assert(index < DMAR_FRCD_REG_NR); 433 434 lo = VTD_FRCD_FI(addr); 435 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 436 if (!is_write) { 437 hi |= VTD_FRCD_T; 438 } 439 vtd_set_quad_raw(s, frcd_reg_addr, lo); 440 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 441 442 trace_vtd_frr_new(index, hi, lo); 443 } 444 445 /* Try to collapse multiple pending faults from the same requester */ 446 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 447 { 448 uint32_t i; 449 uint64_t frcd_reg; 450 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 451 452 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 453 frcd_reg = vtd_get_quad_raw(s, addr); 454 if ((frcd_reg & VTD_FRCD_F) && 455 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 456 return true; 457 } 458 addr += 16; /* 128-bit for each */ 459 } 460 return false; 461 } 462 463 /* Log and report an DMAR (address translation) fault to software */ 464 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 465 hwaddr addr, VTDFaultReason fault, 466 bool is_write) 467 { 468 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 469 470 assert(fault < VTD_FR_MAX); 471 472 if (fault == VTD_FR_RESERVED_ERR) { 473 /* This is not a normal fault reason case. Drop it. */ 474 return; 475 } 476 477 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 478 479 if (fsts_reg & VTD_FSTS_PFO) { 480 error_report_once("New fault is not recorded due to " 481 "Primary Fault Overflow"); 482 return; 483 } 484 485 if (vtd_try_collapse_fault(s, source_id)) { 486 error_report_once("New fault is not recorded due to " 487 "compression of faults"); 488 return; 489 } 490 491 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 492 error_report_once("Next Fault Recording Reg is used, " 493 "new fault is not recorded, set PFO field"); 494 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 495 return; 496 } 497 498 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 499 500 if (fsts_reg & VTD_FSTS_PPF) { 501 error_report_once("There are pending faults already, " 502 "fault event is not generated"); 503 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 504 s->next_frcd_reg++; 505 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 506 s->next_frcd_reg = 0; 507 } 508 } else { 509 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 510 VTD_FSTS_FRI(s->next_frcd_reg)); 511 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 512 s->next_frcd_reg++; 513 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 514 s->next_frcd_reg = 0; 515 } 516 /* This case actually cause the PPF to be Set. 517 * So generate fault event (interrupt). 518 */ 519 vtd_generate_fault_event(s, fsts_reg); 520 } 521 } 522 523 /* Handle Invalidation Queue Errors of queued invalidation interface error 524 * conditions. 525 */ 526 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 527 { 528 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 529 530 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 531 vtd_generate_fault_event(s, fsts_reg); 532 } 533 534 /* Set the IWC field and try to generate an invalidation completion interrupt */ 535 static void vtd_generate_completion_event(IntelIOMMUState *s) 536 { 537 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 538 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 539 return; 540 } 541 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 542 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 543 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 544 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 545 "new event not generated"); 546 return; 547 } else { 548 /* Generate the interrupt event */ 549 trace_vtd_inv_desc_wait_irq("Generating complete event"); 550 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 551 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 552 } 553 } 554 555 static inline bool vtd_root_entry_present(IntelIOMMUState *s, 556 VTDRootEntry *re, 557 uint8_t devfn) 558 { 559 if (s->root_scalable && devfn > UINT8_MAX / 2) { 560 return re->hi & VTD_ROOT_ENTRY_P; 561 } 562 563 return re->lo & VTD_ROOT_ENTRY_P; 564 } 565 566 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 567 VTDRootEntry *re) 568 { 569 dma_addr_t addr; 570 571 addr = s->root + index * sizeof(*re); 572 if (dma_memory_read(&address_space_memory, addr, 573 re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) { 574 re->lo = 0; 575 return -VTD_FR_ROOT_TABLE_INV; 576 } 577 re->lo = le64_to_cpu(re->lo); 578 re->hi = le64_to_cpu(re->hi); 579 return 0; 580 } 581 582 static inline bool vtd_ce_present(VTDContextEntry *context) 583 { 584 return context->lo & VTD_CONTEXT_ENTRY_P; 585 } 586 587 static int vtd_get_context_entry_from_root(IntelIOMMUState *s, 588 VTDRootEntry *re, 589 uint8_t index, 590 VTDContextEntry *ce) 591 { 592 dma_addr_t addr, ce_size; 593 594 /* we have checked that root entry is present */ 595 ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE : 596 VTD_CTX_ENTRY_LEGACY_SIZE; 597 598 if (s->root_scalable && index > UINT8_MAX / 2) { 599 index = index & (~VTD_DEVFN_CHECK_MASK); 600 addr = re->hi & VTD_ROOT_ENTRY_CTP; 601 } else { 602 addr = re->lo & VTD_ROOT_ENTRY_CTP; 603 } 604 605 addr = addr + index * ce_size; 606 if (dma_memory_read(&address_space_memory, addr, 607 ce, ce_size, MEMTXATTRS_UNSPECIFIED)) { 608 return -VTD_FR_CONTEXT_TABLE_INV; 609 } 610 611 ce->lo = le64_to_cpu(ce->lo); 612 ce->hi = le64_to_cpu(ce->hi); 613 if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { 614 ce->val[2] = le64_to_cpu(ce->val[2]); 615 ce->val[3] = le64_to_cpu(ce->val[3]); 616 } 617 return 0; 618 } 619 620 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) 621 { 622 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; 623 } 624 625 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) 626 { 627 return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); 628 } 629 630 /* Whether the pte indicates the address of the page frame */ 631 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) 632 { 633 return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); 634 } 635 636 /* Get the content of a spte located in @base_addr[@index] */ 637 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) 638 { 639 uint64_t slpte; 640 641 assert(index < VTD_SL_PT_ENTRY_NR); 642 643 if (dma_memory_read(&address_space_memory, 644 base_addr + index * sizeof(slpte), 645 &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) { 646 slpte = (uint64_t)-1; 647 return slpte; 648 } 649 slpte = le64_to_cpu(slpte); 650 return slpte; 651 } 652 653 /* Given an iova and the level of paging structure, return the offset 654 * of current level. 655 */ 656 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) 657 { 658 return (iova >> vtd_slpt_level_shift(level)) & 659 ((1ULL << VTD_SL_LEVEL_BITS) - 1); 660 } 661 662 /* Check Capability Register to see if the @level of page-table is supported */ 663 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) 664 { 665 return VTD_CAP_SAGAW_MASK & s->cap & 666 (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); 667 } 668 669 /* Return true if check passed, otherwise false */ 670 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, 671 VTDPASIDEntry *pe) 672 { 673 switch (VTD_PE_GET_TYPE(pe)) { 674 case VTD_SM_PASID_ENTRY_FLT: 675 case VTD_SM_PASID_ENTRY_SLT: 676 case VTD_SM_PASID_ENTRY_NESTED: 677 break; 678 case VTD_SM_PASID_ENTRY_PT: 679 if (!x86_iommu->pt_supported) { 680 return false; 681 } 682 break; 683 default: 684 /* Unknown type */ 685 return false; 686 } 687 return true; 688 } 689 690 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) 691 { 692 return pdire->val & 1; 693 } 694 695 /** 696 * Caller of this function should check present bit if wants 697 * to use pdir entry for further usage except for fpd bit check. 698 */ 699 static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, 700 uint32_t pasid, 701 VTDPASIDDirEntry *pdire) 702 { 703 uint32_t index; 704 dma_addr_t addr, entry_size; 705 706 index = VTD_PASID_DIR_INDEX(pasid); 707 entry_size = VTD_PASID_DIR_ENTRY_SIZE; 708 addr = pasid_dir_base + index * entry_size; 709 if (dma_memory_read(&address_space_memory, addr, 710 pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) { 711 return -VTD_FR_PASID_TABLE_INV; 712 } 713 714 return 0; 715 } 716 717 static inline bool vtd_pe_present(VTDPASIDEntry *pe) 718 { 719 return pe->val[0] & VTD_PASID_ENTRY_P; 720 } 721 722 static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, 723 uint32_t pasid, 724 dma_addr_t addr, 725 VTDPASIDEntry *pe) 726 { 727 uint32_t index; 728 dma_addr_t entry_size; 729 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 730 731 index = VTD_PASID_TABLE_INDEX(pasid); 732 entry_size = VTD_PASID_ENTRY_SIZE; 733 addr = addr + index * entry_size; 734 if (dma_memory_read(&address_space_memory, addr, 735 pe, entry_size, MEMTXATTRS_UNSPECIFIED)) { 736 return -VTD_FR_PASID_TABLE_INV; 737 } 738 739 /* Do translation type check */ 740 if (!vtd_pe_type_check(x86_iommu, pe)) { 741 return -VTD_FR_PASID_TABLE_INV; 742 } 743 744 if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { 745 return -VTD_FR_PASID_TABLE_INV; 746 } 747 748 return 0; 749 } 750 751 /** 752 * Caller of this function should check present bit if wants 753 * to use pasid entry for further usage except for fpd bit check. 754 */ 755 static int vtd_get_pe_from_pdire(IntelIOMMUState *s, 756 uint32_t pasid, 757 VTDPASIDDirEntry *pdire, 758 VTDPASIDEntry *pe) 759 { 760 dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; 761 762 return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe); 763 } 764 765 /** 766 * This function gets a pasid entry from a specified pasid 767 * table (includes dir and leaf table) with a specified pasid. 768 * Sanity check should be done to ensure return a present 769 * pasid entry to caller. 770 */ 771 static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, 772 dma_addr_t pasid_dir_base, 773 uint32_t pasid, 774 VTDPASIDEntry *pe) 775 { 776 int ret; 777 VTDPASIDDirEntry pdire; 778 779 ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, 780 pasid, &pdire); 781 if (ret) { 782 return ret; 783 } 784 785 if (!vtd_pdire_present(&pdire)) { 786 return -VTD_FR_PASID_TABLE_INV; 787 } 788 789 ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); 790 if (ret) { 791 return ret; 792 } 793 794 if (!vtd_pe_present(pe)) { 795 return -VTD_FR_PASID_TABLE_INV; 796 } 797 798 return 0; 799 } 800 801 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, 802 VTDContextEntry *ce, 803 VTDPASIDEntry *pe) 804 { 805 uint32_t pasid; 806 dma_addr_t pasid_dir_base; 807 int ret = 0; 808 809 pasid = VTD_CE_GET_RID2PASID(ce); 810 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 811 ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); 812 813 return ret; 814 } 815 816 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, 817 VTDContextEntry *ce, 818 bool *pe_fpd_set) 819 { 820 int ret; 821 uint32_t pasid; 822 dma_addr_t pasid_dir_base; 823 VTDPASIDDirEntry pdire; 824 VTDPASIDEntry pe; 825 826 pasid = VTD_CE_GET_RID2PASID(ce); 827 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 828 829 /* 830 * No present bit check since fpd is meaningful even 831 * if the present bit is clear. 832 */ 833 ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire); 834 if (ret) { 835 return ret; 836 } 837 838 if (pdire.val & VTD_PASID_DIR_FPD) { 839 *pe_fpd_set = true; 840 return 0; 841 } 842 843 if (!vtd_pdire_present(&pdire)) { 844 return -VTD_FR_PASID_TABLE_INV; 845 } 846 847 /* 848 * No present bit check since fpd is meaningful even 849 * if the present bit is clear. 850 */ 851 ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe); 852 if (ret) { 853 return ret; 854 } 855 856 if (pe.val[0] & VTD_PASID_ENTRY_FPD) { 857 *pe_fpd_set = true; 858 } 859 860 return 0; 861 } 862 863 /* Get the page-table level that hardware should use for the second-level 864 * page-table walk from the Address Width field of context-entry. 865 */ 866 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) 867 { 868 return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); 869 } 870 871 static uint32_t vtd_get_iova_level(IntelIOMMUState *s, 872 VTDContextEntry *ce) 873 { 874 VTDPASIDEntry pe; 875 876 if (s->root_scalable) { 877 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 878 return VTD_PE_GET_LEVEL(&pe); 879 } 880 881 return vtd_ce_get_level(ce); 882 } 883 884 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) 885 { 886 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; 887 } 888 889 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, 890 VTDContextEntry *ce) 891 { 892 VTDPASIDEntry pe; 893 894 if (s->root_scalable) { 895 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 896 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; 897 } 898 899 return vtd_ce_get_agaw(ce); 900 } 901 902 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) 903 { 904 return ce->lo & VTD_CONTEXT_ENTRY_TT; 905 } 906 907 /* Only for Legacy Mode. Return true if check passed, otherwise false */ 908 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, 909 VTDContextEntry *ce) 910 { 911 switch (vtd_ce_get_type(ce)) { 912 case VTD_CONTEXT_TT_MULTI_LEVEL: 913 /* Always supported */ 914 break; 915 case VTD_CONTEXT_TT_DEV_IOTLB: 916 if (!x86_iommu->dt_supported) { 917 error_report_once("%s: DT specified but not supported", __func__); 918 return false; 919 } 920 break; 921 case VTD_CONTEXT_TT_PASS_THROUGH: 922 if (!x86_iommu->pt_supported) { 923 error_report_once("%s: PT specified but not supported", __func__); 924 return false; 925 } 926 break; 927 default: 928 /* Unknown type */ 929 error_report_once("%s: unknown ce type: %"PRIu32, __func__, 930 vtd_ce_get_type(ce)); 931 return false; 932 } 933 return true; 934 } 935 936 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, 937 VTDContextEntry *ce, uint8_t aw) 938 { 939 uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); 940 return 1ULL << MIN(ce_agaw, aw); 941 } 942 943 /* Return true if IOVA passes range check, otherwise false. */ 944 static inline bool vtd_iova_range_check(IntelIOMMUState *s, 945 uint64_t iova, VTDContextEntry *ce, 946 uint8_t aw) 947 { 948 /* 949 * Check if @iova is above 2^X-1, where X is the minimum of MGAW 950 * in CAP_REG and AW in context-entry. 951 */ 952 return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); 953 } 954 955 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, 956 VTDContextEntry *ce) 957 { 958 VTDPASIDEntry pe; 959 960 if (s->root_scalable) { 961 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 962 return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; 963 } 964 965 return vtd_ce_get_slpt_base(ce); 966 } 967 968 /* 969 * Rsvd field masks for spte: 970 * vtd_spte_rsvd 4k pages 971 * vtd_spte_rsvd_large large pages 972 */ 973 static uint64_t vtd_spte_rsvd[5]; 974 static uint64_t vtd_spte_rsvd_large[5]; 975 976 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 977 { 978 uint64_t rsvd_mask = vtd_spte_rsvd[level]; 979 980 if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && 981 (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { 982 /* large page */ 983 rsvd_mask = vtd_spte_rsvd_large[level]; 984 } 985 986 return slpte & rsvd_mask; 987 } 988 989 /* Find the VTD address space associated with a given bus number */ 990 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 991 { 992 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 993 GHashTableIter iter; 994 995 if (vtd_bus) { 996 return vtd_bus; 997 } 998 999 /* 1000 * Iterate over the registered buses to find the one which 1001 * currently holds this bus number and update the bus_num 1002 * lookup table. 1003 */ 1004 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1005 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1006 if (pci_bus_num(vtd_bus->bus) == bus_num) { 1007 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 1008 return vtd_bus; 1009 } 1010 } 1011 1012 return NULL; 1013 } 1014 1015 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 1016 * of the translation, can be used for deciding the size of large page. 1017 */ 1018 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, 1019 uint64_t iova, bool is_write, 1020 uint64_t *slptep, uint32_t *slpte_level, 1021 bool *reads, bool *writes, uint8_t aw_bits) 1022 { 1023 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1024 uint32_t level = vtd_get_iova_level(s, ce); 1025 uint32_t offset; 1026 uint64_t slpte; 1027 uint64_t access_right_check; 1028 1029 if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { 1030 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 1031 __func__, iova); 1032 return -VTD_FR_ADDR_BEYOND_MGAW; 1033 } 1034 1035 /* FIXME: what is the Atomics request here? */ 1036 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 1037 1038 while (true) { 1039 offset = vtd_iova_level_offset(iova, level); 1040 slpte = vtd_get_slpte(addr, offset); 1041 1042 if (slpte == (uint64_t)-1) { 1043 error_report_once("%s: detected read error on DMAR slpte " 1044 "(iova=0x%" PRIx64 ")", __func__, iova); 1045 if (level == vtd_get_iova_level(s, ce)) { 1046 /* Invalid programming of context-entry */ 1047 return -VTD_FR_CONTEXT_ENTRY_INV; 1048 } else { 1049 return -VTD_FR_PAGING_ENTRY_INV; 1050 } 1051 } 1052 *reads = (*reads) && (slpte & VTD_SL_R); 1053 *writes = (*writes) && (slpte & VTD_SL_W); 1054 if (!(slpte & access_right_check)) { 1055 error_report_once("%s: detected slpte permission error " 1056 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 1057 "slpte=0x%" PRIx64 ", write=%d)", __func__, 1058 iova, level, slpte, is_write); 1059 return is_write ? -VTD_FR_WRITE : -VTD_FR_READ; 1060 } 1061 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1062 error_report_once("%s: detected splte reserve non-zero " 1063 "iova=0x%" PRIx64 ", level=0x%" PRIx32 1064 "slpte=0x%" PRIx64 ")", __func__, iova, 1065 level, slpte); 1066 return -VTD_FR_PAGING_ENTRY_RSVD; 1067 } 1068 1069 if (vtd_is_last_slpte(slpte, level)) { 1070 *slptep = slpte; 1071 *slpte_level = level; 1072 return 0; 1073 } 1074 addr = vtd_get_slpte_addr(slpte, aw_bits); 1075 level--; 1076 } 1077 } 1078 1079 typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); 1080 1081 /** 1082 * Constant information used during page walking 1083 * 1084 * @hook_fn: hook func to be called when detected page 1085 * @private: private data to be passed into hook func 1086 * @notify_unmap: whether we should notify invalid entries 1087 * @as: VT-d address space of the device 1088 * @aw: maximum address width 1089 * @domain: domain ID of the page walk 1090 */ 1091 typedef struct { 1092 VTDAddressSpace *as; 1093 vtd_page_walk_hook hook_fn; 1094 void *private; 1095 bool notify_unmap; 1096 uint8_t aw; 1097 uint16_t domain_id; 1098 } vtd_page_walk_info; 1099 1100 static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) 1101 { 1102 VTDAddressSpace *as = info->as; 1103 vtd_page_walk_hook hook_fn = info->hook_fn; 1104 void *private = info->private; 1105 IOMMUTLBEntry *entry = &event->entry; 1106 DMAMap target = { 1107 .iova = entry->iova, 1108 .size = entry->addr_mask, 1109 .translated_addr = entry->translated_addr, 1110 .perm = entry->perm, 1111 }; 1112 const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); 1113 1114 if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { 1115 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1116 return 0; 1117 } 1118 1119 assert(hook_fn); 1120 1121 /* Update local IOVA mapped ranges */ 1122 if (event->type == IOMMU_NOTIFIER_MAP) { 1123 if (mapped) { 1124 /* If it's exactly the same translation, skip */ 1125 if (!memcmp(mapped, &target, sizeof(target))) { 1126 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, 1127 entry->translated_addr); 1128 return 0; 1129 } else { 1130 /* 1131 * Translation changed. Normally this should not 1132 * happen, but it can happen when with buggy guest 1133 * OSes. Note that there will be a small window that 1134 * we don't have map at all. But that's the best 1135 * effort we can do. The ideal way to emulate this is 1136 * atomically modify the PTE to follow what has 1137 * changed, but we can't. One example is that vfio 1138 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no 1139 * interface to modify a mapping (meanwhile it seems 1140 * meaningless to even provide one). Anyway, let's 1141 * mark this as a TODO in case one day we'll have 1142 * a better solution. 1143 */ 1144 IOMMUAccessFlags cache_perm = entry->perm; 1145 int ret; 1146 1147 /* Emulate an UNMAP */ 1148 event->type = IOMMU_NOTIFIER_UNMAP; 1149 entry->perm = IOMMU_NONE; 1150 trace_vtd_page_walk_one(info->domain_id, 1151 entry->iova, 1152 entry->translated_addr, 1153 entry->addr_mask, 1154 entry->perm); 1155 ret = hook_fn(event, private); 1156 if (ret) { 1157 return ret; 1158 } 1159 /* Drop any existing mapping */ 1160 iova_tree_remove(as->iova_tree, &target); 1161 /* Recover the correct type */ 1162 event->type = IOMMU_NOTIFIER_MAP; 1163 entry->perm = cache_perm; 1164 } 1165 } 1166 iova_tree_insert(as->iova_tree, &target); 1167 } else { 1168 if (!mapped) { 1169 /* Skip since we didn't map this range at all */ 1170 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1171 return 0; 1172 } 1173 iova_tree_remove(as->iova_tree, &target); 1174 } 1175 1176 trace_vtd_page_walk_one(info->domain_id, entry->iova, 1177 entry->translated_addr, entry->addr_mask, 1178 entry->perm); 1179 return hook_fn(event, private); 1180 } 1181 1182 /** 1183 * vtd_page_walk_level - walk over specific level for IOVA range 1184 * 1185 * @addr: base GPA addr to start the walk 1186 * @start: IOVA range start address 1187 * @end: IOVA range end address (start <= addr < end) 1188 * @read: whether parent level has read permission 1189 * @write: whether parent level has write permission 1190 * @info: constant information for the page walk 1191 */ 1192 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 1193 uint64_t end, uint32_t level, bool read, 1194 bool write, vtd_page_walk_info *info) 1195 { 1196 bool read_cur, write_cur, entry_valid; 1197 uint32_t offset; 1198 uint64_t slpte; 1199 uint64_t subpage_size, subpage_mask; 1200 IOMMUTLBEvent event; 1201 uint64_t iova = start; 1202 uint64_t iova_next; 1203 int ret = 0; 1204 1205 trace_vtd_page_walk_level(addr, level, start, end); 1206 1207 subpage_size = 1ULL << vtd_slpt_level_shift(level); 1208 subpage_mask = vtd_slpt_level_page_mask(level); 1209 1210 while (iova < end) { 1211 iova_next = (iova & subpage_mask) + subpage_size; 1212 1213 offset = vtd_iova_level_offset(iova, level); 1214 slpte = vtd_get_slpte(addr, offset); 1215 1216 if (slpte == (uint64_t)-1) { 1217 trace_vtd_page_walk_skip_read(iova, iova_next); 1218 goto next; 1219 } 1220 1221 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1222 trace_vtd_page_walk_skip_reserve(iova, iova_next); 1223 goto next; 1224 } 1225 1226 /* Permissions are stacked with parents' */ 1227 read_cur = read && (slpte & VTD_SL_R); 1228 write_cur = write && (slpte & VTD_SL_W); 1229 1230 /* 1231 * As long as we have either read/write permission, this is a 1232 * valid entry. The rule works for both page entries and page 1233 * table entries. 1234 */ 1235 entry_valid = read_cur | write_cur; 1236 1237 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 1238 /* 1239 * This is a valid PDE (or even bigger than PDE). We need 1240 * to walk one further level. 1241 */ 1242 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 1243 iova, MIN(iova_next, end), level - 1, 1244 read_cur, write_cur, info); 1245 } else { 1246 /* 1247 * This means we are either: 1248 * 1249 * (1) the real page entry (either 4K page, or huge page) 1250 * (2) the whole range is invalid 1251 * 1252 * In either case, we send an IOTLB notification down. 1253 */ 1254 event.entry.target_as = &address_space_memory; 1255 event.entry.iova = iova & subpage_mask; 1256 event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 1257 event.entry.addr_mask = ~subpage_mask; 1258 /* NOTE: this is only meaningful if entry_valid == true */ 1259 event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 1260 event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : 1261 IOMMU_NOTIFIER_UNMAP; 1262 ret = vtd_page_walk_one(&event, info); 1263 } 1264 1265 if (ret < 0) { 1266 return ret; 1267 } 1268 1269 next: 1270 iova = iova_next; 1271 } 1272 1273 return 0; 1274 } 1275 1276 /** 1277 * vtd_page_walk - walk specific IOVA range, and call the hook 1278 * 1279 * @s: intel iommu state 1280 * @ce: context entry to walk upon 1281 * @start: IOVA address to start the walk 1282 * @end: IOVA range end address (start <= addr < end) 1283 * @info: page walking information struct 1284 */ 1285 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, 1286 uint64_t start, uint64_t end, 1287 vtd_page_walk_info *info) 1288 { 1289 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1290 uint32_t level = vtd_get_iova_level(s, ce); 1291 1292 if (!vtd_iova_range_check(s, start, ce, info->aw)) { 1293 return -VTD_FR_ADDR_BEYOND_MGAW; 1294 } 1295 1296 if (!vtd_iova_range_check(s, end, ce, info->aw)) { 1297 /* Fix end so that it reaches the maximum */ 1298 end = vtd_iova_limit(s, ce, info->aw); 1299 } 1300 1301 return vtd_page_walk_level(addr, start, end, level, true, true, info); 1302 } 1303 1304 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, 1305 VTDRootEntry *re) 1306 { 1307 /* Legacy Mode reserved bits check */ 1308 if (!s->root_scalable && 1309 (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1310 goto rsvd_err; 1311 1312 /* Scalable Mode reserved bits check */ 1313 if (s->root_scalable && 1314 ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || 1315 (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1316 goto rsvd_err; 1317 1318 return 0; 1319 1320 rsvd_err: 1321 error_report_once("%s: invalid root entry: hi=0x%"PRIx64 1322 ", lo=0x%"PRIx64, 1323 __func__, re->hi, re->lo); 1324 return -VTD_FR_ROOT_ENTRY_RSVD; 1325 } 1326 1327 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, 1328 VTDContextEntry *ce) 1329 { 1330 if (!s->root_scalable && 1331 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || 1332 ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1333 error_report_once("%s: invalid context entry: hi=%"PRIx64 1334 ", lo=%"PRIx64" (reserved nonzero)", 1335 __func__, ce->hi, ce->lo); 1336 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1337 } 1338 1339 if (s->root_scalable && 1340 (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || 1341 ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || 1342 ce->val[2] || 1343 ce->val[3])) { 1344 error_report_once("%s: invalid context entry: val[3]=%"PRIx64 1345 ", val[2]=%"PRIx64 1346 ", val[1]=%"PRIx64 1347 ", val[0]=%"PRIx64" (reserved nonzero)", 1348 __func__, ce->val[3], ce->val[2], 1349 ce->val[1], ce->val[0]); 1350 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1351 } 1352 1353 return 0; 1354 } 1355 1356 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, 1357 VTDContextEntry *ce) 1358 { 1359 VTDPASIDEntry pe; 1360 1361 /* 1362 * Make sure in Scalable Mode, a present context entry 1363 * has valid rid2pasid setting, which includes valid 1364 * rid2pasid field and corresponding pasid entry setting 1365 */ 1366 return vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1367 } 1368 1369 /* Map a device to its corresponding domain (context-entry) */ 1370 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 1371 uint8_t devfn, VTDContextEntry *ce) 1372 { 1373 VTDRootEntry re; 1374 int ret_fr; 1375 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 1376 1377 ret_fr = vtd_get_root_entry(s, bus_num, &re); 1378 if (ret_fr) { 1379 return ret_fr; 1380 } 1381 1382 if (!vtd_root_entry_present(s, &re, devfn)) { 1383 /* Not error - it's okay we don't have root entry. */ 1384 trace_vtd_re_not_present(bus_num); 1385 return -VTD_FR_ROOT_ENTRY_P; 1386 } 1387 1388 ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); 1389 if (ret_fr) { 1390 return ret_fr; 1391 } 1392 1393 ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); 1394 if (ret_fr) { 1395 return ret_fr; 1396 } 1397 1398 if (!vtd_ce_present(ce)) { 1399 /* Not error - it's okay we don't have context entry. */ 1400 trace_vtd_ce_not_present(bus_num, devfn); 1401 return -VTD_FR_CONTEXT_ENTRY_P; 1402 } 1403 1404 ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); 1405 if (ret_fr) { 1406 return ret_fr; 1407 } 1408 1409 /* Check if the programming of context-entry is valid */ 1410 if (!s->root_scalable && 1411 !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1412 error_report_once("%s: invalid context entry: hi=%"PRIx64 1413 ", lo=%"PRIx64" (level %d not supported)", 1414 __func__, ce->hi, ce->lo, 1415 vtd_ce_get_level(ce)); 1416 return -VTD_FR_CONTEXT_ENTRY_INV; 1417 } 1418 1419 if (!s->root_scalable) { 1420 /* Do translation type check */ 1421 if (!vtd_ce_type_check(x86_iommu, ce)) { 1422 /* Errors dumped in vtd_ce_type_check() */ 1423 return -VTD_FR_CONTEXT_ENTRY_INV; 1424 } 1425 } else { 1426 /* 1427 * Check if the programming of context-entry.rid2pasid 1428 * and corresponding pasid setting is valid, and thus 1429 * avoids to check pasid entry fetching result in future 1430 * helper function calling. 1431 */ 1432 ret_fr = vtd_ce_rid2pasid_check(s, ce); 1433 if (ret_fr) { 1434 return ret_fr; 1435 } 1436 } 1437 1438 return 0; 1439 } 1440 1441 static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, 1442 void *private) 1443 { 1444 memory_region_notify_iommu(private, 0, *event); 1445 return 0; 1446 } 1447 1448 static uint16_t vtd_get_domain_id(IntelIOMMUState *s, 1449 VTDContextEntry *ce) 1450 { 1451 VTDPASIDEntry pe; 1452 1453 if (s->root_scalable) { 1454 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1455 return VTD_SM_PASID_ENTRY_DID(pe.val[1]); 1456 } 1457 1458 return VTD_CONTEXT_ENTRY_DID(ce->hi); 1459 } 1460 1461 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1462 VTDContextEntry *ce, 1463 hwaddr addr, hwaddr size) 1464 { 1465 IntelIOMMUState *s = vtd_as->iommu_state; 1466 vtd_page_walk_info info = { 1467 .hook_fn = vtd_sync_shadow_page_hook, 1468 .private = (void *)&vtd_as->iommu, 1469 .notify_unmap = true, 1470 .aw = s->aw_bits, 1471 .as = vtd_as, 1472 .domain_id = vtd_get_domain_id(s, ce), 1473 }; 1474 1475 return vtd_page_walk(s, ce, addr, addr + size, &info); 1476 } 1477 1478 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1479 { 1480 int ret; 1481 VTDContextEntry ce; 1482 IOMMUNotifier *n; 1483 1484 if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { 1485 return 0; 1486 } 1487 1488 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1489 pci_bus_num(vtd_as->bus), 1490 vtd_as->devfn, &ce); 1491 if (ret) { 1492 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1493 /* 1494 * It's a valid scenario to have a context entry that is 1495 * not present. For example, when a device is removed 1496 * from an existing domain then the context entry will be 1497 * zeroed by the guest before it was put into another 1498 * domain. When this happens, instead of synchronizing 1499 * the shadow pages we should invalidate all existing 1500 * mappings and notify the backends. 1501 */ 1502 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 1503 vtd_address_space_unmap(vtd_as, n); 1504 } 1505 ret = 0; 1506 } 1507 return ret; 1508 } 1509 1510 return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); 1511 } 1512 1513 /* 1514 * Check if specific device is configured to bypass address 1515 * translation for DMA requests. In Scalable Mode, bypass 1516 * 1st-level translation or 2nd-level translation, it depends 1517 * on PGTT setting. 1518 */ 1519 static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce) 1520 { 1521 VTDPASIDEntry pe; 1522 int ret; 1523 1524 if (s->root_scalable) { 1525 ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1526 if (ret) { 1527 error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, 1528 __func__, ret); 1529 return false; 1530 } 1531 return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); 1532 } 1533 1534 return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH); 1535 1536 } 1537 1538 static bool vtd_as_pt_enabled(VTDAddressSpace *as) 1539 { 1540 IntelIOMMUState *s; 1541 VTDContextEntry ce; 1542 int ret; 1543 1544 assert(as); 1545 1546 s = as->iommu_state; 1547 ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), 1548 as->devfn, &ce); 1549 if (ret) { 1550 /* 1551 * Possibly failed to parse the context entry for some reason 1552 * (e.g., during init, or any guest configuration errors on 1553 * context entries). We should assume PT not enabled for 1554 * safety. 1555 */ 1556 return false; 1557 } 1558 1559 return vtd_dev_pt_enabled(s, &ce); 1560 } 1561 1562 /* Return whether the device is using IOMMU translation. */ 1563 static bool vtd_switch_address_space(VTDAddressSpace *as) 1564 { 1565 bool use_iommu; 1566 /* Whether we need to take the BQL on our own */ 1567 bool take_bql = !qemu_mutex_iothread_locked(); 1568 1569 assert(as); 1570 1571 use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as); 1572 1573 trace_vtd_switch_address_space(pci_bus_num(as->bus), 1574 VTD_PCI_SLOT(as->devfn), 1575 VTD_PCI_FUNC(as->devfn), 1576 use_iommu); 1577 1578 /* 1579 * It's possible that we reach here without BQL, e.g., when called 1580 * from vtd_pt_enable_fast_path(). However the memory APIs need 1581 * it. We'd better make sure we have had it already, or, take it. 1582 */ 1583 if (take_bql) { 1584 qemu_mutex_lock_iothread(); 1585 } 1586 1587 /* Turn off first then on the other */ 1588 if (use_iommu) { 1589 memory_region_set_enabled(&as->nodmar, false); 1590 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1591 } else { 1592 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1593 memory_region_set_enabled(&as->nodmar, true); 1594 } 1595 1596 if (take_bql) { 1597 qemu_mutex_unlock_iothread(); 1598 } 1599 1600 return use_iommu; 1601 } 1602 1603 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1604 { 1605 GHashTableIter iter; 1606 VTDBus *vtd_bus; 1607 int i; 1608 1609 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1610 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1611 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1612 if (!vtd_bus->dev_as[i]) { 1613 continue; 1614 } 1615 vtd_switch_address_space(vtd_bus->dev_as[i]); 1616 } 1617 } 1618 } 1619 1620 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1621 { 1622 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1623 } 1624 1625 static const bool vtd_qualified_faults[] = { 1626 [VTD_FR_RESERVED] = false, 1627 [VTD_FR_ROOT_ENTRY_P] = false, 1628 [VTD_FR_CONTEXT_ENTRY_P] = true, 1629 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1630 [VTD_FR_ADDR_BEYOND_MGAW] = true, 1631 [VTD_FR_WRITE] = true, 1632 [VTD_FR_READ] = true, 1633 [VTD_FR_PAGING_ENTRY_INV] = true, 1634 [VTD_FR_ROOT_TABLE_INV] = false, 1635 [VTD_FR_CONTEXT_TABLE_INV] = false, 1636 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1637 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1638 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1639 [VTD_FR_PASID_TABLE_INV] = false, 1640 [VTD_FR_RESERVED_ERR] = false, 1641 [VTD_FR_MAX] = false, 1642 }; 1643 1644 /* To see if a fault condition is "qualified", which is reported to software 1645 * only if the FPD field in the context-entry used to process the faulting 1646 * request is 0. 1647 */ 1648 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1649 { 1650 return vtd_qualified_faults[fault]; 1651 } 1652 1653 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1654 { 1655 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1656 } 1657 1658 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1659 { 1660 VTDBus *vtd_bus; 1661 VTDAddressSpace *vtd_as; 1662 bool success = false; 1663 1664 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1665 if (!vtd_bus) { 1666 goto out; 1667 } 1668 1669 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1670 if (!vtd_as) { 1671 goto out; 1672 } 1673 1674 if (vtd_switch_address_space(vtd_as) == false) { 1675 /* We switched off IOMMU region successfully. */ 1676 success = true; 1677 } 1678 1679 out: 1680 trace_vtd_pt_enable_fast_path(source_id, success); 1681 } 1682 1683 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1684 * translation. 1685 * 1686 * Called from RCU critical section. 1687 * 1688 * @bus_num: The bus number 1689 * @devfn: The devfn, which is the combined of device and function number 1690 * @is_write: The access is a write operation 1691 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1692 * 1693 * Returns true if translation is successful, otherwise false. 1694 */ 1695 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, 1696 uint8_t devfn, hwaddr addr, bool is_write, 1697 IOMMUTLBEntry *entry) 1698 { 1699 IntelIOMMUState *s = vtd_as->iommu_state; 1700 VTDContextEntry ce; 1701 uint8_t bus_num = pci_bus_num(bus); 1702 VTDContextCacheEntry *cc_entry; 1703 uint64_t slpte, page_mask; 1704 uint32_t level; 1705 uint16_t source_id = vtd_make_source_id(bus_num, devfn); 1706 int ret_fr; 1707 bool is_fpd_set = false; 1708 bool reads = true; 1709 bool writes = true; 1710 uint8_t access_flags; 1711 VTDIOTLBEntry *iotlb_entry; 1712 1713 /* 1714 * We have standalone memory region for interrupt addresses, we 1715 * should never receive translation requests in this region. 1716 */ 1717 assert(!vtd_is_interrupt_addr(addr)); 1718 1719 vtd_iommu_lock(s); 1720 1721 cc_entry = &vtd_as->context_cache_entry; 1722 1723 /* Try to fetch slpte form IOTLB */ 1724 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); 1725 if (iotlb_entry) { 1726 trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, 1727 iotlb_entry->domain_id); 1728 slpte = iotlb_entry->slpte; 1729 access_flags = iotlb_entry->access_flags; 1730 page_mask = iotlb_entry->mask; 1731 goto out; 1732 } 1733 1734 /* Try to fetch context-entry from cache first */ 1735 if (cc_entry->context_cache_gen == s->context_cache_gen) { 1736 trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, 1737 cc_entry->context_entry.lo, 1738 cc_entry->context_cache_gen); 1739 ce = cc_entry->context_entry; 1740 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1741 if (!is_fpd_set && s->root_scalable) { 1742 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1743 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1744 } 1745 } else { 1746 ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); 1747 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1748 if (!ret_fr && !is_fpd_set && s->root_scalable) { 1749 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1750 } 1751 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1752 /* Update context-cache */ 1753 trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, 1754 cc_entry->context_cache_gen, 1755 s->context_cache_gen); 1756 cc_entry->context_entry = ce; 1757 cc_entry->context_cache_gen = s->context_cache_gen; 1758 } 1759 1760 /* 1761 * We don't need to translate for pass-through context entries. 1762 * Also, let's ignore IOTLB caching as well for PT devices. 1763 */ 1764 if (vtd_dev_pt_enabled(s, &ce)) { 1765 entry->iova = addr & VTD_PAGE_MASK_4K; 1766 entry->translated_addr = entry->iova; 1767 entry->addr_mask = ~VTD_PAGE_MASK_4K; 1768 entry->perm = IOMMU_RW; 1769 trace_vtd_translate_pt(source_id, entry->iova); 1770 1771 /* 1772 * When this happens, it means firstly caching-mode is not 1773 * enabled, and this is the first passthrough translation for 1774 * the device. Let's enable the fast path for passthrough. 1775 * 1776 * When passthrough is disabled again for the device, we can 1777 * capture it via the context entry invalidation, then the 1778 * IOMMU region can be swapped back. 1779 */ 1780 vtd_pt_enable_fast_path(s, source_id); 1781 vtd_iommu_unlock(s); 1782 return true; 1783 } 1784 1785 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, 1786 &reads, &writes, s->aw_bits); 1787 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1788 1789 page_mask = vtd_slpt_level_page_mask(level); 1790 access_flags = IOMMU_ACCESS_FLAG(reads, writes); 1791 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, 1792 access_flags, level); 1793 out: 1794 vtd_iommu_unlock(s); 1795 entry->iova = addr & page_mask; 1796 entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; 1797 entry->addr_mask = ~page_mask; 1798 entry->perm = access_flags; 1799 return true; 1800 1801 error: 1802 vtd_iommu_unlock(s); 1803 entry->iova = 0; 1804 entry->translated_addr = 0; 1805 entry->addr_mask = 0; 1806 entry->perm = IOMMU_NONE; 1807 return false; 1808 } 1809 1810 static void vtd_root_table_setup(IntelIOMMUState *s) 1811 { 1812 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 1813 s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); 1814 1815 vtd_update_scalable_state(s); 1816 1817 trace_vtd_reg_dmar_root(s->root, s->root_scalable); 1818 } 1819 1820 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, 1821 uint32_t index, uint32_t mask) 1822 { 1823 x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); 1824 } 1825 1826 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) 1827 { 1828 uint64_t value = 0; 1829 value = vtd_get_quad_raw(s, DMAR_IRTA_REG); 1830 s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); 1831 s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); 1832 s->intr_eime = value & VTD_IRTA_EIME; 1833 1834 /* Notify global invalidation */ 1835 vtd_iec_notify_all(s, true, 0, 0); 1836 1837 trace_vtd_reg_ir_root(s->intr_root, s->intr_size); 1838 } 1839 1840 static void vtd_iommu_replay_all(IntelIOMMUState *s) 1841 { 1842 VTDAddressSpace *vtd_as; 1843 1844 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1845 vtd_sync_shadow_page_table(vtd_as); 1846 } 1847 } 1848 1849 static void vtd_context_global_invalidate(IntelIOMMUState *s) 1850 { 1851 trace_vtd_inv_desc_cc_global(); 1852 /* Protects context cache */ 1853 vtd_iommu_lock(s); 1854 s->context_cache_gen++; 1855 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { 1856 vtd_reset_context_cache_locked(s); 1857 } 1858 vtd_iommu_unlock(s); 1859 vtd_address_space_refresh_all(s); 1860 /* 1861 * From VT-d spec 6.5.2.1, a global context entry invalidation 1862 * should be followed by a IOTLB global invalidation, so we should 1863 * be safe even without this. Hoewever, let's replay the region as 1864 * well to be safer, and go back here when we need finer tunes for 1865 * VT-d emulation codes. 1866 */ 1867 vtd_iommu_replay_all(s); 1868 } 1869 1870 /* Do a context-cache device-selective invalidation. 1871 * @func_mask: FM field after shifting 1872 */ 1873 static void vtd_context_device_invalidate(IntelIOMMUState *s, 1874 uint16_t source_id, 1875 uint16_t func_mask) 1876 { 1877 uint16_t mask; 1878 VTDBus *vtd_bus; 1879 VTDAddressSpace *vtd_as; 1880 uint8_t bus_n, devfn; 1881 uint16_t devfn_it; 1882 1883 trace_vtd_inv_desc_cc_devices(source_id, func_mask); 1884 1885 switch (func_mask & 3) { 1886 case 0: 1887 mask = 0; /* No bits in the SID field masked */ 1888 break; 1889 case 1: 1890 mask = 4; /* Mask bit 2 in the SID field */ 1891 break; 1892 case 2: 1893 mask = 6; /* Mask bit 2:1 in the SID field */ 1894 break; 1895 case 3: 1896 mask = 7; /* Mask bit 2:0 in the SID field */ 1897 break; 1898 default: 1899 g_assert_not_reached(); 1900 } 1901 mask = ~mask; 1902 1903 bus_n = VTD_SID_TO_BUS(source_id); 1904 vtd_bus = vtd_find_as_from_bus_num(s, bus_n); 1905 if (vtd_bus) { 1906 devfn = VTD_SID_TO_DEVFN(source_id); 1907 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 1908 vtd_as = vtd_bus->dev_as[devfn_it]; 1909 if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { 1910 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), 1911 VTD_PCI_FUNC(devfn_it)); 1912 vtd_iommu_lock(s); 1913 vtd_as->context_cache_entry.context_cache_gen = 0; 1914 vtd_iommu_unlock(s); 1915 /* 1916 * Do switch address space when needed, in case if the 1917 * device passthrough bit is switched. 1918 */ 1919 vtd_switch_address_space(vtd_as); 1920 /* 1921 * So a device is moving out of (or moving into) a 1922 * domain, resync the shadow page table. 1923 * This won't bring bad even if we have no such 1924 * notifier registered - the IOMMU notification 1925 * framework will skip MAP notifications if that 1926 * happened. 1927 */ 1928 vtd_sync_shadow_page_table(vtd_as); 1929 } 1930 } 1931 } 1932 } 1933 1934 /* Context-cache invalidation 1935 * Returns the Context Actual Invalidation Granularity. 1936 * @val: the content of the CCMD_REG 1937 */ 1938 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1939 { 1940 uint64_t caig; 1941 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1942 1943 switch (type) { 1944 case VTD_CCMD_DOMAIN_INVL: 1945 /* Fall through */ 1946 case VTD_CCMD_GLOBAL_INVL: 1947 caig = VTD_CCMD_GLOBAL_INVL_A; 1948 vtd_context_global_invalidate(s); 1949 break; 1950 1951 case VTD_CCMD_DEVICE_INVL: 1952 caig = VTD_CCMD_DEVICE_INVL_A; 1953 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1954 break; 1955 1956 default: 1957 error_report_once("%s: invalid context: 0x%" PRIx64, 1958 __func__, val); 1959 caig = 0; 1960 } 1961 return caig; 1962 } 1963 1964 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1965 { 1966 trace_vtd_inv_desc_iotlb_global(); 1967 vtd_reset_iotlb(s); 1968 vtd_iommu_replay_all(s); 1969 } 1970 1971 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1972 { 1973 VTDContextEntry ce; 1974 VTDAddressSpace *vtd_as; 1975 1976 trace_vtd_inv_desc_iotlb_domain(domain_id); 1977 1978 vtd_iommu_lock(s); 1979 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, 1980 &domain_id); 1981 vtd_iommu_unlock(s); 1982 1983 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1984 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1985 vtd_as->devfn, &ce) && 1986 domain_id == vtd_get_domain_id(s, &ce)) { 1987 vtd_sync_shadow_page_table(vtd_as); 1988 } 1989 } 1990 } 1991 1992 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1993 uint16_t domain_id, hwaddr addr, 1994 uint8_t am) 1995 { 1996 VTDAddressSpace *vtd_as; 1997 VTDContextEntry ce; 1998 int ret; 1999 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 2000 2001 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 2002 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 2003 vtd_as->devfn, &ce); 2004 if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { 2005 if (vtd_as_has_map_notifier(vtd_as)) { 2006 /* 2007 * As long as we have MAP notifications registered in 2008 * any of our IOMMU notifiers, we need to sync the 2009 * shadow page table. 2010 */ 2011 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 2012 } else { 2013 /* 2014 * For UNMAP-only notifiers, we don't need to walk the 2015 * page tables. We just deliver the PSI down to 2016 * invalidate caches. 2017 */ 2018 IOMMUTLBEvent event = { 2019 .type = IOMMU_NOTIFIER_UNMAP, 2020 .entry = { 2021 .target_as = &address_space_memory, 2022 .iova = addr, 2023 .translated_addr = 0, 2024 .addr_mask = size - 1, 2025 .perm = IOMMU_NONE, 2026 }, 2027 }; 2028 memory_region_notify_iommu(&vtd_as->iommu, 0, event); 2029 } 2030 } 2031 } 2032 } 2033 2034 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 2035 hwaddr addr, uint8_t am) 2036 { 2037 VTDIOTLBPageInvInfo info; 2038 2039 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 2040 2041 assert(am <= VTD_MAMV); 2042 info.domain_id = domain_id; 2043 info.addr = addr; 2044 info.mask = ~((1 << am) - 1); 2045 vtd_iommu_lock(s); 2046 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 2047 vtd_iommu_unlock(s); 2048 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 2049 } 2050 2051 /* Flush IOTLB 2052 * Returns the IOTLB Actual Invalidation Granularity. 2053 * @val: the content of the IOTLB_REG 2054 */ 2055 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 2056 { 2057 uint64_t iaig; 2058 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 2059 uint16_t domain_id; 2060 hwaddr addr; 2061 uint8_t am; 2062 2063 switch (type) { 2064 case VTD_TLB_GLOBAL_FLUSH: 2065 iaig = VTD_TLB_GLOBAL_FLUSH_A; 2066 vtd_iotlb_global_invalidate(s); 2067 break; 2068 2069 case VTD_TLB_DSI_FLUSH: 2070 domain_id = VTD_TLB_DID(val); 2071 iaig = VTD_TLB_DSI_FLUSH_A; 2072 vtd_iotlb_domain_invalidate(s, domain_id); 2073 break; 2074 2075 case VTD_TLB_PSI_FLUSH: 2076 domain_id = VTD_TLB_DID(val); 2077 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 2078 am = VTD_IVA_AM(addr); 2079 addr = VTD_IVA_ADDR(addr); 2080 if (am > VTD_MAMV) { 2081 error_report_once("%s: address mask overflow: 0x%" PRIx64, 2082 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 2083 iaig = 0; 2084 break; 2085 } 2086 iaig = VTD_TLB_PSI_FLUSH_A; 2087 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2088 break; 2089 2090 default: 2091 error_report_once("%s: invalid granularity: 0x%" PRIx64, 2092 __func__, val); 2093 iaig = 0; 2094 } 2095 return iaig; 2096 } 2097 2098 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 2099 2100 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 2101 { 2102 return s->qi_enabled && (s->iq_tail == s->iq_head) && 2103 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 2104 } 2105 2106 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 2107 { 2108 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 2109 2110 trace_vtd_inv_qi_enable(en); 2111 2112 if (en) { 2113 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 2114 /* 2^(x+8) entries */ 2115 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); 2116 s->qi_enabled = true; 2117 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 2118 /* Ok - report back to driver */ 2119 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 2120 2121 if (s->iq_tail != 0) { 2122 /* 2123 * This is a spec violation but Windows guests are known to set up 2124 * Queued Invalidation this way so we allow the write and process 2125 * Invalidation Descriptors right away. 2126 */ 2127 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 2128 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2129 vtd_fetch_inv_desc(s); 2130 } 2131 } 2132 } else { 2133 if (vtd_queued_inv_disable_check(s)) { 2134 /* disable Queued Invalidation */ 2135 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 2136 s->iq_head = 0; 2137 s->qi_enabled = false; 2138 /* Ok - report back to driver */ 2139 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 2140 } else { 2141 error_report_once("%s: detected improper state when disable QI " 2142 "(head=0x%x, tail=0x%x, last_type=%d)", 2143 __func__, 2144 s->iq_head, s->iq_tail, s->iq_last_desc_type); 2145 } 2146 } 2147 } 2148 2149 /* Set Root Table Pointer */ 2150 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 2151 { 2152 vtd_root_table_setup(s); 2153 /* Ok - report back to driver */ 2154 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 2155 vtd_reset_caches(s); 2156 vtd_address_space_refresh_all(s); 2157 } 2158 2159 /* Set Interrupt Remap Table Pointer */ 2160 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 2161 { 2162 vtd_interrupt_remap_table_setup(s); 2163 /* Ok - report back to driver */ 2164 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 2165 } 2166 2167 /* Handle Translation Enable/Disable */ 2168 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 2169 { 2170 if (s->dmar_enabled == en) { 2171 return; 2172 } 2173 2174 trace_vtd_dmar_enable(en); 2175 2176 if (en) { 2177 s->dmar_enabled = true; 2178 /* Ok - report back to driver */ 2179 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 2180 } else { 2181 s->dmar_enabled = false; 2182 2183 /* Clear the index of Fault Recording Register */ 2184 s->next_frcd_reg = 0; 2185 /* Ok - report back to driver */ 2186 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 2187 } 2188 2189 vtd_reset_caches(s); 2190 vtd_address_space_refresh_all(s); 2191 } 2192 2193 /* Handle Interrupt Remap Enable/Disable */ 2194 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 2195 { 2196 trace_vtd_ir_enable(en); 2197 2198 if (en) { 2199 s->intr_enabled = true; 2200 /* Ok - report back to driver */ 2201 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 2202 } else { 2203 s->intr_enabled = false; 2204 /* Ok - report back to driver */ 2205 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 2206 } 2207 } 2208 2209 /* Handle write to Global Command Register */ 2210 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 2211 { 2212 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 2213 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 2214 uint32_t changed = status ^ val; 2215 2216 trace_vtd_reg_write_gcmd(status, val); 2217 if (changed & VTD_GCMD_TE) { 2218 /* Translation enable/disable */ 2219 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 2220 } 2221 if (val & VTD_GCMD_SRTP) { 2222 /* Set/update the root-table pointer */ 2223 vtd_handle_gcmd_srtp(s); 2224 } 2225 if (changed & VTD_GCMD_QIE) { 2226 /* Queued Invalidation Enable */ 2227 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 2228 } 2229 if (val & VTD_GCMD_SIRTP) { 2230 /* Set/update the interrupt remapping root-table pointer */ 2231 vtd_handle_gcmd_sirtp(s); 2232 } 2233 if (changed & VTD_GCMD_IRE) { 2234 /* Interrupt remap enable/disable */ 2235 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 2236 } 2237 } 2238 2239 /* Handle write to Context Command Register */ 2240 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 2241 { 2242 uint64_t ret; 2243 uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); 2244 2245 /* Context-cache invalidation request */ 2246 if (val & VTD_CCMD_ICC) { 2247 if (s->qi_enabled) { 2248 error_report_once("Queued Invalidation enabled, " 2249 "should not use register-based invalidation"); 2250 return; 2251 } 2252 ret = vtd_context_cache_invalidate(s, val); 2253 /* Invalidation completed. Change something to show */ 2254 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 2255 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 2256 ret); 2257 } 2258 } 2259 2260 /* Handle write to IOTLB Invalidation Register */ 2261 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 2262 { 2263 uint64_t ret; 2264 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 2265 2266 /* IOTLB invalidation request */ 2267 if (val & VTD_TLB_IVT) { 2268 if (s->qi_enabled) { 2269 error_report_once("Queued Invalidation enabled, " 2270 "should not use register-based invalidation"); 2271 return; 2272 } 2273 ret = vtd_iotlb_flush(s, val); 2274 /* Invalidation completed. Change something to show */ 2275 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 2276 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 2277 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 2278 } 2279 } 2280 2281 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 2282 static bool vtd_get_inv_desc(IntelIOMMUState *s, 2283 VTDInvDesc *inv_desc) 2284 { 2285 dma_addr_t base_addr = s->iq; 2286 uint32_t offset = s->iq_head; 2287 uint32_t dw = s->iq_dw ? 32 : 16; 2288 dma_addr_t addr = base_addr + offset * dw; 2289 2290 if (dma_memory_read(&address_space_memory, addr, 2291 inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) { 2292 error_report_once("Read INV DESC failed."); 2293 return false; 2294 } 2295 inv_desc->lo = le64_to_cpu(inv_desc->lo); 2296 inv_desc->hi = le64_to_cpu(inv_desc->hi); 2297 if (dw == 32) { 2298 inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); 2299 inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); 2300 } 2301 return true; 2302 } 2303 2304 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2305 { 2306 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 2307 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 2308 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2309 " (reserved nonzero)", __func__, inv_desc->hi, 2310 inv_desc->lo); 2311 return false; 2312 } 2313 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 2314 /* Status Write */ 2315 uint32_t status_data = (uint32_t)(inv_desc->lo >> 2316 VTD_INV_DESC_WAIT_DATA_SHIFT); 2317 2318 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 2319 2320 /* FIXME: need to be masked with HAW? */ 2321 dma_addr_t status_addr = inv_desc->hi; 2322 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 2323 status_data = cpu_to_le32(status_data); 2324 if (dma_memory_write(&address_space_memory, status_addr, 2325 &status_data, sizeof(status_data), 2326 MEMTXATTRS_UNSPECIFIED)) { 2327 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 2328 return false; 2329 } 2330 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 2331 /* Interrupt flag */ 2332 vtd_generate_completion_event(s); 2333 } else { 2334 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2335 " (unknown type)", __func__, inv_desc->hi, 2336 inv_desc->lo); 2337 return false; 2338 } 2339 return true; 2340 } 2341 2342 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 2343 VTDInvDesc *inv_desc) 2344 { 2345 uint16_t sid, fmask; 2346 2347 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 2348 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2349 " (reserved nonzero)", __func__, inv_desc->hi, 2350 inv_desc->lo); 2351 return false; 2352 } 2353 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 2354 case VTD_INV_DESC_CC_DOMAIN: 2355 trace_vtd_inv_desc_cc_domain( 2356 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 2357 /* Fall through */ 2358 case VTD_INV_DESC_CC_GLOBAL: 2359 vtd_context_global_invalidate(s); 2360 break; 2361 2362 case VTD_INV_DESC_CC_DEVICE: 2363 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 2364 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 2365 vtd_context_device_invalidate(s, sid, fmask); 2366 break; 2367 2368 default: 2369 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2370 " (invalid type)", __func__, inv_desc->hi, 2371 inv_desc->lo); 2372 return false; 2373 } 2374 return true; 2375 } 2376 2377 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2378 { 2379 uint16_t domain_id; 2380 uint8_t am; 2381 hwaddr addr; 2382 2383 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 2384 (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 2385 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2386 ", lo=0x%"PRIx64" (reserved bits unzero)", 2387 __func__, inv_desc->hi, inv_desc->lo); 2388 return false; 2389 } 2390 2391 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 2392 case VTD_INV_DESC_IOTLB_GLOBAL: 2393 vtd_iotlb_global_invalidate(s); 2394 break; 2395 2396 case VTD_INV_DESC_IOTLB_DOMAIN: 2397 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2398 vtd_iotlb_domain_invalidate(s, domain_id); 2399 break; 2400 2401 case VTD_INV_DESC_IOTLB_PAGE: 2402 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2403 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 2404 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 2405 if (am > VTD_MAMV) { 2406 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2407 ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)", 2408 __func__, inv_desc->hi, inv_desc->lo, 2409 am, (unsigned)VTD_MAMV); 2410 return false; 2411 } 2412 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2413 break; 2414 2415 default: 2416 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2417 ", lo=0x%"PRIx64" (type mismatch: 0x%llx)", 2418 __func__, inv_desc->hi, inv_desc->lo, 2419 inv_desc->lo & VTD_INV_DESC_IOTLB_G); 2420 return false; 2421 } 2422 return true; 2423 } 2424 2425 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 2426 VTDInvDesc *inv_desc) 2427 { 2428 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 2429 inv_desc->iec.index, 2430 inv_desc->iec.index_mask); 2431 2432 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 2433 inv_desc->iec.index, 2434 inv_desc->iec.index_mask); 2435 return true; 2436 } 2437 2438 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 2439 VTDInvDesc *inv_desc) 2440 { 2441 VTDAddressSpace *vtd_dev_as; 2442 IOMMUTLBEvent event; 2443 struct VTDBus *vtd_bus; 2444 hwaddr addr; 2445 uint64_t sz; 2446 uint16_t sid; 2447 uint8_t devfn; 2448 bool size; 2449 uint8_t bus_num; 2450 2451 addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2452 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2453 devfn = sid & 0xff; 2454 bus_num = sid >> 8; 2455 size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2456 2457 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2458 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2459 error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 2460 ", lo=%"PRIx64" (reserved nonzero)", __func__, 2461 inv_desc->hi, inv_desc->lo); 2462 return false; 2463 } 2464 2465 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2466 if (!vtd_bus) { 2467 goto done; 2468 } 2469 2470 vtd_dev_as = vtd_bus->dev_as[devfn]; 2471 if (!vtd_dev_as) { 2472 goto done; 2473 } 2474 2475 /* According to ATS spec table 2.4: 2476 * S = 0, bits 15:12 = xxxx range size: 4K 2477 * S = 1, bits 15:12 = xxx0 range size: 8K 2478 * S = 1, bits 15:12 = xx01 range size: 16K 2479 * S = 1, bits 15:12 = x011 range size: 32K 2480 * S = 1, bits 15:12 = 0111 range size: 64K 2481 * ... 2482 */ 2483 if (size) { 2484 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2485 addr &= ~(sz - 1); 2486 } else { 2487 sz = VTD_PAGE_SIZE; 2488 } 2489 2490 event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; 2491 event.entry.target_as = &vtd_dev_as->as; 2492 event.entry.addr_mask = sz - 1; 2493 event.entry.iova = addr; 2494 event.entry.perm = IOMMU_NONE; 2495 event.entry.translated_addr = 0; 2496 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); 2497 2498 done: 2499 return true; 2500 } 2501 2502 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2503 { 2504 VTDInvDesc inv_desc; 2505 uint8_t desc_type; 2506 2507 trace_vtd_inv_qi_head(s->iq_head); 2508 if (!vtd_get_inv_desc(s, &inv_desc)) { 2509 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2510 return false; 2511 } 2512 2513 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2514 /* FIXME: should update at first or at last? */ 2515 s->iq_last_desc_type = desc_type; 2516 2517 switch (desc_type) { 2518 case VTD_INV_DESC_CC: 2519 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2520 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2521 return false; 2522 } 2523 break; 2524 2525 case VTD_INV_DESC_IOTLB: 2526 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2527 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2528 return false; 2529 } 2530 break; 2531 2532 /* 2533 * TODO: the entity of below two cases will be implemented in future series. 2534 * To make guest (which integrates scalable mode support patch set in 2535 * iommu driver) work, just return true is enough so far. 2536 */ 2537 case VTD_INV_DESC_PC: 2538 break; 2539 2540 case VTD_INV_DESC_PIOTLB: 2541 break; 2542 2543 case VTD_INV_DESC_WAIT: 2544 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2545 if (!vtd_process_wait_desc(s, &inv_desc)) { 2546 return false; 2547 } 2548 break; 2549 2550 case VTD_INV_DESC_IEC: 2551 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2552 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2553 return false; 2554 } 2555 break; 2556 2557 case VTD_INV_DESC_DEVICE: 2558 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2559 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2560 return false; 2561 } 2562 break; 2563 2564 default: 2565 error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 2566 " (unknown type)", __func__, inv_desc.hi, 2567 inv_desc.lo); 2568 return false; 2569 } 2570 s->iq_head++; 2571 if (s->iq_head == s->iq_size) { 2572 s->iq_head = 0; 2573 } 2574 return true; 2575 } 2576 2577 /* Try to fetch and process more Invalidation Descriptors */ 2578 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2579 { 2580 int qi_shift; 2581 2582 /* Refer to 10.4.23 of VT-d spec 3.0 */ 2583 qi_shift = s->iq_dw ? VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4; 2584 2585 trace_vtd_inv_qi_fetch(); 2586 2587 if (s->iq_tail >= s->iq_size) { 2588 /* Detects an invalid Tail pointer */ 2589 error_report_once("%s: detected invalid QI tail " 2590 "(tail=0x%x, size=0x%x)", 2591 __func__, s->iq_tail, s->iq_size); 2592 vtd_handle_inv_queue_error(s); 2593 return; 2594 } 2595 while (s->iq_head != s->iq_tail) { 2596 if (!vtd_process_inv_desc(s)) { 2597 /* Invalidation Queue Errors */ 2598 vtd_handle_inv_queue_error(s); 2599 break; 2600 } 2601 /* Must update the IQH_REG in time */ 2602 vtd_set_quad_raw(s, DMAR_IQH_REG, 2603 (((uint64_t)(s->iq_head)) << qi_shift) & 2604 VTD_IQH_QH_MASK); 2605 } 2606 } 2607 2608 /* Handle write to Invalidation Queue Tail Register */ 2609 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2610 { 2611 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2612 2613 if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { 2614 error_report_once("%s: RSV bit is set: val=0x%"PRIx64, 2615 __func__, val); 2616 return; 2617 } 2618 s->iq_tail = VTD_IQT_QT(s->iq_dw, val); 2619 trace_vtd_inv_qi_tail(s->iq_tail); 2620 2621 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2622 /* Process Invalidation Queue here */ 2623 vtd_fetch_inv_desc(s); 2624 } 2625 } 2626 2627 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2628 { 2629 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2630 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2631 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2632 2633 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2634 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2635 trace_vtd_fsts_clear_ip(); 2636 } 2637 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2638 * Descriptors if there are any when Queued Invalidation is enabled? 2639 */ 2640 } 2641 2642 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2643 { 2644 uint32_t fectl_reg; 2645 /* FIXME: when software clears the IM field, check the IP field. But do we 2646 * need to compare the old value and the new value to conclude that 2647 * software clears the IM field? Or just check if the IM field is zero? 2648 */ 2649 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2650 2651 trace_vtd_reg_write_fectl(fectl_reg); 2652 2653 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2654 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2655 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2656 } 2657 } 2658 2659 static void vtd_handle_ics_write(IntelIOMMUState *s) 2660 { 2661 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2662 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2663 2664 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2665 trace_vtd_reg_ics_clear_ip(); 2666 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2667 } 2668 } 2669 2670 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2671 { 2672 uint32_t iectl_reg; 2673 /* FIXME: when software clears the IM field, check the IP field. But do we 2674 * need to compare the old value and the new value to conclude that 2675 * software clears the IM field? Or just check if the IM field is zero? 2676 */ 2677 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2678 2679 trace_vtd_reg_write_iectl(iectl_reg); 2680 2681 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2682 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2683 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2684 } 2685 } 2686 2687 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2688 { 2689 IntelIOMMUState *s = opaque; 2690 uint64_t val; 2691 2692 trace_vtd_reg_read(addr, size); 2693 2694 if (addr + size > DMAR_REG_SIZE) { 2695 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2696 " size=0x%x", __func__, addr, size); 2697 return (uint64_t)-1; 2698 } 2699 2700 switch (addr) { 2701 /* Root Table Address Register, 64-bit */ 2702 case DMAR_RTADDR_REG: 2703 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 2704 if (size == 4) { 2705 val = val & ((1ULL << 32) - 1); 2706 } 2707 break; 2708 2709 case DMAR_RTADDR_REG_HI: 2710 assert(size == 4); 2711 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; 2712 break; 2713 2714 /* Invalidation Queue Address Register, 64-bit */ 2715 case DMAR_IQA_REG: 2716 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2717 if (size == 4) { 2718 val = val & ((1ULL << 32) - 1); 2719 } 2720 break; 2721 2722 case DMAR_IQA_REG_HI: 2723 assert(size == 4); 2724 val = s->iq >> 32; 2725 break; 2726 2727 default: 2728 if (size == 4) { 2729 val = vtd_get_long(s, addr); 2730 } else { 2731 val = vtd_get_quad(s, addr); 2732 } 2733 } 2734 2735 return val; 2736 } 2737 2738 static void vtd_mem_write(void *opaque, hwaddr addr, 2739 uint64_t val, unsigned size) 2740 { 2741 IntelIOMMUState *s = opaque; 2742 2743 trace_vtd_reg_write(addr, size, val); 2744 2745 if (addr + size > DMAR_REG_SIZE) { 2746 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2747 " size=0x%x", __func__, addr, size); 2748 return; 2749 } 2750 2751 switch (addr) { 2752 /* Global Command Register, 32-bit */ 2753 case DMAR_GCMD_REG: 2754 vtd_set_long(s, addr, val); 2755 vtd_handle_gcmd_write(s); 2756 break; 2757 2758 /* Context Command Register, 64-bit */ 2759 case DMAR_CCMD_REG: 2760 if (size == 4) { 2761 vtd_set_long(s, addr, val); 2762 } else { 2763 vtd_set_quad(s, addr, val); 2764 vtd_handle_ccmd_write(s); 2765 } 2766 break; 2767 2768 case DMAR_CCMD_REG_HI: 2769 assert(size == 4); 2770 vtd_set_long(s, addr, val); 2771 vtd_handle_ccmd_write(s); 2772 break; 2773 2774 /* IOTLB Invalidation Register, 64-bit */ 2775 case DMAR_IOTLB_REG: 2776 if (size == 4) { 2777 vtd_set_long(s, addr, val); 2778 } else { 2779 vtd_set_quad(s, addr, val); 2780 vtd_handle_iotlb_write(s); 2781 } 2782 break; 2783 2784 case DMAR_IOTLB_REG_HI: 2785 assert(size == 4); 2786 vtd_set_long(s, addr, val); 2787 vtd_handle_iotlb_write(s); 2788 break; 2789 2790 /* Invalidate Address Register, 64-bit */ 2791 case DMAR_IVA_REG: 2792 if (size == 4) { 2793 vtd_set_long(s, addr, val); 2794 } else { 2795 vtd_set_quad(s, addr, val); 2796 } 2797 break; 2798 2799 case DMAR_IVA_REG_HI: 2800 assert(size == 4); 2801 vtd_set_long(s, addr, val); 2802 break; 2803 2804 /* Fault Status Register, 32-bit */ 2805 case DMAR_FSTS_REG: 2806 assert(size == 4); 2807 vtd_set_long(s, addr, val); 2808 vtd_handle_fsts_write(s); 2809 break; 2810 2811 /* Fault Event Control Register, 32-bit */ 2812 case DMAR_FECTL_REG: 2813 assert(size == 4); 2814 vtd_set_long(s, addr, val); 2815 vtd_handle_fectl_write(s); 2816 break; 2817 2818 /* Fault Event Data Register, 32-bit */ 2819 case DMAR_FEDATA_REG: 2820 assert(size == 4); 2821 vtd_set_long(s, addr, val); 2822 break; 2823 2824 /* Fault Event Address Register, 32-bit */ 2825 case DMAR_FEADDR_REG: 2826 if (size == 4) { 2827 vtd_set_long(s, addr, val); 2828 } else { 2829 /* 2830 * While the register is 32-bit only, some guests (Xen...) write to 2831 * it with 64-bit. 2832 */ 2833 vtd_set_quad(s, addr, val); 2834 } 2835 break; 2836 2837 /* Fault Event Upper Address Register, 32-bit */ 2838 case DMAR_FEUADDR_REG: 2839 assert(size == 4); 2840 vtd_set_long(s, addr, val); 2841 break; 2842 2843 /* Protected Memory Enable Register, 32-bit */ 2844 case DMAR_PMEN_REG: 2845 assert(size == 4); 2846 vtd_set_long(s, addr, val); 2847 break; 2848 2849 /* Root Table Address Register, 64-bit */ 2850 case DMAR_RTADDR_REG: 2851 if (size == 4) { 2852 vtd_set_long(s, addr, val); 2853 } else { 2854 vtd_set_quad(s, addr, val); 2855 } 2856 break; 2857 2858 case DMAR_RTADDR_REG_HI: 2859 assert(size == 4); 2860 vtd_set_long(s, addr, val); 2861 break; 2862 2863 /* Invalidation Queue Tail Register, 64-bit */ 2864 case DMAR_IQT_REG: 2865 if (size == 4) { 2866 vtd_set_long(s, addr, val); 2867 } else { 2868 vtd_set_quad(s, addr, val); 2869 } 2870 vtd_handle_iqt_write(s); 2871 break; 2872 2873 case DMAR_IQT_REG_HI: 2874 assert(size == 4); 2875 vtd_set_long(s, addr, val); 2876 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2877 break; 2878 2879 /* Invalidation Queue Address Register, 64-bit */ 2880 case DMAR_IQA_REG: 2881 if (size == 4) { 2882 vtd_set_long(s, addr, val); 2883 } else { 2884 vtd_set_quad(s, addr, val); 2885 } 2886 if (s->ecap & VTD_ECAP_SMTS && 2887 val & VTD_IQA_DW_MASK) { 2888 s->iq_dw = true; 2889 } else { 2890 s->iq_dw = false; 2891 } 2892 break; 2893 2894 case DMAR_IQA_REG_HI: 2895 assert(size == 4); 2896 vtd_set_long(s, addr, val); 2897 break; 2898 2899 /* Invalidation Completion Status Register, 32-bit */ 2900 case DMAR_ICS_REG: 2901 assert(size == 4); 2902 vtd_set_long(s, addr, val); 2903 vtd_handle_ics_write(s); 2904 break; 2905 2906 /* Invalidation Event Control Register, 32-bit */ 2907 case DMAR_IECTL_REG: 2908 assert(size == 4); 2909 vtd_set_long(s, addr, val); 2910 vtd_handle_iectl_write(s); 2911 break; 2912 2913 /* Invalidation Event Data Register, 32-bit */ 2914 case DMAR_IEDATA_REG: 2915 assert(size == 4); 2916 vtd_set_long(s, addr, val); 2917 break; 2918 2919 /* Invalidation Event Address Register, 32-bit */ 2920 case DMAR_IEADDR_REG: 2921 assert(size == 4); 2922 vtd_set_long(s, addr, val); 2923 break; 2924 2925 /* Invalidation Event Upper Address Register, 32-bit */ 2926 case DMAR_IEUADDR_REG: 2927 assert(size == 4); 2928 vtd_set_long(s, addr, val); 2929 break; 2930 2931 /* Fault Recording Registers, 128-bit */ 2932 case DMAR_FRCD_REG_0_0: 2933 if (size == 4) { 2934 vtd_set_long(s, addr, val); 2935 } else { 2936 vtd_set_quad(s, addr, val); 2937 } 2938 break; 2939 2940 case DMAR_FRCD_REG_0_1: 2941 assert(size == 4); 2942 vtd_set_long(s, addr, val); 2943 break; 2944 2945 case DMAR_FRCD_REG_0_2: 2946 if (size == 4) { 2947 vtd_set_long(s, addr, val); 2948 } else { 2949 vtd_set_quad(s, addr, val); 2950 /* May clear bit 127 (Fault), update PPF */ 2951 vtd_update_fsts_ppf(s); 2952 } 2953 break; 2954 2955 case DMAR_FRCD_REG_0_3: 2956 assert(size == 4); 2957 vtd_set_long(s, addr, val); 2958 /* May clear bit 127 (Fault), update PPF */ 2959 vtd_update_fsts_ppf(s); 2960 break; 2961 2962 case DMAR_IRTA_REG: 2963 if (size == 4) { 2964 vtd_set_long(s, addr, val); 2965 } else { 2966 vtd_set_quad(s, addr, val); 2967 } 2968 break; 2969 2970 case DMAR_IRTA_REG_HI: 2971 assert(size == 4); 2972 vtd_set_long(s, addr, val); 2973 break; 2974 2975 default: 2976 if (size == 4) { 2977 vtd_set_long(s, addr, val); 2978 } else { 2979 vtd_set_quad(s, addr, val); 2980 } 2981 } 2982 } 2983 2984 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2985 IOMMUAccessFlags flag, int iommu_idx) 2986 { 2987 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2988 IntelIOMMUState *s = vtd_as->iommu_state; 2989 IOMMUTLBEntry iotlb = { 2990 /* We'll fill in the rest later. */ 2991 .target_as = &address_space_memory, 2992 }; 2993 bool success; 2994 2995 if (likely(s->dmar_enabled)) { 2996 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2997 addr, flag & IOMMU_WO, &iotlb); 2998 } else { 2999 /* DMAR disabled, passthrough, use 4k-page*/ 3000 iotlb.iova = addr & VTD_PAGE_MASK_4K; 3001 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 3002 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 3003 iotlb.perm = IOMMU_RW; 3004 success = true; 3005 } 3006 3007 if (likely(success)) { 3008 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 3009 VTD_PCI_SLOT(vtd_as->devfn), 3010 VTD_PCI_FUNC(vtd_as->devfn), 3011 iotlb.iova, iotlb.translated_addr, 3012 iotlb.addr_mask); 3013 } else { 3014 error_report_once("%s: detected translation failure " 3015 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 3016 __func__, pci_bus_num(vtd_as->bus), 3017 VTD_PCI_SLOT(vtd_as->devfn), 3018 VTD_PCI_FUNC(vtd_as->devfn), 3019 addr); 3020 } 3021 3022 return iotlb; 3023 } 3024 3025 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 3026 IOMMUNotifierFlag old, 3027 IOMMUNotifierFlag new, 3028 Error **errp) 3029 { 3030 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 3031 IntelIOMMUState *s = vtd_as->iommu_state; 3032 3033 /* TODO: add support for VFIO and vhost users */ 3034 if (s->snoop_control) { 3035 error_setg_errno(errp, -ENOTSUP, 3036 "Snoop Control with vhost or VFIO is not supported"); 3037 return -ENOTSUP; 3038 } 3039 3040 /* Update per-address-space notifier flags */ 3041 vtd_as->notifier_flags = new; 3042 3043 if (old == IOMMU_NOTIFIER_NONE) { 3044 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 3045 } else if (new == IOMMU_NOTIFIER_NONE) { 3046 QLIST_REMOVE(vtd_as, next); 3047 } 3048 return 0; 3049 } 3050 3051 static int vtd_post_load(void *opaque, int version_id) 3052 { 3053 IntelIOMMUState *iommu = opaque; 3054 3055 /* 3056 * Memory regions are dynamically turned on/off depending on 3057 * context entry configurations from the guest. After migration, 3058 * we need to make sure the memory regions are still correct. 3059 */ 3060 vtd_switch_address_space_all(iommu); 3061 3062 /* 3063 * We don't need to migrate the root_scalable because we can 3064 * simply do the calculation after the loading is complete. We 3065 * can actually do similar things with root, dmar_enabled, etc. 3066 * however since we've had them already so we'd better keep them 3067 * for compatibility of migration. 3068 */ 3069 vtd_update_scalable_state(iommu); 3070 3071 return 0; 3072 } 3073 3074 static const VMStateDescription vtd_vmstate = { 3075 .name = "iommu-intel", 3076 .version_id = 1, 3077 .minimum_version_id = 1, 3078 .priority = MIG_PRI_IOMMU, 3079 .post_load = vtd_post_load, 3080 .fields = (VMStateField[]) { 3081 VMSTATE_UINT64(root, IntelIOMMUState), 3082 VMSTATE_UINT64(intr_root, IntelIOMMUState), 3083 VMSTATE_UINT64(iq, IntelIOMMUState), 3084 VMSTATE_UINT32(intr_size, IntelIOMMUState), 3085 VMSTATE_UINT16(iq_head, IntelIOMMUState), 3086 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 3087 VMSTATE_UINT16(iq_size, IntelIOMMUState), 3088 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 3089 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 3090 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 3091 VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ 3092 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 3093 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 3094 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 3095 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 3096 VMSTATE_END_OF_LIST() 3097 } 3098 }; 3099 3100 static const MemoryRegionOps vtd_mem_ops = { 3101 .read = vtd_mem_read, 3102 .write = vtd_mem_write, 3103 .endianness = DEVICE_LITTLE_ENDIAN, 3104 .impl = { 3105 .min_access_size = 4, 3106 .max_access_size = 8, 3107 }, 3108 .valid = { 3109 .min_access_size = 4, 3110 .max_access_size = 8, 3111 }, 3112 }; 3113 3114 static Property vtd_properties[] = { 3115 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 3116 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 3117 ON_OFF_AUTO_AUTO), 3118 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 3119 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, 3120 VTD_HOST_ADDRESS_WIDTH), 3121 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 3122 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), 3123 DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), 3124 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), 3125 DEFINE_PROP_END_OF_LIST(), 3126 }; 3127 3128 /* Read IRTE entry with specific index */ 3129 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 3130 VTD_IR_TableEntry *entry, uint16_t sid) 3131 { 3132 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 3133 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 3134 dma_addr_t addr = 0x00; 3135 uint16_t mask, source_id; 3136 uint8_t bus, bus_max, bus_min; 3137 3138 if (index >= iommu->intr_size) { 3139 error_report_once("%s: index too large: ind=0x%x", 3140 __func__, index); 3141 return -VTD_FR_IR_INDEX_OVER; 3142 } 3143 3144 addr = iommu->intr_root + index * sizeof(*entry); 3145 if (dma_memory_read(&address_space_memory, addr, 3146 entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) { 3147 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 3148 __func__, index, addr); 3149 return -VTD_FR_IR_ROOT_INVAL; 3150 } 3151 3152 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 3153 le64_to_cpu(entry->data[0])); 3154 3155 if (!entry->irte.present) { 3156 error_report_once("%s: detected non-present IRTE " 3157 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3158 __func__, index, le64_to_cpu(entry->data[1]), 3159 le64_to_cpu(entry->data[0])); 3160 return -VTD_FR_IR_ENTRY_P; 3161 } 3162 3163 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 3164 entry->irte.__reserved_2) { 3165 error_report_once("%s: detected non-zero reserved IRTE " 3166 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3167 __func__, index, le64_to_cpu(entry->data[1]), 3168 le64_to_cpu(entry->data[0])); 3169 return -VTD_FR_IR_IRTE_RSVD; 3170 } 3171 3172 if (sid != X86_IOMMU_SID_INVALID) { 3173 /* Validate IRTE SID */ 3174 source_id = le32_to_cpu(entry->irte.source_id); 3175 switch (entry->irte.sid_vtype) { 3176 case VTD_SVT_NONE: 3177 break; 3178 3179 case VTD_SVT_ALL: 3180 mask = vtd_svt_mask[entry->irte.sid_q]; 3181 if ((source_id & mask) != (sid & mask)) { 3182 error_report_once("%s: invalid IRTE SID " 3183 "(index=%u, sid=%u, source_id=%u)", 3184 __func__, index, sid, source_id); 3185 return -VTD_FR_IR_SID_ERR; 3186 } 3187 break; 3188 3189 case VTD_SVT_BUS: 3190 bus_max = source_id >> 8; 3191 bus_min = source_id & 0xff; 3192 bus = sid >> 8; 3193 if (bus > bus_max || bus < bus_min) { 3194 error_report_once("%s: invalid SVT_BUS " 3195 "(index=%u, bus=%u, min=%u, max=%u)", 3196 __func__, index, bus, bus_min, bus_max); 3197 return -VTD_FR_IR_SID_ERR; 3198 } 3199 break; 3200 3201 default: 3202 error_report_once("%s: detected invalid IRTE SVT " 3203 "(index=%u, type=%d)", __func__, 3204 index, entry->irte.sid_vtype); 3205 /* Take this as verification failure. */ 3206 return -VTD_FR_IR_SID_ERR; 3207 } 3208 } 3209 3210 return 0; 3211 } 3212 3213 /* Fetch IRQ information of specific IR index */ 3214 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 3215 X86IOMMUIrq *irq, uint16_t sid) 3216 { 3217 VTD_IR_TableEntry irte = {}; 3218 int ret = 0; 3219 3220 ret = vtd_irte_get(iommu, index, &irte, sid); 3221 if (ret) { 3222 return ret; 3223 } 3224 3225 irq->trigger_mode = irte.irte.trigger_mode; 3226 irq->vector = irte.irte.vector; 3227 irq->delivery_mode = irte.irte.delivery_mode; 3228 irq->dest = le32_to_cpu(irte.irte.dest_id); 3229 if (!iommu->intr_eime) { 3230 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 3231 #define VTD_IR_APIC_DEST_SHIFT (8) 3232 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 3233 VTD_IR_APIC_DEST_SHIFT; 3234 } 3235 irq->dest_mode = irte.irte.dest_mode; 3236 irq->redir_hint = irte.irte.redir_hint; 3237 3238 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 3239 irq->delivery_mode, irq->dest, irq->dest_mode); 3240 3241 return 0; 3242 } 3243 3244 /* Interrupt remapping for MSI/MSI-X entry */ 3245 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 3246 MSIMessage *origin, 3247 MSIMessage *translated, 3248 uint16_t sid) 3249 { 3250 int ret = 0; 3251 VTD_IR_MSIAddress addr; 3252 uint16_t index; 3253 X86IOMMUIrq irq = {}; 3254 3255 assert(origin && translated); 3256 3257 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 3258 3259 if (!iommu || !iommu->intr_enabled) { 3260 memcpy(translated, origin, sizeof(*origin)); 3261 goto out; 3262 } 3263 3264 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 3265 error_report_once("%s: MSI address high 32 bits non-zero detected: " 3266 "address=0x%" PRIx64, __func__, origin->address); 3267 return -VTD_FR_IR_REQ_RSVD; 3268 } 3269 3270 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 3271 if (addr.addr.__head != 0xfee) { 3272 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 3273 __func__, addr.data); 3274 return -VTD_FR_IR_REQ_RSVD; 3275 } 3276 3277 /* This is compatible mode. */ 3278 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 3279 memcpy(translated, origin, sizeof(*origin)); 3280 goto out; 3281 } 3282 3283 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 3284 3285 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 3286 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 3287 3288 if (addr.addr.sub_valid) { 3289 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 3290 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 3291 } 3292 3293 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 3294 if (ret) { 3295 return ret; 3296 } 3297 3298 if (addr.addr.sub_valid) { 3299 trace_vtd_ir_remap_type("MSI"); 3300 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 3301 error_report_once("%s: invalid IR MSI " 3302 "(sid=%u, address=0x%" PRIx64 3303 ", data=0x%" PRIx32 ")", 3304 __func__, sid, origin->address, origin->data); 3305 return -VTD_FR_IR_REQ_RSVD; 3306 } 3307 } else { 3308 uint8_t vector = origin->data & 0xff; 3309 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 3310 3311 trace_vtd_ir_remap_type("IOAPIC"); 3312 /* IOAPIC entry vector should be aligned with IRTE vector 3313 * (see vt-d spec 5.1.5.1). */ 3314 if (vector != irq.vector) { 3315 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 3316 } 3317 3318 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 3319 * (see vt-d spec 5.1.5.1). */ 3320 if (trigger_mode != irq.trigger_mode) { 3321 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 3322 irq.trigger_mode); 3323 } 3324 } 3325 3326 /* 3327 * We'd better keep the last two bits, assuming that guest OS 3328 * might modify it. Keep it does not hurt after all. 3329 */ 3330 irq.msi_addr_last_bits = addr.addr.__not_care; 3331 3332 /* Translate X86IOMMUIrq to MSI message */ 3333 x86_iommu_irq_to_msi_message(&irq, translated); 3334 3335 out: 3336 trace_vtd_ir_remap_msi(origin->address, origin->data, 3337 translated->address, translated->data); 3338 return 0; 3339 } 3340 3341 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 3342 MSIMessage *dst, uint16_t sid) 3343 { 3344 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 3345 src, dst, sid); 3346 } 3347 3348 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 3349 uint64_t *data, unsigned size, 3350 MemTxAttrs attrs) 3351 { 3352 return MEMTX_OK; 3353 } 3354 3355 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 3356 uint64_t value, unsigned size, 3357 MemTxAttrs attrs) 3358 { 3359 int ret = 0; 3360 MSIMessage from = {}, to = {}; 3361 uint16_t sid = X86_IOMMU_SID_INVALID; 3362 3363 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 3364 from.data = (uint32_t) value; 3365 3366 if (!attrs.unspecified) { 3367 /* We have explicit Source ID */ 3368 sid = attrs.requester_id; 3369 } 3370 3371 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 3372 if (ret) { 3373 /* TODO: report error */ 3374 /* Drop this interrupt */ 3375 return MEMTX_ERROR; 3376 } 3377 3378 apic_get_class()->send_msi(&to); 3379 3380 return MEMTX_OK; 3381 } 3382 3383 static const MemoryRegionOps vtd_mem_ir_ops = { 3384 .read_with_attrs = vtd_mem_ir_read, 3385 .write_with_attrs = vtd_mem_ir_write, 3386 .endianness = DEVICE_LITTLE_ENDIAN, 3387 .impl = { 3388 .min_access_size = 4, 3389 .max_access_size = 4, 3390 }, 3391 .valid = { 3392 .min_access_size = 4, 3393 .max_access_size = 4, 3394 }, 3395 }; 3396 3397 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 3398 { 3399 uintptr_t key = (uintptr_t)bus; 3400 VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); 3401 VTDAddressSpace *vtd_dev_as; 3402 char name[128]; 3403 3404 if (!vtd_bus) { 3405 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 3406 *new_key = (uintptr_t)bus; 3407 /* No corresponding free() */ 3408 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 3409 PCI_DEVFN_MAX); 3410 vtd_bus->bus = bus; 3411 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 3412 } 3413 3414 vtd_dev_as = vtd_bus->dev_as[devfn]; 3415 3416 if (!vtd_dev_as) { 3417 snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), 3418 PCI_FUNC(devfn)); 3419 vtd_bus->dev_as[devfn] = vtd_dev_as = g_new0(VTDAddressSpace, 1); 3420 3421 vtd_dev_as->bus = bus; 3422 vtd_dev_as->devfn = (uint8_t)devfn; 3423 vtd_dev_as->iommu_state = s; 3424 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 3425 vtd_dev_as->iova_tree = iova_tree_new(); 3426 3427 memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); 3428 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); 3429 3430 /* 3431 * Build the DMAR-disabled container with aliases to the 3432 * shared MRs. Note that aliasing to a shared memory region 3433 * could help the memory API to detect same FlatViews so we 3434 * can have devices to share the same FlatView when DMAR is 3435 * disabled (either by not providing "intel_iommu=on" or with 3436 * "iommu=pt"). It will greatly reduce the total number of 3437 * FlatViews of the system hence VM runs faster. 3438 */ 3439 memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), 3440 "vtd-nodmar", &s->mr_nodmar, 0, 3441 memory_region_size(&s->mr_nodmar)); 3442 3443 /* 3444 * Build the per-device DMAR-enabled container. 3445 * 3446 * TODO: currently we have per-device IOMMU memory region only 3447 * because we have per-device IOMMU notifiers for devices. If 3448 * one day we can abstract the IOMMU notifiers out of the 3449 * memory regions then we can also share the same memory 3450 * region here just like what we've done above with the nodmar 3451 * region. 3452 */ 3453 strcat(name, "-dmar"); 3454 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 3455 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 3456 name, UINT64_MAX); 3457 memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", 3458 &s->mr_ir, 0, memory_region_size(&s->mr_ir)); 3459 memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), 3460 VTD_INTERRUPT_ADDR_FIRST, 3461 &vtd_dev_as->iommu_ir, 1); 3462 3463 /* 3464 * Hook both the containers under the root container, we 3465 * switch between DMAR & noDMAR by enable/disable 3466 * corresponding sub-containers 3467 */ 3468 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3469 MEMORY_REGION(&vtd_dev_as->iommu), 3470 0); 3471 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3472 &vtd_dev_as->nodmar, 0); 3473 3474 vtd_switch_address_space(vtd_dev_as); 3475 } 3476 return vtd_dev_as; 3477 } 3478 3479 /* Unmap the whole range in the notifier's scope. */ 3480 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 3481 { 3482 hwaddr size, remain; 3483 hwaddr start = n->start; 3484 hwaddr end = n->end; 3485 IntelIOMMUState *s = as->iommu_state; 3486 DMAMap map; 3487 3488 /* 3489 * Note: all the codes in this function has a assumption that IOVA 3490 * bits are no more than VTD_MGAW bits (which is restricted by 3491 * VT-d spec), otherwise we need to consider overflow of 64 bits. 3492 */ 3493 3494 if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { 3495 /* 3496 * Don't need to unmap regions that is bigger than the whole 3497 * VT-d supported address space size 3498 */ 3499 end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; 3500 } 3501 3502 assert(start <= end); 3503 size = remain = end - start + 1; 3504 3505 while (remain >= VTD_PAGE_SIZE) { 3506 IOMMUTLBEvent event; 3507 uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); 3508 uint64_t size = mask + 1; 3509 3510 assert(size); 3511 3512 event.type = IOMMU_NOTIFIER_UNMAP; 3513 event.entry.iova = start; 3514 event.entry.addr_mask = mask; 3515 event.entry.target_as = &address_space_memory; 3516 event.entry.perm = IOMMU_NONE; 3517 /* This field is meaningless for unmap */ 3518 event.entry.translated_addr = 0; 3519 3520 memory_region_notify_iommu_one(n, &event); 3521 3522 start += size; 3523 remain -= size; 3524 } 3525 3526 assert(!remain); 3527 3528 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3529 VTD_PCI_SLOT(as->devfn), 3530 VTD_PCI_FUNC(as->devfn), 3531 n->start, size); 3532 3533 map.iova = n->start; 3534 map.size = size; 3535 iova_tree_remove(as->iova_tree, &map); 3536 } 3537 3538 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3539 { 3540 VTDAddressSpace *vtd_as; 3541 IOMMUNotifier *n; 3542 3543 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3544 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3545 vtd_address_space_unmap(vtd_as, n); 3546 } 3547 } 3548 } 3549 3550 static void vtd_address_space_refresh_all(IntelIOMMUState *s) 3551 { 3552 vtd_address_space_unmap_all(s); 3553 vtd_switch_address_space_all(s); 3554 } 3555 3556 static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) 3557 { 3558 memory_region_notify_iommu_one(private, event); 3559 return 0; 3560 } 3561 3562 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3563 { 3564 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3565 IntelIOMMUState *s = vtd_as->iommu_state; 3566 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3567 VTDContextEntry ce; 3568 3569 /* 3570 * The replay can be triggered by either a invalidation or a newly 3571 * created entry. No matter what, we release existing mappings 3572 * (it means flushing caches for UNMAP-only registers). 3573 */ 3574 vtd_address_space_unmap(vtd_as, n); 3575 3576 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3577 trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : 3578 "legacy mode", 3579 bus_n, PCI_SLOT(vtd_as->devfn), 3580 PCI_FUNC(vtd_as->devfn), 3581 vtd_get_domain_id(s, &ce), 3582 ce.hi, ce.lo); 3583 if (vtd_as_has_map_notifier(vtd_as)) { 3584 /* This is required only for MAP typed notifiers */ 3585 vtd_page_walk_info info = { 3586 .hook_fn = vtd_replay_hook, 3587 .private = (void *)n, 3588 .notify_unmap = false, 3589 .aw = s->aw_bits, 3590 .as = vtd_as, 3591 .domain_id = vtd_get_domain_id(s, &ce), 3592 }; 3593 3594 vtd_page_walk(s, &ce, 0, ~0ULL, &info); 3595 } 3596 } else { 3597 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3598 PCI_FUNC(vtd_as->devfn)); 3599 } 3600 3601 return; 3602 } 3603 3604 /* Do the initialization. It will also be called when reset, so pay 3605 * attention when adding new initialization stuff. 3606 */ 3607 static void vtd_init(IntelIOMMUState *s) 3608 { 3609 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3610 3611 memset(s->csr, 0, DMAR_REG_SIZE); 3612 memset(s->wmask, 0, DMAR_REG_SIZE); 3613 memset(s->w1cmask, 0, DMAR_REG_SIZE); 3614 memset(s->womask, 0, DMAR_REG_SIZE); 3615 3616 s->root = 0; 3617 s->root_scalable = false; 3618 s->dmar_enabled = false; 3619 s->intr_enabled = false; 3620 s->iq_head = 0; 3621 s->iq_tail = 0; 3622 s->iq = 0; 3623 s->iq_size = 0; 3624 s->qi_enabled = false; 3625 s->iq_last_desc_type = VTD_INV_DESC_NONE; 3626 s->iq_dw = false; 3627 s->next_frcd_reg = 0; 3628 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | 3629 VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | 3630 VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); 3631 if (s->dma_drain) { 3632 s->cap |= VTD_CAP_DRAIN; 3633 } 3634 if (s->aw_bits == VTD_HOST_AW_48BIT) { 3635 s->cap |= VTD_CAP_SAGAW_48bit; 3636 } 3637 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; 3638 3639 /* 3640 * Rsvd field masks for spte 3641 */ 3642 vtd_spte_rsvd[0] = ~0ULL; 3643 vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, 3644 x86_iommu->dt_supported); 3645 vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); 3646 vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); 3647 vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); 3648 3649 vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, 3650 x86_iommu->dt_supported); 3651 vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, 3652 x86_iommu->dt_supported); 3653 3654 if (s->scalable_mode || s->snoop_control) { 3655 vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; 3656 vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; 3657 vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; 3658 } 3659 3660 if (x86_iommu_ir_supported(x86_iommu)) { 3661 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; 3662 if (s->intr_eim == ON_OFF_AUTO_ON) { 3663 s->ecap |= VTD_ECAP_EIM; 3664 } 3665 assert(s->intr_eim != ON_OFF_AUTO_AUTO); 3666 } 3667 3668 if (x86_iommu->dt_supported) { 3669 s->ecap |= VTD_ECAP_DT; 3670 } 3671 3672 if (x86_iommu->pt_supported) { 3673 s->ecap |= VTD_ECAP_PT; 3674 } 3675 3676 if (s->caching_mode) { 3677 s->cap |= VTD_CAP_CM; 3678 } 3679 3680 /* TODO: read cap/ecap from host to decide which cap to be exposed. */ 3681 if (s->scalable_mode) { 3682 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; 3683 } 3684 3685 if (s->snoop_control) { 3686 s->ecap |= VTD_ECAP_SC; 3687 } 3688 3689 vtd_reset_caches(s); 3690 3691 /* Define registers with default values and bit semantics */ 3692 vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); 3693 vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); 3694 vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); 3695 vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); 3696 vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); 3697 vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); 3698 vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0); 3699 vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); 3700 vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); 3701 3702 /* Advanced Fault Logging not supported */ 3703 vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); 3704 vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3705 vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); 3706 vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); 3707 3708 /* Treated as RsvdZ when EIM in ECAP_REG is not supported 3709 * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); 3710 */ 3711 vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); 3712 3713 /* Treated as RO for implementations that PLMR and PHMR fields reported 3714 * as Clear in the CAP_REG. 3715 * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); 3716 */ 3717 vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); 3718 3719 vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); 3720 vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); 3721 vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0); 3722 vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); 3723 vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3724 vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); 3725 vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); 3726 /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */ 3727 vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); 3728 3729 /* IOTLB registers */ 3730 vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0); 3731 vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); 3732 vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); 3733 3734 /* Fault Recording Registers, 128-bit */ 3735 vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); 3736 vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); 3737 3738 /* 3739 * Interrupt remapping registers. 3740 */ 3741 vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); 3742 } 3743 3744 /* Should not reset address_spaces when reset because devices will still use 3745 * the address space they got at first (won't ask the bus again). 3746 */ 3747 static void vtd_reset(DeviceState *dev) 3748 { 3749 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3750 3751 vtd_init(s); 3752 vtd_address_space_refresh_all(s); 3753 } 3754 3755 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3756 { 3757 IntelIOMMUState *s = opaque; 3758 VTDAddressSpace *vtd_as; 3759 3760 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3761 3762 vtd_as = vtd_find_add_as(s, bus, devfn); 3763 return &vtd_as->as; 3764 } 3765 3766 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3767 { 3768 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3769 3770 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { 3771 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3772 return false; 3773 } 3774 3775 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3776 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3777 && x86_iommu_ir_supported(x86_iommu) ? 3778 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3779 } 3780 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3781 if (!kvm_irqchip_in_kernel()) { 3782 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3783 return false; 3784 } 3785 if (!kvm_enable_x2apic()) { 3786 error_setg(errp, "eim=on requires support on the KVM side" 3787 "(X2APIC_API, first shipped in v4.7)"); 3788 return false; 3789 } 3790 } 3791 3792 /* Currently only address widths supported are 39 and 48 bits */ 3793 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3794 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3795 error_setg(errp, "Supported values for aw-bits are: %d, %d", 3796 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3797 return false; 3798 } 3799 3800 if (s->scalable_mode && !s->dma_drain) { 3801 error_setg(errp, "Need to set dma_drain for scalable mode"); 3802 return false; 3803 } 3804 3805 return true; 3806 } 3807 3808 static int vtd_machine_done_notify_one(Object *child, void *unused) 3809 { 3810 IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); 3811 3812 /* 3813 * We hard-coded here because vfio-pci is the only special case 3814 * here. Let's be more elegant in the future when we can, but so 3815 * far there seems to be no better way. 3816 */ 3817 if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { 3818 vtd_panic_require_caching_mode(); 3819 } 3820 3821 return 0; 3822 } 3823 3824 static void vtd_machine_done_hook(Notifier *notifier, void *unused) 3825 { 3826 object_child_foreach_recursive(object_get_root(), 3827 vtd_machine_done_notify_one, NULL); 3828 } 3829 3830 static Notifier vtd_machine_done_notify = { 3831 .notify = vtd_machine_done_hook, 3832 }; 3833 3834 static void vtd_realize(DeviceState *dev, Error **errp) 3835 { 3836 MachineState *ms = MACHINE(qdev_get_machine()); 3837 PCMachineState *pcms = PC_MACHINE(ms); 3838 X86MachineState *x86ms = X86_MACHINE(ms); 3839 PCIBus *bus = pcms->bus; 3840 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3841 3842 if (!vtd_decide_config(s, errp)) { 3843 return; 3844 } 3845 3846 QLIST_INIT(&s->vtd_as_with_notifiers); 3847 qemu_mutex_init(&s->iommu_lock); 3848 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3849 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3850 "intel_iommu", DMAR_REG_SIZE); 3851 3852 /* Create the shared memory regions by all devices */ 3853 memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", 3854 UINT64_MAX); 3855 memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, 3856 s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); 3857 memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), 3858 "vtd-sys-alias", get_system_memory(), 0, 3859 memory_region_size(get_system_memory())); 3860 memory_region_add_subregion_overlap(&s->mr_nodmar, 0, 3861 &s->mr_sys_alias, 0); 3862 memory_region_add_subregion_overlap(&s->mr_nodmar, 3863 VTD_INTERRUPT_ADDR_FIRST, 3864 &s->mr_ir, 1); 3865 3866 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3867 /* No corresponding destroy */ 3868 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3869 g_free, g_free); 3870 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3871 g_free, g_free); 3872 vtd_init(s); 3873 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3874 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3875 /* Pseudo address space under root PCI bus. */ 3876 x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3877 qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); 3878 } 3879 3880 static void vtd_class_init(ObjectClass *klass, void *data) 3881 { 3882 DeviceClass *dc = DEVICE_CLASS(klass); 3883 X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); 3884 3885 dc->reset = vtd_reset; 3886 dc->vmsd = &vtd_vmstate; 3887 device_class_set_props(dc, vtd_properties); 3888 dc->hotpluggable = false; 3889 x86_class->realize = vtd_realize; 3890 x86_class->int_remap = vtd_int_remap; 3891 /* Supported by the pc-q35-* machine types */ 3892 dc->user_creatable = true; 3893 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3894 dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; 3895 } 3896 3897 static const TypeInfo vtd_info = { 3898 .name = TYPE_INTEL_IOMMU_DEVICE, 3899 .parent = TYPE_X86_IOMMU_DEVICE, 3900 .instance_size = sizeof(IntelIOMMUState), 3901 .class_init = vtd_class_init, 3902 }; 3903 3904 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3905 void *data) 3906 { 3907 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3908 3909 imrc->translate = vtd_iommu_translate; 3910 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3911 imrc->replay = vtd_iommu_replay; 3912 } 3913 3914 static const TypeInfo vtd_iommu_memory_region_info = { 3915 .parent = TYPE_IOMMU_MEMORY_REGION, 3916 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3917 .class_init = vtd_iommu_memory_region_class_init, 3918 }; 3919 3920 static void vtd_register_types(void) 3921 { 3922 type_register_static(&vtd_info); 3923 type_register_static(&vtd_iommu_memory_region_info); 3924 } 3925 3926 type_init(vtd_register_types) 3927