1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qemu/main-loop.h" 25 #include "qapi/error.h" 26 #include "hw/sysbus.h" 27 #include "exec/address-spaces.h" 28 #include "intel_iommu_internal.h" 29 #include "hw/pci/pci.h" 30 #include "hw/pci/pci_bus.h" 31 #include "hw/qdev-properties.h" 32 #include "hw/i386/pc.h" 33 #include "hw/i386/apic-msidef.h" 34 #include "hw/boards.h" 35 #include "hw/i386/x86-iommu.h" 36 #include "hw/pci-host/q35.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/sysemu.h" 39 #include "hw/i386/apic_internal.h" 40 #include "kvm_i386.h" 41 #include "migration/vmstate.h" 42 #include "trace.h" 43 44 /* context entry operations */ 45 #define VTD_CE_GET_RID2PASID(ce) \ 46 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) 47 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ 48 ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) 49 50 /* pe operations */ 51 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) 52 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) 53 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ 54 if (ret_fr) { \ 55 ret_fr = -ret_fr; \ 56 if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ 57 trace_vtd_fault_disabled(); \ 58 } else { \ 59 vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ 60 } \ 61 goto error; \ 62 } \ 63 } 64 65 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 66 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 67 68 static void vtd_panic_require_caching_mode(void) 69 { 70 error_report("We need to set caching-mode=on for intel-iommu to enable " 71 "device assignment with IOMMU protection."); 72 exit(1); 73 } 74 75 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 76 uint64_t wmask, uint64_t w1cmask) 77 { 78 stq_le_p(&s->csr[addr], val); 79 stq_le_p(&s->wmask[addr], wmask); 80 stq_le_p(&s->w1cmask[addr], w1cmask); 81 } 82 83 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 84 { 85 stq_le_p(&s->womask[addr], mask); 86 } 87 88 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 89 uint32_t wmask, uint32_t w1cmask) 90 { 91 stl_le_p(&s->csr[addr], val); 92 stl_le_p(&s->wmask[addr], wmask); 93 stl_le_p(&s->w1cmask[addr], w1cmask); 94 } 95 96 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 97 { 98 stl_le_p(&s->womask[addr], mask); 99 } 100 101 /* "External" get/set operations */ 102 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 103 { 104 uint64_t oldval = ldq_le_p(&s->csr[addr]); 105 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 106 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 107 stq_le_p(&s->csr[addr], 108 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 109 } 110 111 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 112 { 113 uint32_t oldval = ldl_le_p(&s->csr[addr]); 114 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 115 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 116 stl_le_p(&s->csr[addr], 117 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 118 } 119 120 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 121 { 122 uint64_t val = ldq_le_p(&s->csr[addr]); 123 uint64_t womask = ldq_le_p(&s->womask[addr]); 124 return val & ~womask; 125 } 126 127 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 128 { 129 uint32_t val = ldl_le_p(&s->csr[addr]); 130 uint32_t womask = ldl_le_p(&s->womask[addr]); 131 return val & ~womask; 132 } 133 134 /* "Internal" get/set operations */ 135 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 136 { 137 return ldq_le_p(&s->csr[addr]); 138 } 139 140 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 141 { 142 return ldl_le_p(&s->csr[addr]); 143 } 144 145 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 146 { 147 stq_le_p(&s->csr[addr], val); 148 } 149 150 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 151 uint32_t clear, uint32_t mask) 152 { 153 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 154 stl_le_p(&s->csr[addr], new_val); 155 return new_val; 156 } 157 158 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 159 uint64_t clear, uint64_t mask) 160 { 161 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 162 stq_le_p(&s->csr[addr], new_val); 163 return new_val; 164 } 165 166 static inline void vtd_iommu_lock(IntelIOMMUState *s) 167 { 168 qemu_mutex_lock(&s->iommu_lock); 169 } 170 171 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 172 { 173 qemu_mutex_unlock(&s->iommu_lock); 174 } 175 176 static void vtd_update_scalable_state(IntelIOMMUState *s) 177 { 178 uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 179 180 if (s->scalable_mode) { 181 s->root_scalable = val & VTD_RTADDR_SMT; 182 } 183 } 184 185 /* Whether the address space needs to notify new mappings */ 186 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 187 { 188 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 189 } 190 191 /* GHashTable functions */ 192 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 193 { 194 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 195 } 196 197 static guint vtd_uint64_hash(gconstpointer v) 198 { 199 return (guint)*(const uint64_t *)v; 200 } 201 202 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 203 gpointer user_data) 204 { 205 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 206 uint16_t domain_id = *(uint16_t *)user_data; 207 return entry->domain_id == domain_id; 208 } 209 210 /* The shift of an addr for a certain level of paging structure */ 211 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 212 { 213 assert(level != 0); 214 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 215 } 216 217 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 218 { 219 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 220 } 221 222 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 223 gpointer user_data) 224 { 225 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 226 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 227 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 228 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 229 return (entry->domain_id == info->domain_id) && 230 (((entry->gfn & info->mask) == gfn) || 231 (entry->gfn == gfn_tlb)); 232 } 233 234 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 235 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 236 */ 237 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 238 { 239 VTDAddressSpace *vtd_as; 240 VTDBus *vtd_bus; 241 GHashTableIter bus_it; 242 uint32_t devfn_it; 243 244 trace_vtd_context_cache_reset(); 245 246 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 247 248 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 249 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 250 vtd_as = vtd_bus->dev_as[devfn_it]; 251 if (!vtd_as) { 252 continue; 253 } 254 vtd_as->context_cache_entry.context_cache_gen = 0; 255 } 256 } 257 s->context_cache_gen = 1; 258 } 259 260 /* Must be called with IOMMU lock held. */ 261 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 262 { 263 assert(s->iotlb); 264 g_hash_table_remove_all(s->iotlb); 265 } 266 267 static void vtd_reset_iotlb(IntelIOMMUState *s) 268 { 269 vtd_iommu_lock(s); 270 vtd_reset_iotlb_locked(s); 271 vtd_iommu_unlock(s); 272 } 273 274 static void vtd_reset_caches(IntelIOMMUState *s) 275 { 276 vtd_iommu_lock(s); 277 vtd_reset_iotlb_locked(s); 278 vtd_reset_context_cache_locked(s); 279 vtd_iommu_unlock(s); 280 } 281 282 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 283 uint32_t level) 284 { 285 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 286 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 287 } 288 289 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 290 { 291 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 292 } 293 294 /* Must be called with IOMMU lock held */ 295 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 296 hwaddr addr) 297 { 298 VTDIOTLBEntry *entry; 299 uint64_t key; 300 int level; 301 302 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 303 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 304 source_id, level); 305 entry = g_hash_table_lookup(s->iotlb, &key); 306 if (entry) { 307 goto out; 308 } 309 } 310 311 out: 312 return entry; 313 } 314 315 /* Must be with IOMMU lock held */ 316 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 317 uint16_t domain_id, hwaddr addr, uint64_t slpte, 318 uint8_t access_flags, uint32_t level) 319 { 320 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 321 uint64_t *key = g_malloc(sizeof(*key)); 322 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 323 324 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 325 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 326 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 327 vtd_reset_iotlb_locked(s); 328 } 329 330 entry->gfn = gfn; 331 entry->domain_id = domain_id; 332 entry->slpte = slpte; 333 entry->access_flags = access_flags; 334 entry->mask = vtd_slpt_level_page_mask(level); 335 *key = vtd_get_iotlb_key(gfn, source_id, level); 336 g_hash_table_replace(s->iotlb, key, entry); 337 } 338 339 /* Given the reg addr of both the message data and address, generate an 340 * interrupt via MSI. 341 */ 342 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 343 hwaddr mesg_data_reg) 344 { 345 MSIMessage msi; 346 347 assert(mesg_data_reg < DMAR_REG_SIZE); 348 assert(mesg_addr_reg < DMAR_REG_SIZE); 349 350 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 351 msi.data = vtd_get_long_raw(s, mesg_data_reg); 352 353 trace_vtd_irq_generate(msi.address, msi.data); 354 355 apic_get_class()->send_msi(&msi); 356 } 357 358 /* Generate a fault event to software via MSI if conditions are met. 359 * Notice that the value of FSTS_REG being passed to it should be the one 360 * before any update. 361 */ 362 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) 363 { 364 if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || 365 pre_fsts & VTD_FSTS_IQE) { 366 error_report_once("There are previous interrupt conditions " 367 "to be serviced by software, fault event " 368 "is not generated"); 369 return; 370 } 371 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); 372 if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { 373 error_report_once("Interrupt Mask set, irq is not generated"); 374 } else { 375 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 376 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 377 } 378 } 379 380 /* Check if the Fault (F) field of the Fault Recording Register referenced by 381 * @index is Set. 382 */ 383 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) 384 { 385 /* Each reg is 128-bit */ 386 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 387 addr += 8; /* Access the high 64-bit half */ 388 389 assert(index < DMAR_FRCD_REG_NR); 390 391 return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; 392 } 393 394 /* Update the PPF field of Fault Status Register. 395 * Should be called whenever change the F field of any fault recording 396 * registers. 397 */ 398 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 399 { 400 uint32_t i; 401 uint32_t ppf_mask = 0; 402 403 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 404 if (vtd_is_frcd_set(s, i)) { 405 ppf_mask = VTD_FSTS_PPF; 406 break; 407 } 408 } 409 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 410 trace_vtd_fsts_ppf(!!ppf_mask); 411 } 412 413 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 414 { 415 /* Each reg is 128-bit */ 416 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 417 addr += 8; /* Access the high 64-bit half */ 418 419 assert(index < DMAR_FRCD_REG_NR); 420 421 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 422 vtd_update_fsts_ppf(s); 423 } 424 425 /* Must not update F field now, should be done later */ 426 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 427 uint16_t source_id, hwaddr addr, 428 VTDFaultReason fault, bool is_write) 429 { 430 uint64_t hi = 0, lo; 431 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 432 433 assert(index < DMAR_FRCD_REG_NR); 434 435 lo = VTD_FRCD_FI(addr); 436 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 437 if (!is_write) { 438 hi |= VTD_FRCD_T; 439 } 440 vtd_set_quad_raw(s, frcd_reg_addr, lo); 441 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 442 443 trace_vtd_frr_new(index, hi, lo); 444 } 445 446 /* Try to collapse multiple pending faults from the same requester */ 447 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 448 { 449 uint32_t i; 450 uint64_t frcd_reg; 451 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 452 453 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 454 frcd_reg = vtd_get_quad_raw(s, addr); 455 if ((frcd_reg & VTD_FRCD_F) && 456 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 457 return true; 458 } 459 addr += 16; /* 128-bit for each */ 460 } 461 return false; 462 } 463 464 /* Log and report an DMAR (address translation) fault to software */ 465 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 466 hwaddr addr, VTDFaultReason fault, 467 bool is_write) 468 { 469 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 470 471 assert(fault < VTD_FR_MAX); 472 473 if (fault == VTD_FR_RESERVED_ERR) { 474 /* This is not a normal fault reason case. Drop it. */ 475 return; 476 } 477 478 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 479 480 if (fsts_reg & VTD_FSTS_PFO) { 481 error_report_once("New fault is not recorded due to " 482 "Primary Fault Overflow"); 483 return; 484 } 485 486 if (vtd_try_collapse_fault(s, source_id)) { 487 error_report_once("New fault is not recorded due to " 488 "compression of faults"); 489 return; 490 } 491 492 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 493 error_report_once("Next Fault Recording Reg is used, " 494 "new fault is not recorded, set PFO field"); 495 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 496 return; 497 } 498 499 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 500 501 if (fsts_reg & VTD_FSTS_PPF) { 502 error_report_once("There are pending faults already, " 503 "fault event is not generated"); 504 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 505 s->next_frcd_reg++; 506 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 507 s->next_frcd_reg = 0; 508 } 509 } else { 510 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 511 VTD_FSTS_FRI(s->next_frcd_reg)); 512 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 513 s->next_frcd_reg++; 514 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 515 s->next_frcd_reg = 0; 516 } 517 /* This case actually cause the PPF to be Set. 518 * So generate fault event (interrupt). 519 */ 520 vtd_generate_fault_event(s, fsts_reg); 521 } 522 } 523 524 /* Handle Invalidation Queue Errors of queued invalidation interface error 525 * conditions. 526 */ 527 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 528 { 529 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 530 531 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 532 vtd_generate_fault_event(s, fsts_reg); 533 } 534 535 /* Set the IWC field and try to generate an invalidation completion interrupt */ 536 static void vtd_generate_completion_event(IntelIOMMUState *s) 537 { 538 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 539 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 540 return; 541 } 542 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 543 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 544 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 545 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 546 "new event not generated"); 547 return; 548 } else { 549 /* Generate the interrupt event */ 550 trace_vtd_inv_desc_wait_irq("Generating complete event"); 551 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 552 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 553 } 554 } 555 556 static inline bool vtd_root_entry_present(IntelIOMMUState *s, 557 VTDRootEntry *re, 558 uint8_t devfn) 559 { 560 if (s->root_scalable && devfn > UINT8_MAX / 2) { 561 return re->hi & VTD_ROOT_ENTRY_P; 562 } 563 564 return re->lo & VTD_ROOT_ENTRY_P; 565 } 566 567 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 568 VTDRootEntry *re) 569 { 570 dma_addr_t addr; 571 572 addr = s->root + index * sizeof(*re); 573 if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { 574 re->lo = 0; 575 return -VTD_FR_ROOT_TABLE_INV; 576 } 577 re->lo = le64_to_cpu(re->lo); 578 re->hi = le64_to_cpu(re->hi); 579 return 0; 580 } 581 582 static inline bool vtd_ce_present(VTDContextEntry *context) 583 { 584 return context->lo & VTD_CONTEXT_ENTRY_P; 585 } 586 587 static int vtd_get_context_entry_from_root(IntelIOMMUState *s, 588 VTDRootEntry *re, 589 uint8_t index, 590 VTDContextEntry *ce) 591 { 592 dma_addr_t addr, ce_size; 593 594 /* we have checked that root entry is present */ 595 ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE : 596 VTD_CTX_ENTRY_LEGACY_SIZE; 597 598 if (s->root_scalable && index > UINT8_MAX / 2) { 599 index = index & (~VTD_DEVFN_CHECK_MASK); 600 addr = re->hi & VTD_ROOT_ENTRY_CTP; 601 } else { 602 addr = re->lo & VTD_ROOT_ENTRY_CTP; 603 } 604 605 addr = addr + index * ce_size; 606 if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { 607 return -VTD_FR_CONTEXT_TABLE_INV; 608 } 609 610 ce->lo = le64_to_cpu(ce->lo); 611 ce->hi = le64_to_cpu(ce->hi); 612 if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { 613 ce->val[2] = le64_to_cpu(ce->val[2]); 614 ce->val[3] = le64_to_cpu(ce->val[3]); 615 } 616 return 0; 617 } 618 619 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) 620 { 621 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; 622 } 623 624 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) 625 { 626 return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); 627 } 628 629 /* Whether the pte indicates the address of the page frame */ 630 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) 631 { 632 return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); 633 } 634 635 /* Get the content of a spte located in @base_addr[@index] */ 636 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) 637 { 638 uint64_t slpte; 639 640 assert(index < VTD_SL_PT_ENTRY_NR); 641 642 if (dma_memory_read(&address_space_memory, 643 base_addr + index * sizeof(slpte), &slpte, 644 sizeof(slpte))) { 645 slpte = (uint64_t)-1; 646 return slpte; 647 } 648 slpte = le64_to_cpu(slpte); 649 return slpte; 650 } 651 652 /* Given an iova and the level of paging structure, return the offset 653 * of current level. 654 */ 655 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) 656 { 657 return (iova >> vtd_slpt_level_shift(level)) & 658 ((1ULL << VTD_SL_LEVEL_BITS) - 1); 659 } 660 661 /* Check Capability Register to see if the @level of page-table is supported */ 662 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) 663 { 664 return VTD_CAP_SAGAW_MASK & s->cap & 665 (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); 666 } 667 668 /* Return true if check passed, otherwise false */ 669 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, 670 VTDPASIDEntry *pe) 671 { 672 switch (VTD_PE_GET_TYPE(pe)) { 673 case VTD_SM_PASID_ENTRY_FLT: 674 case VTD_SM_PASID_ENTRY_SLT: 675 case VTD_SM_PASID_ENTRY_NESTED: 676 break; 677 case VTD_SM_PASID_ENTRY_PT: 678 if (!x86_iommu->pt_supported) { 679 return false; 680 } 681 break; 682 default: 683 /* Unknwon type */ 684 return false; 685 } 686 return true; 687 } 688 689 static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, 690 uint32_t pasid, 691 VTDPASIDDirEntry *pdire) 692 { 693 uint32_t index; 694 dma_addr_t addr, entry_size; 695 696 index = VTD_PASID_DIR_INDEX(pasid); 697 entry_size = VTD_PASID_DIR_ENTRY_SIZE; 698 addr = pasid_dir_base + index * entry_size; 699 if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { 700 return -VTD_FR_PASID_TABLE_INV; 701 } 702 703 return 0; 704 } 705 706 static int vtd_get_pasid_entry(IntelIOMMUState *s, 707 uint32_t pasid, 708 VTDPASIDDirEntry *pdire, 709 VTDPASIDEntry *pe) 710 { 711 uint32_t index; 712 dma_addr_t addr, entry_size; 713 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 714 715 index = VTD_PASID_TABLE_INDEX(pasid); 716 entry_size = VTD_PASID_ENTRY_SIZE; 717 addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; 718 addr = addr + index * entry_size; 719 if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { 720 return -VTD_FR_PASID_TABLE_INV; 721 } 722 723 /* Do translation type check */ 724 if (!vtd_pe_type_check(x86_iommu, pe)) { 725 return -VTD_FR_PASID_TABLE_INV; 726 } 727 728 if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { 729 return -VTD_FR_PASID_TABLE_INV; 730 } 731 732 return 0; 733 } 734 735 static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s, 736 dma_addr_t pasid_dir_base, 737 uint32_t pasid, 738 VTDPASIDEntry *pe) 739 { 740 int ret; 741 VTDPASIDDirEntry pdire; 742 743 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 744 if (ret) { 745 return ret; 746 } 747 748 ret = vtd_get_pasid_entry(s, pasid, &pdire, pe); 749 if (ret) { 750 return ret; 751 } 752 753 return ret; 754 } 755 756 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, 757 VTDContextEntry *ce, 758 VTDPASIDEntry *pe) 759 { 760 uint32_t pasid; 761 dma_addr_t pasid_dir_base; 762 int ret = 0; 763 764 pasid = VTD_CE_GET_RID2PASID(ce); 765 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 766 ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe); 767 768 return ret; 769 } 770 771 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, 772 VTDContextEntry *ce, 773 bool *pe_fpd_set) 774 { 775 int ret; 776 uint32_t pasid; 777 dma_addr_t pasid_dir_base; 778 VTDPASIDDirEntry pdire; 779 VTDPASIDEntry pe; 780 781 pasid = VTD_CE_GET_RID2PASID(ce); 782 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 783 784 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 785 if (ret) { 786 return ret; 787 } 788 789 if (pdire.val & VTD_PASID_DIR_FPD) { 790 *pe_fpd_set = true; 791 return 0; 792 } 793 794 ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe); 795 if (ret) { 796 return ret; 797 } 798 799 if (pe.val[0] & VTD_PASID_ENTRY_FPD) { 800 *pe_fpd_set = true; 801 } 802 803 return 0; 804 } 805 806 /* Get the page-table level that hardware should use for the second-level 807 * page-table walk from the Address Width field of context-entry. 808 */ 809 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) 810 { 811 return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); 812 } 813 814 static uint32_t vtd_get_iova_level(IntelIOMMUState *s, 815 VTDContextEntry *ce) 816 { 817 VTDPASIDEntry pe; 818 819 if (s->root_scalable) { 820 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 821 return VTD_PE_GET_LEVEL(&pe); 822 } 823 824 return vtd_ce_get_level(ce); 825 } 826 827 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) 828 { 829 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; 830 } 831 832 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, 833 VTDContextEntry *ce) 834 { 835 VTDPASIDEntry pe; 836 837 if (s->root_scalable) { 838 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 839 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; 840 } 841 842 return vtd_ce_get_agaw(ce); 843 } 844 845 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) 846 { 847 return ce->lo & VTD_CONTEXT_ENTRY_TT; 848 } 849 850 /* Only for Legacy Mode. Return true if check passed, otherwise false */ 851 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, 852 VTDContextEntry *ce) 853 { 854 switch (vtd_ce_get_type(ce)) { 855 case VTD_CONTEXT_TT_MULTI_LEVEL: 856 /* Always supported */ 857 break; 858 case VTD_CONTEXT_TT_DEV_IOTLB: 859 if (!x86_iommu->dt_supported) { 860 error_report_once("%s: DT specified but not supported", __func__); 861 return false; 862 } 863 break; 864 case VTD_CONTEXT_TT_PASS_THROUGH: 865 if (!x86_iommu->pt_supported) { 866 error_report_once("%s: PT specified but not supported", __func__); 867 return false; 868 } 869 break; 870 default: 871 /* Unknown type */ 872 error_report_once("%s: unknown ce type: %"PRIu32, __func__, 873 vtd_ce_get_type(ce)); 874 return false; 875 } 876 return true; 877 } 878 879 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, 880 VTDContextEntry *ce, uint8_t aw) 881 { 882 uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); 883 return 1ULL << MIN(ce_agaw, aw); 884 } 885 886 /* Return true if IOVA passes range check, otherwise false. */ 887 static inline bool vtd_iova_range_check(IntelIOMMUState *s, 888 uint64_t iova, VTDContextEntry *ce, 889 uint8_t aw) 890 { 891 /* 892 * Check if @iova is above 2^X-1, where X is the minimum of MGAW 893 * in CAP_REG and AW in context-entry. 894 */ 895 return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); 896 } 897 898 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, 899 VTDContextEntry *ce) 900 { 901 VTDPASIDEntry pe; 902 903 if (s->root_scalable) { 904 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 905 return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; 906 } 907 908 return vtd_ce_get_slpt_base(ce); 909 } 910 911 /* 912 * Rsvd field masks for spte: 913 * Index [1] to [4] 4k pages 914 * Index [5] to [8] large pages 915 */ 916 static uint64_t vtd_paging_entry_rsvd_field[9]; 917 918 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 919 { 920 if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) { 921 /* Maybe large page */ 922 return slpte & vtd_paging_entry_rsvd_field[level + 4]; 923 } else { 924 return slpte & vtd_paging_entry_rsvd_field[level]; 925 } 926 } 927 928 /* Find the VTD address space associated with a given bus number */ 929 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 930 { 931 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 932 if (!vtd_bus) { 933 /* 934 * Iterate over the registered buses to find the one which 935 * currently hold this bus number, and update the bus_num 936 * lookup table: 937 */ 938 GHashTableIter iter; 939 940 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 941 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 942 if (pci_bus_num(vtd_bus->bus) == bus_num) { 943 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 944 return vtd_bus; 945 } 946 } 947 } 948 return vtd_bus; 949 } 950 951 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 952 * of the translation, can be used for deciding the size of large page. 953 */ 954 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, 955 uint64_t iova, bool is_write, 956 uint64_t *slptep, uint32_t *slpte_level, 957 bool *reads, bool *writes, uint8_t aw_bits) 958 { 959 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 960 uint32_t level = vtd_get_iova_level(s, ce); 961 uint32_t offset; 962 uint64_t slpte; 963 uint64_t access_right_check; 964 965 if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { 966 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 967 __func__, iova); 968 return -VTD_FR_ADDR_BEYOND_MGAW; 969 } 970 971 /* FIXME: what is the Atomics request here? */ 972 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 973 974 while (true) { 975 offset = vtd_iova_level_offset(iova, level); 976 slpte = vtd_get_slpte(addr, offset); 977 978 if (slpte == (uint64_t)-1) { 979 error_report_once("%s: detected read error on DMAR slpte " 980 "(iova=0x%" PRIx64 ")", __func__, iova); 981 if (level == vtd_get_iova_level(s, ce)) { 982 /* Invalid programming of context-entry */ 983 return -VTD_FR_CONTEXT_ENTRY_INV; 984 } else { 985 return -VTD_FR_PAGING_ENTRY_INV; 986 } 987 } 988 *reads = (*reads) && (slpte & VTD_SL_R); 989 *writes = (*writes) && (slpte & VTD_SL_W); 990 if (!(slpte & access_right_check)) { 991 error_report_once("%s: detected slpte permission error " 992 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 993 "slpte=0x%" PRIx64 ", write=%d)", __func__, 994 iova, level, slpte, is_write); 995 return is_write ? -VTD_FR_WRITE : -VTD_FR_READ; 996 } 997 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 998 error_report_once("%s: detected splte reserve non-zero " 999 "iova=0x%" PRIx64 ", level=0x%" PRIx32 1000 "slpte=0x%" PRIx64 ")", __func__, iova, 1001 level, slpte); 1002 return -VTD_FR_PAGING_ENTRY_RSVD; 1003 } 1004 1005 if (vtd_is_last_slpte(slpte, level)) { 1006 *slptep = slpte; 1007 *slpte_level = level; 1008 return 0; 1009 } 1010 addr = vtd_get_slpte_addr(slpte, aw_bits); 1011 level--; 1012 } 1013 } 1014 1015 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); 1016 1017 /** 1018 * Constant information used during page walking 1019 * 1020 * @hook_fn: hook func to be called when detected page 1021 * @private: private data to be passed into hook func 1022 * @notify_unmap: whether we should notify invalid entries 1023 * @as: VT-d address space of the device 1024 * @aw: maximum address width 1025 * @domain: domain ID of the page walk 1026 */ 1027 typedef struct { 1028 VTDAddressSpace *as; 1029 vtd_page_walk_hook hook_fn; 1030 void *private; 1031 bool notify_unmap; 1032 uint8_t aw; 1033 uint16_t domain_id; 1034 } vtd_page_walk_info; 1035 1036 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) 1037 { 1038 VTDAddressSpace *as = info->as; 1039 vtd_page_walk_hook hook_fn = info->hook_fn; 1040 void *private = info->private; 1041 DMAMap target = { 1042 .iova = entry->iova, 1043 .size = entry->addr_mask, 1044 .translated_addr = entry->translated_addr, 1045 .perm = entry->perm, 1046 }; 1047 DMAMap *mapped = iova_tree_find(as->iova_tree, &target); 1048 1049 if (entry->perm == IOMMU_NONE && !info->notify_unmap) { 1050 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1051 return 0; 1052 } 1053 1054 assert(hook_fn); 1055 1056 /* Update local IOVA mapped ranges */ 1057 if (entry->perm) { 1058 if (mapped) { 1059 /* If it's exactly the same translation, skip */ 1060 if (!memcmp(mapped, &target, sizeof(target))) { 1061 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, 1062 entry->translated_addr); 1063 return 0; 1064 } else { 1065 /* 1066 * Translation changed. Normally this should not 1067 * happen, but it can happen when with buggy guest 1068 * OSes. Note that there will be a small window that 1069 * we don't have map at all. But that's the best 1070 * effort we can do. The ideal way to emulate this is 1071 * atomically modify the PTE to follow what has 1072 * changed, but we can't. One example is that vfio 1073 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no 1074 * interface to modify a mapping (meanwhile it seems 1075 * meaningless to even provide one). Anyway, let's 1076 * mark this as a TODO in case one day we'll have 1077 * a better solution. 1078 */ 1079 IOMMUAccessFlags cache_perm = entry->perm; 1080 int ret; 1081 1082 /* Emulate an UNMAP */ 1083 entry->perm = IOMMU_NONE; 1084 trace_vtd_page_walk_one(info->domain_id, 1085 entry->iova, 1086 entry->translated_addr, 1087 entry->addr_mask, 1088 entry->perm); 1089 ret = hook_fn(entry, private); 1090 if (ret) { 1091 return ret; 1092 } 1093 /* Drop any existing mapping */ 1094 iova_tree_remove(as->iova_tree, &target); 1095 /* Recover the correct permission */ 1096 entry->perm = cache_perm; 1097 } 1098 } 1099 iova_tree_insert(as->iova_tree, &target); 1100 } else { 1101 if (!mapped) { 1102 /* Skip since we didn't map this range at all */ 1103 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1104 return 0; 1105 } 1106 iova_tree_remove(as->iova_tree, &target); 1107 } 1108 1109 trace_vtd_page_walk_one(info->domain_id, entry->iova, 1110 entry->translated_addr, entry->addr_mask, 1111 entry->perm); 1112 return hook_fn(entry, private); 1113 } 1114 1115 /** 1116 * vtd_page_walk_level - walk over specific level for IOVA range 1117 * 1118 * @addr: base GPA addr to start the walk 1119 * @start: IOVA range start address 1120 * @end: IOVA range end address (start <= addr < end) 1121 * @read: whether parent level has read permission 1122 * @write: whether parent level has write permission 1123 * @info: constant information for the page walk 1124 */ 1125 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 1126 uint64_t end, uint32_t level, bool read, 1127 bool write, vtd_page_walk_info *info) 1128 { 1129 bool read_cur, write_cur, entry_valid; 1130 uint32_t offset; 1131 uint64_t slpte; 1132 uint64_t subpage_size, subpage_mask; 1133 IOMMUTLBEntry entry; 1134 uint64_t iova = start; 1135 uint64_t iova_next; 1136 int ret = 0; 1137 1138 trace_vtd_page_walk_level(addr, level, start, end); 1139 1140 subpage_size = 1ULL << vtd_slpt_level_shift(level); 1141 subpage_mask = vtd_slpt_level_page_mask(level); 1142 1143 while (iova < end) { 1144 iova_next = (iova & subpage_mask) + subpage_size; 1145 1146 offset = vtd_iova_level_offset(iova, level); 1147 slpte = vtd_get_slpte(addr, offset); 1148 1149 if (slpte == (uint64_t)-1) { 1150 trace_vtd_page_walk_skip_read(iova, iova_next); 1151 goto next; 1152 } 1153 1154 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1155 trace_vtd_page_walk_skip_reserve(iova, iova_next); 1156 goto next; 1157 } 1158 1159 /* Permissions are stacked with parents' */ 1160 read_cur = read && (slpte & VTD_SL_R); 1161 write_cur = write && (slpte & VTD_SL_W); 1162 1163 /* 1164 * As long as we have either read/write permission, this is a 1165 * valid entry. The rule works for both page entries and page 1166 * table entries. 1167 */ 1168 entry_valid = read_cur | write_cur; 1169 1170 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 1171 /* 1172 * This is a valid PDE (or even bigger than PDE). We need 1173 * to walk one further level. 1174 */ 1175 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 1176 iova, MIN(iova_next, end), level - 1, 1177 read_cur, write_cur, info); 1178 } else { 1179 /* 1180 * This means we are either: 1181 * 1182 * (1) the real page entry (either 4K page, or huge page) 1183 * (2) the whole range is invalid 1184 * 1185 * In either case, we send an IOTLB notification down. 1186 */ 1187 entry.target_as = &address_space_memory; 1188 entry.iova = iova & subpage_mask; 1189 entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 1190 entry.addr_mask = ~subpage_mask; 1191 /* NOTE: this is only meaningful if entry_valid == true */ 1192 entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 1193 ret = vtd_page_walk_one(&entry, info); 1194 } 1195 1196 if (ret < 0) { 1197 return ret; 1198 } 1199 1200 next: 1201 iova = iova_next; 1202 } 1203 1204 return 0; 1205 } 1206 1207 /** 1208 * vtd_page_walk - walk specific IOVA range, and call the hook 1209 * 1210 * @s: intel iommu state 1211 * @ce: context entry to walk upon 1212 * @start: IOVA address to start the walk 1213 * @end: IOVA range end address (start <= addr < end) 1214 * @info: page walking information struct 1215 */ 1216 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, 1217 uint64_t start, uint64_t end, 1218 vtd_page_walk_info *info) 1219 { 1220 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1221 uint32_t level = vtd_get_iova_level(s, ce); 1222 1223 if (!vtd_iova_range_check(s, start, ce, info->aw)) { 1224 return -VTD_FR_ADDR_BEYOND_MGAW; 1225 } 1226 1227 if (!vtd_iova_range_check(s, end, ce, info->aw)) { 1228 /* Fix end so that it reaches the maximum */ 1229 end = vtd_iova_limit(s, ce, info->aw); 1230 } 1231 1232 return vtd_page_walk_level(addr, start, end, level, true, true, info); 1233 } 1234 1235 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, 1236 VTDRootEntry *re) 1237 { 1238 /* Legacy Mode reserved bits check */ 1239 if (!s->root_scalable && 1240 (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1241 goto rsvd_err; 1242 1243 /* Scalable Mode reserved bits check */ 1244 if (s->root_scalable && 1245 ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || 1246 (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1247 goto rsvd_err; 1248 1249 return 0; 1250 1251 rsvd_err: 1252 error_report_once("%s: invalid root entry: hi=0x%"PRIx64 1253 ", lo=0x%"PRIx64, 1254 __func__, re->hi, re->lo); 1255 return -VTD_FR_ROOT_ENTRY_RSVD; 1256 } 1257 1258 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, 1259 VTDContextEntry *ce) 1260 { 1261 if (!s->root_scalable && 1262 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || 1263 ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1264 error_report_once("%s: invalid context entry: hi=%"PRIx64 1265 ", lo=%"PRIx64" (reserved nonzero)", 1266 __func__, ce->hi, ce->lo); 1267 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1268 } 1269 1270 if (s->root_scalable && 1271 (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || 1272 ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || 1273 ce->val[2] || 1274 ce->val[3])) { 1275 error_report_once("%s: invalid context entry: val[3]=%"PRIx64 1276 ", val[2]=%"PRIx64 1277 ", val[1]=%"PRIx64 1278 ", val[0]=%"PRIx64" (reserved nonzero)", 1279 __func__, ce->val[3], ce->val[2], 1280 ce->val[1], ce->val[0]); 1281 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1282 } 1283 1284 return 0; 1285 } 1286 1287 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, 1288 VTDContextEntry *ce) 1289 { 1290 VTDPASIDEntry pe; 1291 1292 /* 1293 * Make sure in Scalable Mode, a present context entry 1294 * has valid rid2pasid setting, which includes valid 1295 * rid2pasid field and corresponding pasid entry setting 1296 */ 1297 return vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1298 } 1299 1300 /* Map a device to its corresponding domain (context-entry) */ 1301 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 1302 uint8_t devfn, VTDContextEntry *ce) 1303 { 1304 VTDRootEntry re; 1305 int ret_fr; 1306 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 1307 1308 ret_fr = vtd_get_root_entry(s, bus_num, &re); 1309 if (ret_fr) { 1310 return ret_fr; 1311 } 1312 1313 if (!vtd_root_entry_present(s, &re, devfn)) { 1314 /* Not error - it's okay we don't have root entry. */ 1315 trace_vtd_re_not_present(bus_num); 1316 return -VTD_FR_ROOT_ENTRY_P; 1317 } 1318 1319 ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); 1320 if (ret_fr) { 1321 return ret_fr; 1322 } 1323 1324 ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); 1325 if (ret_fr) { 1326 return ret_fr; 1327 } 1328 1329 if (!vtd_ce_present(ce)) { 1330 /* Not error - it's okay we don't have context entry. */ 1331 trace_vtd_ce_not_present(bus_num, devfn); 1332 return -VTD_FR_CONTEXT_ENTRY_P; 1333 } 1334 1335 ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); 1336 if (ret_fr) { 1337 return ret_fr; 1338 } 1339 1340 /* Check if the programming of context-entry is valid */ 1341 if (!s->root_scalable && 1342 !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1343 error_report_once("%s: invalid context entry: hi=%"PRIx64 1344 ", lo=%"PRIx64" (level %d not supported)", 1345 __func__, ce->hi, ce->lo, 1346 vtd_ce_get_level(ce)); 1347 return -VTD_FR_CONTEXT_ENTRY_INV; 1348 } 1349 1350 if (!s->root_scalable) { 1351 /* Do translation type check */ 1352 if (!vtd_ce_type_check(x86_iommu, ce)) { 1353 /* Errors dumped in vtd_ce_type_check() */ 1354 return -VTD_FR_CONTEXT_ENTRY_INV; 1355 } 1356 } else { 1357 /* 1358 * Check if the programming of context-entry.rid2pasid 1359 * and corresponding pasid setting is valid, and thus 1360 * avoids to check pasid entry fetching result in future 1361 * helper function calling. 1362 */ 1363 ret_fr = vtd_ce_rid2pasid_check(s, ce); 1364 if (ret_fr) { 1365 return ret_fr; 1366 } 1367 } 1368 1369 return 0; 1370 } 1371 1372 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, 1373 void *private) 1374 { 1375 memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); 1376 return 0; 1377 } 1378 1379 static uint16_t vtd_get_domain_id(IntelIOMMUState *s, 1380 VTDContextEntry *ce) 1381 { 1382 VTDPASIDEntry pe; 1383 1384 if (s->root_scalable) { 1385 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1386 return VTD_SM_PASID_ENTRY_DID(pe.val[1]); 1387 } 1388 1389 return VTD_CONTEXT_ENTRY_DID(ce->hi); 1390 } 1391 1392 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1393 VTDContextEntry *ce, 1394 hwaddr addr, hwaddr size) 1395 { 1396 IntelIOMMUState *s = vtd_as->iommu_state; 1397 vtd_page_walk_info info = { 1398 .hook_fn = vtd_sync_shadow_page_hook, 1399 .private = (void *)&vtd_as->iommu, 1400 .notify_unmap = true, 1401 .aw = s->aw_bits, 1402 .as = vtd_as, 1403 .domain_id = vtd_get_domain_id(s, ce), 1404 }; 1405 1406 return vtd_page_walk(s, ce, addr, addr + size, &info); 1407 } 1408 1409 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1410 { 1411 int ret; 1412 VTDContextEntry ce; 1413 IOMMUNotifier *n; 1414 1415 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1416 pci_bus_num(vtd_as->bus), 1417 vtd_as->devfn, &ce); 1418 if (ret) { 1419 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1420 /* 1421 * It's a valid scenario to have a context entry that is 1422 * not present. For example, when a device is removed 1423 * from an existing domain then the context entry will be 1424 * zeroed by the guest before it was put into another 1425 * domain. When this happens, instead of synchronizing 1426 * the shadow pages we should invalidate all existing 1427 * mappings and notify the backends. 1428 */ 1429 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 1430 vtd_address_space_unmap(vtd_as, n); 1431 } 1432 ret = 0; 1433 } 1434 return ret; 1435 } 1436 1437 return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); 1438 } 1439 1440 /* 1441 * Check if specific device is configed to bypass address 1442 * translation for DMA requests. In Scalable Mode, bypass 1443 * 1st-level translation or 2nd-level translation, it depends 1444 * on PGTT setting. 1445 */ 1446 static bool vtd_dev_pt_enabled(VTDAddressSpace *as) 1447 { 1448 IntelIOMMUState *s; 1449 VTDContextEntry ce; 1450 VTDPASIDEntry pe; 1451 int ret; 1452 1453 assert(as); 1454 1455 s = as->iommu_state; 1456 ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), 1457 as->devfn, &ce); 1458 if (ret) { 1459 /* 1460 * Possibly failed to parse the context entry for some reason 1461 * (e.g., during init, or any guest configuration errors on 1462 * context entries). We should assume PT not enabled for 1463 * safety. 1464 */ 1465 return false; 1466 } 1467 1468 if (s->root_scalable) { 1469 ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); 1470 if (ret) { 1471 error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, 1472 __func__, ret); 1473 return false; 1474 } 1475 return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); 1476 } 1477 1478 return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); 1479 } 1480 1481 /* Return whether the device is using IOMMU translation. */ 1482 static bool vtd_switch_address_space(VTDAddressSpace *as) 1483 { 1484 bool use_iommu; 1485 /* Whether we need to take the BQL on our own */ 1486 bool take_bql = !qemu_mutex_iothread_locked(); 1487 1488 assert(as); 1489 1490 use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); 1491 1492 trace_vtd_switch_address_space(pci_bus_num(as->bus), 1493 VTD_PCI_SLOT(as->devfn), 1494 VTD_PCI_FUNC(as->devfn), 1495 use_iommu); 1496 1497 /* 1498 * It's possible that we reach here without BQL, e.g., when called 1499 * from vtd_pt_enable_fast_path(). However the memory APIs need 1500 * it. We'd better make sure we have had it already, or, take it. 1501 */ 1502 if (take_bql) { 1503 qemu_mutex_lock_iothread(); 1504 } 1505 1506 /* Turn off first then on the other */ 1507 if (use_iommu) { 1508 memory_region_set_enabled(&as->nodmar, false); 1509 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1510 } else { 1511 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1512 memory_region_set_enabled(&as->nodmar, true); 1513 } 1514 1515 if (take_bql) { 1516 qemu_mutex_unlock_iothread(); 1517 } 1518 1519 return use_iommu; 1520 } 1521 1522 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1523 { 1524 GHashTableIter iter; 1525 VTDBus *vtd_bus; 1526 int i; 1527 1528 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1529 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1530 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1531 if (!vtd_bus->dev_as[i]) { 1532 continue; 1533 } 1534 vtd_switch_address_space(vtd_bus->dev_as[i]); 1535 } 1536 } 1537 } 1538 1539 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1540 { 1541 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1542 } 1543 1544 static const bool vtd_qualified_faults[] = { 1545 [VTD_FR_RESERVED] = false, 1546 [VTD_FR_ROOT_ENTRY_P] = false, 1547 [VTD_FR_CONTEXT_ENTRY_P] = true, 1548 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1549 [VTD_FR_ADDR_BEYOND_MGAW] = true, 1550 [VTD_FR_WRITE] = true, 1551 [VTD_FR_READ] = true, 1552 [VTD_FR_PAGING_ENTRY_INV] = true, 1553 [VTD_FR_ROOT_TABLE_INV] = false, 1554 [VTD_FR_CONTEXT_TABLE_INV] = false, 1555 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1556 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1557 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1558 [VTD_FR_PASID_TABLE_INV] = false, 1559 [VTD_FR_RESERVED_ERR] = false, 1560 [VTD_FR_MAX] = false, 1561 }; 1562 1563 /* To see if a fault condition is "qualified", which is reported to software 1564 * only if the FPD field in the context-entry used to process the faulting 1565 * request is 0. 1566 */ 1567 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1568 { 1569 return vtd_qualified_faults[fault]; 1570 } 1571 1572 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1573 { 1574 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1575 } 1576 1577 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1578 { 1579 VTDBus *vtd_bus; 1580 VTDAddressSpace *vtd_as; 1581 bool success = false; 1582 1583 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1584 if (!vtd_bus) { 1585 goto out; 1586 } 1587 1588 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1589 if (!vtd_as) { 1590 goto out; 1591 } 1592 1593 if (vtd_switch_address_space(vtd_as) == false) { 1594 /* We switched off IOMMU region successfully. */ 1595 success = true; 1596 } 1597 1598 out: 1599 trace_vtd_pt_enable_fast_path(source_id, success); 1600 } 1601 1602 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1603 * translation. 1604 * 1605 * Called from RCU critical section. 1606 * 1607 * @bus_num: The bus number 1608 * @devfn: The devfn, which is the combined of device and function number 1609 * @is_write: The access is a write operation 1610 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1611 * 1612 * Returns true if translation is successful, otherwise false. 1613 */ 1614 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, 1615 uint8_t devfn, hwaddr addr, bool is_write, 1616 IOMMUTLBEntry *entry) 1617 { 1618 IntelIOMMUState *s = vtd_as->iommu_state; 1619 VTDContextEntry ce; 1620 uint8_t bus_num = pci_bus_num(bus); 1621 VTDContextCacheEntry *cc_entry; 1622 uint64_t slpte, page_mask; 1623 uint32_t level; 1624 uint16_t source_id = vtd_make_source_id(bus_num, devfn); 1625 int ret_fr; 1626 bool is_fpd_set = false; 1627 bool reads = true; 1628 bool writes = true; 1629 uint8_t access_flags; 1630 VTDIOTLBEntry *iotlb_entry; 1631 1632 /* 1633 * We have standalone memory region for interrupt addresses, we 1634 * should never receive translation requests in this region. 1635 */ 1636 assert(!vtd_is_interrupt_addr(addr)); 1637 1638 vtd_iommu_lock(s); 1639 1640 cc_entry = &vtd_as->context_cache_entry; 1641 1642 /* Try to fetch slpte form IOTLB */ 1643 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); 1644 if (iotlb_entry) { 1645 trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, 1646 iotlb_entry->domain_id); 1647 slpte = iotlb_entry->slpte; 1648 access_flags = iotlb_entry->access_flags; 1649 page_mask = iotlb_entry->mask; 1650 goto out; 1651 } 1652 1653 /* Try to fetch context-entry from cache first */ 1654 if (cc_entry->context_cache_gen == s->context_cache_gen) { 1655 trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, 1656 cc_entry->context_entry.lo, 1657 cc_entry->context_cache_gen); 1658 ce = cc_entry->context_entry; 1659 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1660 if (!is_fpd_set && s->root_scalable) { 1661 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1662 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1663 } 1664 } else { 1665 ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); 1666 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1667 if (!ret_fr && !is_fpd_set && s->root_scalable) { 1668 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1669 } 1670 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1671 /* Update context-cache */ 1672 trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, 1673 cc_entry->context_cache_gen, 1674 s->context_cache_gen); 1675 cc_entry->context_entry = ce; 1676 cc_entry->context_cache_gen = s->context_cache_gen; 1677 } 1678 1679 /* 1680 * We don't need to translate for pass-through context entries. 1681 * Also, let's ignore IOTLB caching as well for PT devices. 1682 */ 1683 if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { 1684 entry->iova = addr & VTD_PAGE_MASK_4K; 1685 entry->translated_addr = entry->iova; 1686 entry->addr_mask = ~VTD_PAGE_MASK_4K; 1687 entry->perm = IOMMU_RW; 1688 trace_vtd_translate_pt(source_id, entry->iova); 1689 1690 /* 1691 * When this happens, it means firstly caching-mode is not 1692 * enabled, and this is the first passthrough translation for 1693 * the device. Let's enable the fast path for passthrough. 1694 * 1695 * When passthrough is disabled again for the device, we can 1696 * capture it via the context entry invalidation, then the 1697 * IOMMU region can be swapped back. 1698 */ 1699 vtd_pt_enable_fast_path(s, source_id); 1700 vtd_iommu_unlock(s); 1701 return true; 1702 } 1703 1704 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, 1705 &reads, &writes, s->aw_bits); 1706 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1707 1708 page_mask = vtd_slpt_level_page_mask(level); 1709 access_flags = IOMMU_ACCESS_FLAG(reads, writes); 1710 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, 1711 access_flags, level); 1712 out: 1713 vtd_iommu_unlock(s); 1714 entry->iova = addr & page_mask; 1715 entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; 1716 entry->addr_mask = ~page_mask; 1717 entry->perm = access_flags; 1718 return true; 1719 1720 error: 1721 vtd_iommu_unlock(s); 1722 entry->iova = 0; 1723 entry->translated_addr = 0; 1724 entry->addr_mask = 0; 1725 entry->perm = IOMMU_NONE; 1726 return false; 1727 } 1728 1729 static void vtd_root_table_setup(IntelIOMMUState *s) 1730 { 1731 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 1732 s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); 1733 1734 vtd_update_scalable_state(s); 1735 1736 trace_vtd_reg_dmar_root(s->root, s->root_scalable); 1737 } 1738 1739 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, 1740 uint32_t index, uint32_t mask) 1741 { 1742 x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); 1743 } 1744 1745 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) 1746 { 1747 uint64_t value = 0; 1748 value = vtd_get_quad_raw(s, DMAR_IRTA_REG); 1749 s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); 1750 s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); 1751 s->intr_eime = value & VTD_IRTA_EIME; 1752 1753 /* Notify global invalidation */ 1754 vtd_iec_notify_all(s, true, 0, 0); 1755 1756 trace_vtd_reg_ir_root(s->intr_root, s->intr_size); 1757 } 1758 1759 static void vtd_iommu_replay_all(IntelIOMMUState *s) 1760 { 1761 VTDAddressSpace *vtd_as; 1762 1763 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1764 vtd_sync_shadow_page_table(vtd_as); 1765 } 1766 } 1767 1768 static void vtd_context_global_invalidate(IntelIOMMUState *s) 1769 { 1770 trace_vtd_inv_desc_cc_global(); 1771 /* Protects context cache */ 1772 vtd_iommu_lock(s); 1773 s->context_cache_gen++; 1774 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { 1775 vtd_reset_context_cache_locked(s); 1776 } 1777 vtd_iommu_unlock(s); 1778 vtd_address_space_refresh_all(s); 1779 /* 1780 * From VT-d spec 6.5.2.1, a global context entry invalidation 1781 * should be followed by a IOTLB global invalidation, so we should 1782 * be safe even without this. Hoewever, let's replay the region as 1783 * well to be safer, and go back here when we need finer tunes for 1784 * VT-d emulation codes. 1785 */ 1786 vtd_iommu_replay_all(s); 1787 } 1788 1789 /* Do a context-cache device-selective invalidation. 1790 * @func_mask: FM field after shifting 1791 */ 1792 static void vtd_context_device_invalidate(IntelIOMMUState *s, 1793 uint16_t source_id, 1794 uint16_t func_mask) 1795 { 1796 uint16_t mask; 1797 VTDBus *vtd_bus; 1798 VTDAddressSpace *vtd_as; 1799 uint8_t bus_n, devfn; 1800 uint16_t devfn_it; 1801 1802 trace_vtd_inv_desc_cc_devices(source_id, func_mask); 1803 1804 switch (func_mask & 3) { 1805 case 0: 1806 mask = 0; /* No bits in the SID field masked */ 1807 break; 1808 case 1: 1809 mask = 4; /* Mask bit 2 in the SID field */ 1810 break; 1811 case 2: 1812 mask = 6; /* Mask bit 2:1 in the SID field */ 1813 break; 1814 case 3: 1815 mask = 7; /* Mask bit 2:0 in the SID field */ 1816 break; 1817 } 1818 mask = ~mask; 1819 1820 bus_n = VTD_SID_TO_BUS(source_id); 1821 vtd_bus = vtd_find_as_from_bus_num(s, bus_n); 1822 if (vtd_bus) { 1823 devfn = VTD_SID_TO_DEVFN(source_id); 1824 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 1825 vtd_as = vtd_bus->dev_as[devfn_it]; 1826 if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { 1827 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), 1828 VTD_PCI_FUNC(devfn_it)); 1829 vtd_iommu_lock(s); 1830 vtd_as->context_cache_entry.context_cache_gen = 0; 1831 vtd_iommu_unlock(s); 1832 /* 1833 * Do switch address space when needed, in case if the 1834 * device passthrough bit is switched. 1835 */ 1836 vtd_switch_address_space(vtd_as); 1837 /* 1838 * So a device is moving out of (or moving into) a 1839 * domain, resync the shadow page table. 1840 * This won't bring bad even if we have no such 1841 * notifier registered - the IOMMU notification 1842 * framework will skip MAP notifications if that 1843 * happened. 1844 */ 1845 vtd_sync_shadow_page_table(vtd_as); 1846 } 1847 } 1848 } 1849 } 1850 1851 /* Context-cache invalidation 1852 * Returns the Context Actual Invalidation Granularity. 1853 * @val: the content of the CCMD_REG 1854 */ 1855 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1856 { 1857 uint64_t caig; 1858 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1859 1860 switch (type) { 1861 case VTD_CCMD_DOMAIN_INVL: 1862 /* Fall through */ 1863 case VTD_CCMD_GLOBAL_INVL: 1864 caig = VTD_CCMD_GLOBAL_INVL_A; 1865 vtd_context_global_invalidate(s); 1866 break; 1867 1868 case VTD_CCMD_DEVICE_INVL: 1869 caig = VTD_CCMD_DEVICE_INVL_A; 1870 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1871 break; 1872 1873 default: 1874 error_report_once("%s: invalid context: 0x%" PRIx64, 1875 __func__, val); 1876 caig = 0; 1877 } 1878 return caig; 1879 } 1880 1881 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1882 { 1883 trace_vtd_inv_desc_iotlb_global(); 1884 vtd_reset_iotlb(s); 1885 vtd_iommu_replay_all(s); 1886 } 1887 1888 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1889 { 1890 VTDContextEntry ce; 1891 VTDAddressSpace *vtd_as; 1892 1893 trace_vtd_inv_desc_iotlb_domain(domain_id); 1894 1895 vtd_iommu_lock(s); 1896 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, 1897 &domain_id); 1898 vtd_iommu_unlock(s); 1899 1900 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1901 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1902 vtd_as->devfn, &ce) && 1903 domain_id == vtd_get_domain_id(s, &ce)) { 1904 vtd_sync_shadow_page_table(vtd_as); 1905 } 1906 } 1907 } 1908 1909 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1910 uint16_t domain_id, hwaddr addr, 1911 uint8_t am) 1912 { 1913 VTDAddressSpace *vtd_as; 1914 VTDContextEntry ce; 1915 int ret; 1916 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1917 1918 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1919 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1920 vtd_as->devfn, &ce); 1921 if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { 1922 if (vtd_as_has_map_notifier(vtd_as)) { 1923 /* 1924 * As long as we have MAP notifications registered in 1925 * any of our IOMMU notifiers, we need to sync the 1926 * shadow page table. 1927 */ 1928 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 1929 } else { 1930 /* 1931 * For UNMAP-only notifiers, we don't need to walk the 1932 * page tables. We just deliver the PSI down to 1933 * invalidate caches. 1934 */ 1935 IOMMUTLBEntry entry = { 1936 .target_as = &address_space_memory, 1937 .iova = addr, 1938 .translated_addr = 0, 1939 .addr_mask = size - 1, 1940 .perm = IOMMU_NONE, 1941 }; 1942 memory_region_notify_iommu(&vtd_as->iommu, 0, entry); 1943 } 1944 } 1945 } 1946 } 1947 1948 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 1949 hwaddr addr, uint8_t am) 1950 { 1951 VTDIOTLBPageInvInfo info; 1952 1953 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 1954 1955 assert(am <= VTD_MAMV); 1956 info.domain_id = domain_id; 1957 info.addr = addr; 1958 info.mask = ~((1 << am) - 1); 1959 vtd_iommu_lock(s); 1960 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 1961 vtd_iommu_unlock(s); 1962 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 1963 } 1964 1965 /* Flush IOTLB 1966 * Returns the IOTLB Actual Invalidation Granularity. 1967 * @val: the content of the IOTLB_REG 1968 */ 1969 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 1970 { 1971 uint64_t iaig; 1972 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 1973 uint16_t domain_id; 1974 hwaddr addr; 1975 uint8_t am; 1976 1977 switch (type) { 1978 case VTD_TLB_GLOBAL_FLUSH: 1979 iaig = VTD_TLB_GLOBAL_FLUSH_A; 1980 vtd_iotlb_global_invalidate(s); 1981 break; 1982 1983 case VTD_TLB_DSI_FLUSH: 1984 domain_id = VTD_TLB_DID(val); 1985 iaig = VTD_TLB_DSI_FLUSH_A; 1986 vtd_iotlb_domain_invalidate(s, domain_id); 1987 break; 1988 1989 case VTD_TLB_PSI_FLUSH: 1990 domain_id = VTD_TLB_DID(val); 1991 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 1992 am = VTD_IVA_AM(addr); 1993 addr = VTD_IVA_ADDR(addr); 1994 if (am > VTD_MAMV) { 1995 error_report_once("%s: address mask overflow: 0x%" PRIx64, 1996 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 1997 iaig = 0; 1998 break; 1999 } 2000 iaig = VTD_TLB_PSI_FLUSH_A; 2001 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2002 break; 2003 2004 default: 2005 error_report_once("%s: invalid granularity: 0x%" PRIx64, 2006 __func__, val); 2007 iaig = 0; 2008 } 2009 return iaig; 2010 } 2011 2012 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 2013 2014 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 2015 { 2016 return s->qi_enabled && (s->iq_tail == s->iq_head) && 2017 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 2018 } 2019 2020 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 2021 { 2022 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 2023 2024 trace_vtd_inv_qi_enable(en); 2025 2026 if (en) { 2027 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 2028 /* 2^(x+8) entries */ 2029 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); 2030 s->qi_enabled = true; 2031 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 2032 /* Ok - report back to driver */ 2033 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 2034 2035 if (s->iq_tail != 0) { 2036 /* 2037 * This is a spec violation but Windows guests are known to set up 2038 * Queued Invalidation this way so we allow the write and process 2039 * Invalidation Descriptors right away. 2040 */ 2041 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 2042 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2043 vtd_fetch_inv_desc(s); 2044 } 2045 } 2046 } else { 2047 if (vtd_queued_inv_disable_check(s)) { 2048 /* disable Queued Invalidation */ 2049 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 2050 s->iq_head = 0; 2051 s->qi_enabled = false; 2052 /* Ok - report back to driver */ 2053 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 2054 } else { 2055 error_report_once("%s: detected improper state when disable QI " 2056 "(head=0x%x, tail=0x%x, last_type=%d)", 2057 __func__, 2058 s->iq_head, s->iq_tail, s->iq_last_desc_type); 2059 } 2060 } 2061 } 2062 2063 /* Set Root Table Pointer */ 2064 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 2065 { 2066 vtd_root_table_setup(s); 2067 /* Ok - report back to driver */ 2068 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 2069 vtd_reset_caches(s); 2070 vtd_address_space_refresh_all(s); 2071 } 2072 2073 /* Set Interrupt Remap Table Pointer */ 2074 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 2075 { 2076 vtd_interrupt_remap_table_setup(s); 2077 /* Ok - report back to driver */ 2078 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 2079 } 2080 2081 /* Handle Translation Enable/Disable */ 2082 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 2083 { 2084 if (s->dmar_enabled == en) { 2085 return; 2086 } 2087 2088 trace_vtd_dmar_enable(en); 2089 2090 if (en) { 2091 s->dmar_enabled = true; 2092 /* Ok - report back to driver */ 2093 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 2094 } else { 2095 s->dmar_enabled = false; 2096 2097 /* Clear the index of Fault Recording Register */ 2098 s->next_frcd_reg = 0; 2099 /* Ok - report back to driver */ 2100 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 2101 } 2102 2103 vtd_reset_caches(s); 2104 vtd_address_space_refresh_all(s); 2105 } 2106 2107 /* Handle Interrupt Remap Enable/Disable */ 2108 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 2109 { 2110 trace_vtd_ir_enable(en); 2111 2112 if (en) { 2113 s->intr_enabled = true; 2114 /* Ok - report back to driver */ 2115 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 2116 } else { 2117 s->intr_enabled = false; 2118 /* Ok - report back to driver */ 2119 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 2120 } 2121 } 2122 2123 /* Handle write to Global Command Register */ 2124 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 2125 { 2126 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 2127 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 2128 uint32_t changed = status ^ val; 2129 2130 trace_vtd_reg_write_gcmd(status, val); 2131 if (changed & VTD_GCMD_TE) { 2132 /* Translation enable/disable */ 2133 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 2134 } 2135 if (val & VTD_GCMD_SRTP) { 2136 /* Set/update the root-table pointer */ 2137 vtd_handle_gcmd_srtp(s); 2138 } 2139 if (changed & VTD_GCMD_QIE) { 2140 /* Queued Invalidation Enable */ 2141 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 2142 } 2143 if (val & VTD_GCMD_SIRTP) { 2144 /* Set/update the interrupt remapping root-table pointer */ 2145 vtd_handle_gcmd_sirtp(s); 2146 } 2147 if (changed & VTD_GCMD_IRE) { 2148 /* Interrupt remap enable/disable */ 2149 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 2150 } 2151 } 2152 2153 /* Handle write to Context Command Register */ 2154 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 2155 { 2156 uint64_t ret; 2157 uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); 2158 2159 /* Context-cache invalidation request */ 2160 if (val & VTD_CCMD_ICC) { 2161 if (s->qi_enabled) { 2162 error_report_once("Queued Invalidation enabled, " 2163 "should not use register-based invalidation"); 2164 return; 2165 } 2166 ret = vtd_context_cache_invalidate(s, val); 2167 /* Invalidation completed. Change something to show */ 2168 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 2169 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 2170 ret); 2171 } 2172 } 2173 2174 /* Handle write to IOTLB Invalidation Register */ 2175 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 2176 { 2177 uint64_t ret; 2178 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 2179 2180 /* IOTLB invalidation request */ 2181 if (val & VTD_TLB_IVT) { 2182 if (s->qi_enabled) { 2183 error_report_once("Queued Invalidation enabled, " 2184 "should not use register-based invalidation"); 2185 return; 2186 } 2187 ret = vtd_iotlb_flush(s, val); 2188 /* Invalidation completed. Change something to show */ 2189 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 2190 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 2191 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 2192 } 2193 } 2194 2195 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 2196 static bool vtd_get_inv_desc(IntelIOMMUState *s, 2197 VTDInvDesc *inv_desc) 2198 { 2199 dma_addr_t base_addr = s->iq; 2200 uint32_t offset = s->iq_head; 2201 uint32_t dw = s->iq_dw ? 32 : 16; 2202 dma_addr_t addr = base_addr + offset * dw; 2203 2204 if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { 2205 error_report_once("Read INV DESC failed."); 2206 return false; 2207 } 2208 inv_desc->lo = le64_to_cpu(inv_desc->lo); 2209 inv_desc->hi = le64_to_cpu(inv_desc->hi); 2210 if (dw == 32) { 2211 inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); 2212 inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); 2213 } 2214 return true; 2215 } 2216 2217 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2218 { 2219 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 2220 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 2221 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2222 " (reserved nonzero)", __func__, inv_desc->hi, 2223 inv_desc->lo); 2224 return false; 2225 } 2226 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 2227 /* Status Write */ 2228 uint32_t status_data = (uint32_t)(inv_desc->lo >> 2229 VTD_INV_DESC_WAIT_DATA_SHIFT); 2230 2231 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 2232 2233 /* FIXME: need to be masked with HAW? */ 2234 dma_addr_t status_addr = inv_desc->hi; 2235 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 2236 status_data = cpu_to_le32(status_data); 2237 if (dma_memory_write(&address_space_memory, status_addr, &status_data, 2238 sizeof(status_data))) { 2239 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 2240 return false; 2241 } 2242 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 2243 /* Interrupt flag */ 2244 vtd_generate_completion_event(s); 2245 } else { 2246 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2247 " (unknown type)", __func__, inv_desc->hi, 2248 inv_desc->lo); 2249 return false; 2250 } 2251 return true; 2252 } 2253 2254 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 2255 VTDInvDesc *inv_desc) 2256 { 2257 uint16_t sid, fmask; 2258 2259 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 2260 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2261 " (reserved nonzero)", __func__, inv_desc->hi, 2262 inv_desc->lo); 2263 return false; 2264 } 2265 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 2266 case VTD_INV_DESC_CC_DOMAIN: 2267 trace_vtd_inv_desc_cc_domain( 2268 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 2269 /* Fall through */ 2270 case VTD_INV_DESC_CC_GLOBAL: 2271 vtd_context_global_invalidate(s); 2272 break; 2273 2274 case VTD_INV_DESC_CC_DEVICE: 2275 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 2276 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 2277 vtd_context_device_invalidate(s, sid, fmask); 2278 break; 2279 2280 default: 2281 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2282 " (invalid type)", __func__, inv_desc->hi, 2283 inv_desc->lo); 2284 return false; 2285 } 2286 return true; 2287 } 2288 2289 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2290 { 2291 uint16_t domain_id; 2292 uint8_t am; 2293 hwaddr addr; 2294 2295 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 2296 (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 2297 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2298 ", lo=0x%"PRIx64" (reserved bits unzero)\n", 2299 __func__, inv_desc->hi, inv_desc->lo); 2300 return false; 2301 } 2302 2303 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 2304 case VTD_INV_DESC_IOTLB_GLOBAL: 2305 vtd_iotlb_global_invalidate(s); 2306 break; 2307 2308 case VTD_INV_DESC_IOTLB_DOMAIN: 2309 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2310 vtd_iotlb_domain_invalidate(s, domain_id); 2311 break; 2312 2313 case VTD_INV_DESC_IOTLB_PAGE: 2314 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2315 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 2316 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 2317 if (am > VTD_MAMV) { 2318 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2319 ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n", 2320 __func__, inv_desc->hi, inv_desc->lo, 2321 am, (unsigned)VTD_MAMV); 2322 return false; 2323 } 2324 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2325 break; 2326 2327 default: 2328 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2329 ", lo=0x%"PRIx64" (type mismatch: 0x%llx)\n", 2330 __func__, inv_desc->hi, inv_desc->lo, 2331 inv_desc->lo & VTD_INV_DESC_IOTLB_G); 2332 return false; 2333 } 2334 return true; 2335 } 2336 2337 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 2338 VTDInvDesc *inv_desc) 2339 { 2340 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 2341 inv_desc->iec.index, 2342 inv_desc->iec.index_mask); 2343 2344 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 2345 inv_desc->iec.index, 2346 inv_desc->iec.index_mask); 2347 return true; 2348 } 2349 2350 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 2351 VTDInvDesc *inv_desc) 2352 { 2353 VTDAddressSpace *vtd_dev_as; 2354 IOMMUTLBEntry entry; 2355 struct VTDBus *vtd_bus; 2356 hwaddr addr; 2357 uint64_t sz; 2358 uint16_t sid; 2359 uint8_t devfn; 2360 bool size; 2361 uint8_t bus_num; 2362 2363 addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2364 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2365 devfn = sid & 0xff; 2366 bus_num = sid >> 8; 2367 size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2368 2369 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2370 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2371 error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 2372 ", lo=%"PRIx64" (reserved nonzero)", __func__, 2373 inv_desc->hi, inv_desc->lo); 2374 return false; 2375 } 2376 2377 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2378 if (!vtd_bus) { 2379 goto done; 2380 } 2381 2382 vtd_dev_as = vtd_bus->dev_as[devfn]; 2383 if (!vtd_dev_as) { 2384 goto done; 2385 } 2386 2387 /* According to ATS spec table 2.4: 2388 * S = 0, bits 15:12 = xxxx range size: 4K 2389 * S = 1, bits 15:12 = xxx0 range size: 8K 2390 * S = 1, bits 15:12 = xx01 range size: 16K 2391 * S = 1, bits 15:12 = x011 range size: 32K 2392 * S = 1, bits 15:12 = 0111 range size: 64K 2393 * ... 2394 */ 2395 if (size) { 2396 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2397 addr &= ~(sz - 1); 2398 } else { 2399 sz = VTD_PAGE_SIZE; 2400 } 2401 2402 entry.target_as = &vtd_dev_as->as; 2403 entry.addr_mask = sz - 1; 2404 entry.iova = addr; 2405 entry.perm = IOMMU_NONE; 2406 entry.translated_addr = 0; 2407 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); 2408 2409 done: 2410 return true; 2411 } 2412 2413 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2414 { 2415 VTDInvDesc inv_desc; 2416 uint8_t desc_type; 2417 2418 trace_vtd_inv_qi_head(s->iq_head); 2419 if (!vtd_get_inv_desc(s, &inv_desc)) { 2420 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2421 return false; 2422 } 2423 2424 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2425 /* FIXME: should update at first or at last? */ 2426 s->iq_last_desc_type = desc_type; 2427 2428 switch (desc_type) { 2429 case VTD_INV_DESC_CC: 2430 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2431 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2432 return false; 2433 } 2434 break; 2435 2436 case VTD_INV_DESC_IOTLB: 2437 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2438 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2439 return false; 2440 } 2441 break; 2442 2443 /* 2444 * TODO: the entity of below two cases will be implemented in future series. 2445 * To make guest (which integrates scalable mode support patch set in 2446 * iommu driver) work, just return true is enough so far. 2447 */ 2448 case VTD_INV_DESC_PC: 2449 break; 2450 2451 case VTD_INV_DESC_PIOTLB: 2452 break; 2453 2454 case VTD_INV_DESC_WAIT: 2455 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2456 if (!vtd_process_wait_desc(s, &inv_desc)) { 2457 return false; 2458 } 2459 break; 2460 2461 case VTD_INV_DESC_IEC: 2462 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2463 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2464 return false; 2465 } 2466 break; 2467 2468 case VTD_INV_DESC_DEVICE: 2469 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2470 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2471 return false; 2472 } 2473 break; 2474 2475 default: 2476 error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 2477 " (unknown type)", __func__, inv_desc.hi, 2478 inv_desc.lo); 2479 return false; 2480 } 2481 s->iq_head++; 2482 if (s->iq_head == s->iq_size) { 2483 s->iq_head = 0; 2484 } 2485 return true; 2486 } 2487 2488 /* Try to fetch and process more Invalidation Descriptors */ 2489 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2490 { 2491 trace_vtd_inv_qi_fetch(); 2492 2493 if (s->iq_tail >= s->iq_size) { 2494 /* Detects an invalid Tail pointer */ 2495 error_report_once("%s: detected invalid QI tail " 2496 "(tail=0x%x, size=0x%x)", 2497 __func__, s->iq_tail, s->iq_size); 2498 vtd_handle_inv_queue_error(s); 2499 return; 2500 } 2501 while (s->iq_head != s->iq_tail) { 2502 if (!vtd_process_inv_desc(s)) { 2503 /* Invalidation Queue Errors */ 2504 vtd_handle_inv_queue_error(s); 2505 break; 2506 } 2507 /* Must update the IQH_REG in time */ 2508 vtd_set_quad_raw(s, DMAR_IQH_REG, 2509 (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) & 2510 VTD_IQH_QH_MASK); 2511 } 2512 } 2513 2514 /* Handle write to Invalidation Queue Tail Register */ 2515 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2516 { 2517 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2518 2519 if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { 2520 error_report_once("%s: RSV bit is set: val=0x%"PRIx64, 2521 __func__, val); 2522 return; 2523 } 2524 s->iq_tail = VTD_IQT_QT(s->iq_dw, val); 2525 trace_vtd_inv_qi_tail(s->iq_tail); 2526 2527 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2528 /* Process Invalidation Queue here */ 2529 vtd_fetch_inv_desc(s); 2530 } 2531 } 2532 2533 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2534 { 2535 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2536 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2537 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2538 2539 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2540 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2541 trace_vtd_fsts_clear_ip(); 2542 } 2543 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2544 * Descriptors if there are any when Queued Invalidation is enabled? 2545 */ 2546 } 2547 2548 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2549 { 2550 uint32_t fectl_reg; 2551 /* FIXME: when software clears the IM field, check the IP field. But do we 2552 * need to compare the old value and the new value to conclude that 2553 * software clears the IM field? Or just check if the IM field is zero? 2554 */ 2555 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2556 2557 trace_vtd_reg_write_fectl(fectl_reg); 2558 2559 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2560 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2561 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2562 } 2563 } 2564 2565 static void vtd_handle_ics_write(IntelIOMMUState *s) 2566 { 2567 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2568 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2569 2570 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2571 trace_vtd_reg_ics_clear_ip(); 2572 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2573 } 2574 } 2575 2576 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2577 { 2578 uint32_t iectl_reg; 2579 /* FIXME: when software clears the IM field, check the IP field. But do we 2580 * need to compare the old value and the new value to conclude that 2581 * software clears the IM field? Or just check if the IM field is zero? 2582 */ 2583 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2584 2585 trace_vtd_reg_write_iectl(iectl_reg); 2586 2587 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2588 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2589 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2590 } 2591 } 2592 2593 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2594 { 2595 IntelIOMMUState *s = opaque; 2596 uint64_t val; 2597 2598 trace_vtd_reg_read(addr, size); 2599 2600 if (addr + size > DMAR_REG_SIZE) { 2601 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2602 " size=0x%u", __func__, addr, size); 2603 return (uint64_t)-1; 2604 } 2605 2606 switch (addr) { 2607 /* Root Table Address Register, 64-bit */ 2608 case DMAR_RTADDR_REG: 2609 if (size == 4) { 2610 val = s->root & ((1ULL << 32) - 1); 2611 } else { 2612 val = s->root; 2613 } 2614 break; 2615 2616 case DMAR_RTADDR_REG_HI: 2617 assert(size == 4); 2618 val = s->root >> 32; 2619 break; 2620 2621 /* Invalidation Queue Address Register, 64-bit */ 2622 case DMAR_IQA_REG: 2623 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2624 if (size == 4) { 2625 val = val & ((1ULL << 32) - 1); 2626 } 2627 break; 2628 2629 case DMAR_IQA_REG_HI: 2630 assert(size == 4); 2631 val = s->iq >> 32; 2632 break; 2633 2634 default: 2635 if (size == 4) { 2636 val = vtd_get_long(s, addr); 2637 } else { 2638 val = vtd_get_quad(s, addr); 2639 } 2640 } 2641 2642 return val; 2643 } 2644 2645 static void vtd_mem_write(void *opaque, hwaddr addr, 2646 uint64_t val, unsigned size) 2647 { 2648 IntelIOMMUState *s = opaque; 2649 2650 trace_vtd_reg_write(addr, size, val); 2651 2652 if (addr + size > DMAR_REG_SIZE) { 2653 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2654 " size=0x%u", __func__, addr, size); 2655 return; 2656 } 2657 2658 switch (addr) { 2659 /* Global Command Register, 32-bit */ 2660 case DMAR_GCMD_REG: 2661 vtd_set_long(s, addr, val); 2662 vtd_handle_gcmd_write(s); 2663 break; 2664 2665 /* Context Command Register, 64-bit */ 2666 case DMAR_CCMD_REG: 2667 if (size == 4) { 2668 vtd_set_long(s, addr, val); 2669 } else { 2670 vtd_set_quad(s, addr, val); 2671 vtd_handle_ccmd_write(s); 2672 } 2673 break; 2674 2675 case DMAR_CCMD_REG_HI: 2676 assert(size == 4); 2677 vtd_set_long(s, addr, val); 2678 vtd_handle_ccmd_write(s); 2679 break; 2680 2681 /* IOTLB Invalidation Register, 64-bit */ 2682 case DMAR_IOTLB_REG: 2683 if (size == 4) { 2684 vtd_set_long(s, addr, val); 2685 } else { 2686 vtd_set_quad(s, addr, val); 2687 vtd_handle_iotlb_write(s); 2688 } 2689 break; 2690 2691 case DMAR_IOTLB_REG_HI: 2692 assert(size == 4); 2693 vtd_set_long(s, addr, val); 2694 vtd_handle_iotlb_write(s); 2695 break; 2696 2697 /* Invalidate Address Register, 64-bit */ 2698 case DMAR_IVA_REG: 2699 if (size == 4) { 2700 vtd_set_long(s, addr, val); 2701 } else { 2702 vtd_set_quad(s, addr, val); 2703 } 2704 break; 2705 2706 case DMAR_IVA_REG_HI: 2707 assert(size == 4); 2708 vtd_set_long(s, addr, val); 2709 break; 2710 2711 /* Fault Status Register, 32-bit */ 2712 case DMAR_FSTS_REG: 2713 assert(size == 4); 2714 vtd_set_long(s, addr, val); 2715 vtd_handle_fsts_write(s); 2716 break; 2717 2718 /* Fault Event Control Register, 32-bit */ 2719 case DMAR_FECTL_REG: 2720 assert(size == 4); 2721 vtd_set_long(s, addr, val); 2722 vtd_handle_fectl_write(s); 2723 break; 2724 2725 /* Fault Event Data Register, 32-bit */ 2726 case DMAR_FEDATA_REG: 2727 assert(size == 4); 2728 vtd_set_long(s, addr, val); 2729 break; 2730 2731 /* Fault Event Address Register, 32-bit */ 2732 case DMAR_FEADDR_REG: 2733 if (size == 4) { 2734 vtd_set_long(s, addr, val); 2735 } else { 2736 /* 2737 * While the register is 32-bit only, some guests (Xen...) write to 2738 * it with 64-bit. 2739 */ 2740 vtd_set_quad(s, addr, val); 2741 } 2742 break; 2743 2744 /* Fault Event Upper Address Register, 32-bit */ 2745 case DMAR_FEUADDR_REG: 2746 assert(size == 4); 2747 vtd_set_long(s, addr, val); 2748 break; 2749 2750 /* Protected Memory Enable Register, 32-bit */ 2751 case DMAR_PMEN_REG: 2752 assert(size == 4); 2753 vtd_set_long(s, addr, val); 2754 break; 2755 2756 /* Root Table Address Register, 64-bit */ 2757 case DMAR_RTADDR_REG: 2758 if (size == 4) { 2759 vtd_set_long(s, addr, val); 2760 } else { 2761 vtd_set_quad(s, addr, val); 2762 } 2763 break; 2764 2765 case DMAR_RTADDR_REG_HI: 2766 assert(size == 4); 2767 vtd_set_long(s, addr, val); 2768 break; 2769 2770 /* Invalidation Queue Tail Register, 64-bit */ 2771 case DMAR_IQT_REG: 2772 if (size == 4) { 2773 vtd_set_long(s, addr, val); 2774 } else { 2775 vtd_set_quad(s, addr, val); 2776 } 2777 vtd_handle_iqt_write(s); 2778 break; 2779 2780 case DMAR_IQT_REG_HI: 2781 assert(size == 4); 2782 vtd_set_long(s, addr, val); 2783 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2784 break; 2785 2786 /* Invalidation Queue Address Register, 64-bit */ 2787 case DMAR_IQA_REG: 2788 if (size == 4) { 2789 vtd_set_long(s, addr, val); 2790 } else { 2791 vtd_set_quad(s, addr, val); 2792 } 2793 if (s->ecap & VTD_ECAP_SMTS && 2794 val & VTD_IQA_DW_MASK) { 2795 s->iq_dw = true; 2796 } else { 2797 s->iq_dw = false; 2798 } 2799 break; 2800 2801 case DMAR_IQA_REG_HI: 2802 assert(size == 4); 2803 vtd_set_long(s, addr, val); 2804 break; 2805 2806 /* Invalidation Completion Status Register, 32-bit */ 2807 case DMAR_ICS_REG: 2808 assert(size == 4); 2809 vtd_set_long(s, addr, val); 2810 vtd_handle_ics_write(s); 2811 break; 2812 2813 /* Invalidation Event Control Register, 32-bit */ 2814 case DMAR_IECTL_REG: 2815 assert(size == 4); 2816 vtd_set_long(s, addr, val); 2817 vtd_handle_iectl_write(s); 2818 break; 2819 2820 /* Invalidation Event Data Register, 32-bit */ 2821 case DMAR_IEDATA_REG: 2822 assert(size == 4); 2823 vtd_set_long(s, addr, val); 2824 break; 2825 2826 /* Invalidation Event Address Register, 32-bit */ 2827 case DMAR_IEADDR_REG: 2828 assert(size == 4); 2829 vtd_set_long(s, addr, val); 2830 break; 2831 2832 /* Invalidation Event Upper Address Register, 32-bit */ 2833 case DMAR_IEUADDR_REG: 2834 assert(size == 4); 2835 vtd_set_long(s, addr, val); 2836 break; 2837 2838 /* Fault Recording Registers, 128-bit */ 2839 case DMAR_FRCD_REG_0_0: 2840 if (size == 4) { 2841 vtd_set_long(s, addr, val); 2842 } else { 2843 vtd_set_quad(s, addr, val); 2844 } 2845 break; 2846 2847 case DMAR_FRCD_REG_0_1: 2848 assert(size == 4); 2849 vtd_set_long(s, addr, val); 2850 break; 2851 2852 case DMAR_FRCD_REG_0_2: 2853 if (size == 4) { 2854 vtd_set_long(s, addr, val); 2855 } else { 2856 vtd_set_quad(s, addr, val); 2857 /* May clear bit 127 (Fault), update PPF */ 2858 vtd_update_fsts_ppf(s); 2859 } 2860 break; 2861 2862 case DMAR_FRCD_REG_0_3: 2863 assert(size == 4); 2864 vtd_set_long(s, addr, val); 2865 /* May clear bit 127 (Fault), update PPF */ 2866 vtd_update_fsts_ppf(s); 2867 break; 2868 2869 case DMAR_IRTA_REG: 2870 if (size == 4) { 2871 vtd_set_long(s, addr, val); 2872 } else { 2873 vtd_set_quad(s, addr, val); 2874 } 2875 break; 2876 2877 case DMAR_IRTA_REG_HI: 2878 assert(size == 4); 2879 vtd_set_long(s, addr, val); 2880 break; 2881 2882 default: 2883 if (size == 4) { 2884 vtd_set_long(s, addr, val); 2885 } else { 2886 vtd_set_quad(s, addr, val); 2887 } 2888 } 2889 } 2890 2891 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2892 IOMMUAccessFlags flag, int iommu_idx) 2893 { 2894 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2895 IntelIOMMUState *s = vtd_as->iommu_state; 2896 IOMMUTLBEntry iotlb = { 2897 /* We'll fill in the rest later. */ 2898 .target_as = &address_space_memory, 2899 }; 2900 bool success; 2901 2902 if (likely(s->dmar_enabled)) { 2903 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2904 addr, flag & IOMMU_WO, &iotlb); 2905 } else { 2906 /* DMAR disabled, passthrough, use 4k-page*/ 2907 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2908 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2909 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2910 iotlb.perm = IOMMU_RW; 2911 success = true; 2912 } 2913 2914 if (likely(success)) { 2915 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 2916 VTD_PCI_SLOT(vtd_as->devfn), 2917 VTD_PCI_FUNC(vtd_as->devfn), 2918 iotlb.iova, iotlb.translated_addr, 2919 iotlb.addr_mask); 2920 } else { 2921 error_report_once("%s: detected translation failure " 2922 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 2923 __func__, pci_bus_num(vtd_as->bus), 2924 VTD_PCI_SLOT(vtd_as->devfn), 2925 VTD_PCI_FUNC(vtd_as->devfn), 2926 addr); 2927 } 2928 2929 return iotlb; 2930 } 2931 2932 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 2933 IOMMUNotifierFlag old, 2934 IOMMUNotifierFlag new, 2935 Error **errp) 2936 { 2937 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2938 IntelIOMMUState *s = vtd_as->iommu_state; 2939 2940 /* Update per-address-space notifier flags */ 2941 vtd_as->notifier_flags = new; 2942 2943 if (old == IOMMU_NOTIFIER_NONE) { 2944 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 2945 } else if (new == IOMMU_NOTIFIER_NONE) { 2946 QLIST_REMOVE(vtd_as, next); 2947 } 2948 return 0; 2949 } 2950 2951 static int vtd_post_load(void *opaque, int version_id) 2952 { 2953 IntelIOMMUState *iommu = opaque; 2954 2955 /* 2956 * Memory regions are dynamically turned on/off depending on 2957 * context entry configurations from the guest. After migration, 2958 * we need to make sure the memory regions are still correct. 2959 */ 2960 vtd_switch_address_space_all(iommu); 2961 2962 /* 2963 * We don't need to migrate the root_scalable because we can 2964 * simply do the calculation after the loading is complete. We 2965 * can actually do similar things with root, dmar_enabled, etc. 2966 * however since we've had them already so we'd better keep them 2967 * for compatibility of migration. 2968 */ 2969 vtd_update_scalable_state(iommu); 2970 2971 return 0; 2972 } 2973 2974 static const VMStateDescription vtd_vmstate = { 2975 .name = "iommu-intel", 2976 .version_id = 1, 2977 .minimum_version_id = 1, 2978 .priority = MIG_PRI_IOMMU, 2979 .post_load = vtd_post_load, 2980 .fields = (VMStateField[]) { 2981 VMSTATE_UINT64(root, IntelIOMMUState), 2982 VMSTATE_UINT64(intr_root, IntelIOMMUState), 2983 VMSTATE_UINT64(iq, IntelIOMMUState), 2984 VMSTATE_UINT32(intr_size, IntelIOMMUState), 2985 VMSTATE_UINT16(iq_head, IntelIOMMUState), 2986 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 2987 VMSTATE_UINT16(iq_size, IntelIOMMUState), 2988 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 2989 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 2990 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 2991 VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ 2992 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 2993 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 2994 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 2995 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 2996 VMSTATE_END_OF_LIST() 2997 } 2998 }; 2999 3000 static const MemoryRegionOps vtd_mem_ops = { 3001 .read = vtd_mem_read, 3002 .write = vtd_mem_write, 3003 .endianness = DEVICE_LITTLE_ENDIAN, 3004 .impl = { 3005 .min_access_size = 4, 3006 .max_access_size = 8, 3007 }, 3008 .valid = { 3009 .min_access_size = 4, 3010 .max_access_size = 8, 3011 }, 3012 }; 3013 3014 static Property vtd_properties[] = { 3015 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 3016 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 3017 ON_OFF_AUTO_AUTO), 3018 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 3019 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, 3020 VTD_HOST_ADDRESS_WIDTH), 3021 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 3022 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), 3023 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), 3024 DEFINE_PROP_END_OF_LIST(), 3025 }; 3026 3027 /* Read IRTE entry with specific index */ 3028 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 3029 VTD_IR_TableEntry *entry, uint16_t sid) 3030 { 3031 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 3032 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 3033 dma_addr_t addr = 0x00; 3034 uint16_t mask, source_id; 3035 uint8_t bus, bus_max, bus_min; 3036 3037 addr = iommu->intr_root + index * sizeof(*entry); 3038 if (dma_memory_read(&address_space_memory, addr, entry, 3039 sizeof(*entry))) { 3040 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 3041 __func__, index, addr); 3042 return -VTD_FR_IR_ROOT_INVAL; 3043 } 3044 3045 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 3046 le64_to_cpu(entry->data[0])); 3047 3048 if (!entry->irte.present) { 3049 error_report_once("%s: detected non-present IRTE " 3050 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3051 __func__, index, le64_to_cpu(entry->data[1]), 3052 le64_to_cpu(entry->data[0])); 3053 return -VTD_FR_IR_ENTRY_P; 3054 } 3055 3056 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 3057 entry->irte.__reserved_2) { 3058 error_report_once("%s: detected non-zero reserved IRTE " 3059 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3060 __func__, index, le64_to_cpu(entry->data[1]), 3061 le64_to_cpu(entry->data[0])); 3062 return -VTD_FR_IR_IRTE_RSVD; 3063 } 3064 3065 if (sid != X86_IOMMU_SID_INVALID) { 3066 /* Validate IRTE SID */ 3067 source_id = le32_to_cpu(entry->irte.source_id); 3068 switch (entry->irte.sid_vtype) { 3069 case VTD_SVT_NONE: 3070 break; 3071 3072 case VTD_SVT_ALL: 3073 mask = vtd_svt_mask[entry->irte.sid_q]; 3074 if ((source_id & mask) != (sid & mask)) { 3075 error_report_once("%s: invalid IRTE SID " 3076 "(index=%u, sid=%u, source_id=%u)", 3077 __func__, index, sid, source_id); 3078 return -VTD_FR_IR_SID_ERR; 3079 } 3080 break; 3081 3082 case VTD_SVT_BUS: 3083 bus_max = source_id >> 8; 3084 bus_min = source_id & 0xff; 3085 bus = sid >> 8; 3086 if (bus > bus_max || bus < bus_min) { 3087 error_report_once("%s: invalid SVT_BUS " 3088 "(index=%u, bus=%u, min=%u, max=%u)", 3089 __func__, index, bus, bus_min, bus_max); 3090 return -VTD_FR_IR_SID_ERR; 3091 } 3092 break; 3093 3094 default: 3095 error_report_once("%s: detected invalid IRTE SVT " 3096 "(index=%u, type=%d)", __func__, 3097 index, entry->irte.sid_vtype); 3098 /* Take this as verification failure. */ 3099 return -VTD_FR_IR_SID_ERR; 3100 break; 3101 } 3102 } 3103 3104 return 0; 3105 } 3106 3107 /* Fetch IRQ information of specific IR index */ 3108 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 3109 X86IOMMUIrq *irq, uint16_t sid) 3110 { 3111 VTD_IR_TableEntry irte = {}; 3112 int ret = 0; 3113 3114 ret = vtd_irte_get(iommu, index, &irte, sid); 3115 if (ret) { 3116 return ret; 3117 } 3118 3119 irq->trigger_mode = irte.irte.trigger_mode; 3120 irq->vector = irte.irte.vector; 3121 irq->delivery_mode = irte.irte.delivery_mode; 3122 irq->dest = le32_to_cpu(irte.irte.dest_id); 3123 if (!iommu->intr_eime) { 3124 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 3125 #define VTD_IR_APIC_DEST_SHIFT (8) 3126 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 3127 VTD_IR_APIC_DEST_SHIFT; 3128 } 3129 irq->dest_mode = irte.irte.dest_mode; 3130 irq->redir_hint = irte.irte.redir_hint; 3131 3132 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 3133 irq->delivery_mode, irq->dest, irq->dest_mode); 3134 3135 return 0; 3136 } 3137 3138 /* Interrupt remapping for MSI/MSI-X entry */ 3139 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 3140 MSIMessage *origin, 3141 MSIMessage *translated, 3142 uint16_t sid) 3143 { 3144 int ret = 0; 3145 VTD_IR_MSIAddress addr; 3146 uint16_t index; 3147 X86IOMMUIrq irq = {}; 3148 3149 assert(origin && translated); 3150 3151 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 3152 3153 if (!iommu || !iommu->intr_enabled) { 3154 memcpy(translated, origin, sizeof(*origin)); 3155 goto out; 3156 } 3157 3158 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 3159 error_report_once("%s: MSI address high 32 bits non-zero detected: " 3160 "address=0x%" PRIx64, __func__, origin->address); 3161 return -VTD_FR_IR_REQ_RSVD; 3162 } 3163 3164 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 3165 if (addr.addr.__head != 0xfee) { 3166 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 3167 __func__, addr.data); 3168 return -VTD_FR_IR_REQ_RSVD; 3169 } 3170 3171 /* This is compatible mode. */ 3172 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 3173 memcpy(translated, origin, sizeof(*origin)); 3174 goto out; 3175 } 3176 3177 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 3178 3179 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 3180 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 3181 3182 if (addr.addr.sub_valid) { 3183 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 3184 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 3185 } 3186 3187 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 3188 if (ret) { 3189 return ret; 3190 } 3191 3192 if (addr.addr.sub_valid) { 3193 trace_vtd_ir_remap_type("MSI"); 3194 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 3195 error_report_once("%s: invalid IR MSI " 3196 "(sid=%u, address=0x%" PRIx64 3197 ", data=0x%" PRIx32 ")", 3198 __func__, sid, origin->address, origin->data); 3199 return -VTD_FR_IR_REQ_RSVD; 3200 } 3201 } else { 3202 uint8_t vector = origin->data & 0xff; 3203 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 3204 3205 trace_vtd_ir_remap_type("IOAPIC"); 3206 /* IOAPIC entry vector should be aligned with IRTE vector 3207 * (see vt-d spec 5.1.5.1). */ 3208 if (vector != irq.vector) { 3209 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 3210 } 3211 3212 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 3213 * (see vt-d spec 5.1.5.1). */ 3214 if (trigger_mode != irq.trigger_mode) { 3215 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 3216 irq.trigger_mode); 3217 } 3218 } 3219 3220 /* 3221 * We'd better keep the last two bits, assuming that guest OS 3222 * might modify it. Keep it does not hurt after all. 3223 */ 3224 irq.msi_addr_last_bits = addr.addr.__not_care; 3225 3226 /* Translate X86IOMMUIrq to MSI message */ 3227 x86_iommu_irq_to_msi_message(&irq, translated); 3228 3229 out: 3230 trace_vtd_ir_remap_msi(origin->address, origin->data, 3231 translated->address, translated->data); 3232 return 0; 3233 } 3234 3235 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 3236 MSIMessage *dst, uint16_t sid) 3237 { 3238 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 3239 src, dst, sid); 3240 } 3241 3242 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 3243 uint64_t *data, unsigned size, 3244 MemTxAttrs attrs) 3245 { 3246 return MEMTX_OK; 3247 } 3248 3249 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 3250 uint64_t value, unsigned size, 3251 MemTxAttrs attrs) 3252 { 3253 int ret = 0; 3254 MSIMessage from = {}, to = {}; 3255 uint16_t sid = X86_IOMMU_SID_INVALID; 3256 3257 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 3258 from.data = (uint32_t) value; 3259 3260 if (!attrs.unspecified) { 3261 /* We have explicit Source ID */ 3262 sid = attrs.requester_id; 3263 } 3264 3265 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 3266 if (ret) { 3267 /* TODO: report error */ 3268 /* Drop this interrupt */ 3269 return MEMTX_ERROR; 3270 } 3271 3272 apic_get_class()->send_msi(&to); 3273 3274 return MEMTX_OK; 3275 } 3276 3277 static const MemoryRegionOps vtd_mem_ir_ops = { 3278 .read_with_attrs = vtd_mem_ir_read, 3279 .write_with_attrs = vtd_mem_ir_write, 3280 .endianness = DEVICE_LITTLE_ENDIAN, 3281 .impl = { 3282 .min_access_size = 4, 3283 .max_access_size = 4, 3284 }, 3285 .valid = { 3286 .min_access_size = 4, 3287 .max_access_size = 4, 3288 }, 3289 }; 3290 3291 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 3292 { 3293 uintptr_t key = (uintptr_t)bus; 3294 VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); 3295 VTDAddressSpace *vtd_dev_as; 3296 char name[128]; 3297 3298 if (!vtd_bus) { 3299 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 3300 *new_key = (uintptr_t)bus; 3301 /* No corresponding free() */ 3302 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 3303 PCI_DEVFN_MAX); 3304 vtd_bus->bus = bus; 3305 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 3306 } 3307 3308 vtd_dev_as = vtd_bus->dev_as[devfn]; 3309 3310 if (!vtd_dev_as) { 3311 snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), 3312 PCI_FUNC(devfn)); 3313 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 3314 3315 vtd_dev_as->bus = bus; 3316 vtd_dev_as->devfn = (uint8_t)devfn; 3317 vtd_dev_as->iommu_state = s; 3318 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 3319 vtd_dev_as->iova_tree = iova_tree_new(); 3320 3321 memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); 3322 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); 3323 3324 /* 3325 * Build the DMAR-disabled container with aliases to the 3326 * shared MRs. Note that aliasing to a shared memory region 3327 * could help the memory API to detect same FlatViews so we 3328 * can have devices to share the same FlatView when DMAR is 3329 * disabled (either by not providing "intel_iommu=on" or with 3330 * "iommu=pt"). It will greatly reduce the total number of 3331 * FlatViews of the system hence VM runs faster. 3332 */ 3333 memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), 3334 "vtd-nodmar", &s->mr_nodmar, 0, 3335 memory_region_size(&s->mr_nodmar)); 3336 3337 /* 3338 * Build the per-device DMAR-enabled container. 3339 * 3340 * TODO: currently we have per-device IOMMU memory region only 3341 * because we have per-device IOMMU notifiers for devices. If 3342 * one day we can abstract the IOMMU notifiers out of the 3343 * memory regions then we can also share the same memory 3344 * region here just like what we've done above with the nodmar 3345 * region. 3346 */ 3347 strcat(name, "-dmar"); 3348 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 3349 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 3350 name, UINT64_MAX); 3351 memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", 3352 &s->mr_ir, 0, memory_region_size(&s->mr_ir)); 3353 memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), 3354 VTD_INTERRUPT_ADDR_FIRST, 3355 &vtd_dev_as->iommu_ir, 1); 3356 3357 /* 3358 * Hook both the containers under the root container, we 3359 * switch between DMAR & noDMAR by enable/disable 3360 * corresponding sub-containers 3361 */ 3362 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3363 MEMORY_REGION(&vtd_dev_as->iommu), 3364 0); 3365 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3366 &vtd_dev_as->nodmar, 0); 3367 3368 vtd_switch_address_space(vtd_dev_as); 3369 } 3370 return vtd_dev_as; 3371 } 3372 3373 static uint64_t get_naturally_aligned_size(uint64_t start, 3374 uint64_t size, int gaw) 3375 { 3376 uint64_t max_mask = 1ULL << gaw; 3377 uint64_t alignment = start ? start & -start : max_mask; 3378 3379 alignment = MIN(alignment, max_mask); 3380 size = MIN(size, max_mask); 3381 3382 if (alignment <= size) { 3383 /* Increase the alignment of start */ 3384 return alignment; 3385 } else { 3386 /* Find the largest page mask from size */ 3387 return 1ULL << (63 - clz64(size)); 3388 } 3389 } 3390 3391 /* Unmap the whole range in the notifier's scope. */ 3392 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 3393 { 3394 hwaddr size, remain; 3395 hwaddr start = n->start; 3396 hwaddr end = n->end; 3397 IntelIOMMUState *s = as->iommu_state; 3398 DMAMap map; 3399 3400 /* 3401 * Note: all the codes in this function has a assumption that IOVA 3402 * bits are no more than VTD_MGAW bits (which is restricted by 3403 * VT-d spec), otherwise we need to consider overflow of 64 bits. 3404 */ 3405 3406 if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { 3407 /* 3408 * Don't need to unmap regions that is bigger than the whole 3409 * VT-d supported address space size 3410 */ 3411 end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; 3412 } 3413 3414 assert(start <= end); 3415 size = remain = end - start + 1; 3416 3417 while (remain >= VTD_PAGE_SIZE) { 3418 IOMMUTLBEntry entry; 3419 uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits); 3420 3421 assert(mask); 3422 3423 entry.iova = start; 3424 entry.addr_mask = mask - 1; 3425 entry.target_as = &address_space_memory; 3426 entry.perm = IOMMU_NONE; 3427 /* This field is meaningless for unmap */ 3428 entry.translated_addr = 0; 3429 3430 memory_region_notify_one(n, &entry); 3431 3432 start += mask; 3433 remain -= mask; 3434 } 3435 3436 assert(!remain); 3437 3438 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3439 VTD_PCI_SLOT(as->devfn), 3440 VTD_PCI_FUNC(as->devfn), 3441 n->start, size); 3442 3443 map.iova = n->start; 3444 map.size = size; 3445 iova_tree_remove(as->iova_tree, &map); 3446 } 3447 3448 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3449 { 3450 VTDAddressSpace *vtd_as; 3451 IOMMUNotifier *n; 3452 3453 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3454 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3455 vtd_address_space_unmap(vtd_as, n); 3456 } 3457 } 3458 } 3459 3460 static void vtd_address_space_refresh_all(IntelIOMMUState *s) 3461 { 3462 vtd_address_space_unmap_all(s); 3463 vtd_switch_address_space_all(s); 3464 } 3465 3466 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) 3467 { 3468 memory_region_notify_one((IOMMUNotifier *)private, entry); 3469 return 0; 3470 } 3471 3472 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3473 { 3474 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3475 IntelIOMMUState *s = vtd_as->iommu_state; 3476 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3477 VTDContextEntry ce; 3478 3479 /* 3480 * The replay can be triggered by either a invalidation or a newly 3481 * created entry. No matter what, we release existing mappings 3482 * (it means flushing caches for UNMAP-only registers). 3483 */ 3484 vtd_address_space_unmap(vtd_as, n); 3485 3486 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3487 trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : 3488 "legacy mode", 3489 bus_n, PCI_SLOT(vtd_as->devfn), 3490 PCI_FUNC(vtd_as->devfn), 3491 vtd_get_domain_id(s, &ce), 3492 ce.hi, ce.lo); 3493 if (vtd_as_has_map_notifier(vtd_as)) { 3494 /* This is required only for MAP typed notifiers */ 3495 vtd_page_walk_info info = { 3496 .hook_fn = vtd_replay_hook, 3497 .private = (void *)n, 3498 .notify_unmap = false, 3499 .aw = s->aw_bits, 3500 .as = vtd_as, 3501 .domain_id = vtd_get_domain_id(s, &ce), 3502 }; 3503 3504 vtd_page_walk(s, &ce, 0, ~0ULL, &info); 3505 } 3506 } else { 3507 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3508 PCI_FUNC(vtd_as->devfn)); 3509 } 3510 3511 return; 3512 } 3513 3514 /* Do the initialization. It will also be called when reset, so pay 3515 * attention when adding new initialization stuff. 3516 */ 3517 static void vtd_init(IntelIOMMUState *s) 3518 { 3519 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3520 3521 memset(s->csr, 0, DMAR_REG_SIZE); 3522 memset(s->wmask, 0, DMAR_REG_SIZE); 3523 memset(s->w1cmask, 0, DMAR_REG_SIZE); 3524 memset(s->womask, 0, DMAR_REG_SIZE); 3525 3526 s->root = 0; 3527 s->root_scalable = false; 3528 s->dmar_enabled = false; 3529 s->intr_enabled = false; 3530 s->iq_head = 0; 3531 s->iq_tail = 0; 3532 s->iq = 0; 3533 s->iq_size = 0; 3534 s->qi_enabled = false; 3535 s->iq_last_desc_type = VTD_INV_DESC_NONE; 3536 s->iq_dw = false; 3537 s->next_frcd_reg = 0; 3538 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | 3539 VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | 3540 VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); 3541 if (s->dma_drain) { 3542 s->cap |= VTD_CAP_DRAIN; 3543 } 3544 if (s->aw_bits == VTD_HOST_AW_48BIT) { 3545 s->cap |= VTD_CAP_SAGAW_48bit; 3546 } 3547 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; 3548 3549 /* 3550 * Rsvd field masks for spte 3551 */ 3552 vtd_paging_entry_rsvd_field[0] = ~0ULL; 3553 vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits); 3554 vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); 3555 vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); 3556 vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); 3557 vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits); 3558 vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); 3559 vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); 3560 vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits); 3561 3562 if (x86_iommu_ir_supported(x86_iommu)) { 3563 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; 3564 if (s->intr_eim == ON_OFF_AUTO_ON) { 3565 s->ecap |= VTD_ECAP_EIM; 3566 } 3567 assert(s->intr_eim != ON_OFF_AUTO_AUTO); 3568 } 3569 3570 if (x86_iommu->dt_supported) { 3571 s->ecap |= VTD_ECAP_DT; 3572 } 3573 3574 if (x86_iommu->pt_supported) { 3575 s->ecap |= VTD_ECAP_PT; 3576 } 3577 3578 if (s->caching_mode) { 3579 s->cap |= VTD_CAP_CM; 3580 } 3581 3582 /* TODO: read cap/ecap from host to decide which cap to be exposed. */ 3583 if (s->scalable_mode) { 3584 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; 3585 } 3586 3587 vtd_reset_caches(s); 3588 3589 /* Define registers with default values and bit semantics */ 3590 vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); 3591 vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); 3592 vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); 3593 vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); 3594 vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); 3595 vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); 3596 vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0); 3597 vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0); 3598 vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL); 3599 3600 /* Advanced Fault Logging not supported */ 3601 vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL); 3602 vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3603 vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0); 3604 vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0); 3605 3606 /* Treated as RsvdZ when EIM in ECAP_REG is not supported 3607 * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0); 3608 */ 3609 vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0); 3610 3611 /* Treated as RO for implementations that PLMR and PHMR fields reported 3612 * as Clear in the CAP_REG. 3613 * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0); 3614 */ 3615 vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0); 3616 3617 vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0); 3618 vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0); 3619 vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0); 3620 vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL); 3621 vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0); 3622 vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0); 3623 vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0); 3624 /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */ 3625 vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0); 3626 3627 /* IOTLB registers */ 3628 vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0); 3629 vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0); 3630 vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL); 3631 3632 /* Fault Recording Registers, 128-bit */ 3633 vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0); 3634 vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); 3635 3636 /* 3637 * Interrupt remapping registers. 3638 */ 3639 vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); 3640 } 3641 3642 /* Should not reset address_spaces when reset because devices will still use 3643 * the address space they got at first (won't ask the bus again). 3644 */ 3645 static void vtd_reset(DeviceState *dev) 3646 { 3647 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3648 3649 vtd_init(s); 3650 vtd_address_space_refresh_all(s); 3651 } 3652 3653 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3654 { 3655 IntelIOMMUState *s = opaque; 3656 VTDAddressSpace *vtd_as; 3657 3658 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3659 3660 vtd_as = vtd_find_add_as(s, bus, devfn); 3661 return &vtd_as->as; 3662 } 3663 3664 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3665 { 3666 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3667 3668 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { 3669 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3670 return false; 3671 } 3672 3673 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3674 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3675 && x86_iommu_ir_supported(x86_iommu) ? 3676 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3677 } 3678 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3679 if (!kvm_irqchip_in_kernel()) { 3680 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3681 return false; 3682 } 3683 if (!kvm_enable_x2apic()) { 3684 error_setg(errp, "eim=on requires support on the KVM side" 3685 "(X2APIC_API, first shipped in v4.7)"); 3686 return false; 3687 } 3688 } 3689 3690 /* Currently only address widths supported are 39 and 48 bits */ 3691 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3692 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3693 error_setg(errp, "Supported values for x-aw-bits are: %d, %d", 3694 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3695 return false; 3696 } 3697 3698 if (s->scalable_mode && !s->dma_drain) { 3699 error_setg(errp, "Need to set dma_drain for scalable mode"); 3700 return false; 3701 } 3702 3703 return true; 3704 } 3705 3706 static int vtd_machine_done_notify_one(Object *child, void *unused) 3707 { 3708 IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); 3709 3710 /* 3711 * We hard-coded here because vfio-pci is the only special case 3712 * here. Let's be more elegant in the future when we can, but so 3713 * far there seems to be no better way. 3714 */ 3715 if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { 3716 vtd_panic_require_caching_mode(); 3717 } 3718 3719 return 0; 3720 } 3721 3722 static void vtd_machine_done_hook(Notifier *notifier, void *unused) 3723 { 3724 object_child_foreach_recursive(object_get_root(), 3725 vtd_machine_done_notify_one, NULL); 3726 } 3727 3728 static Notifier vtd_machine_done_notify = { 3729 .notify = vtd_machine_done_hook, 3730 }; 3731 3732 static void vtd_realize(DeviceState *dev, Error **errp) 3733 { 3734 MachineState *ms = MACHINE(qdev_get_machine()); 3735 PCMachineState *pcms = PC_MACHINE(ms); 3736 PCIBus *bus = pcms->bus; 3737 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3738 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); 3739 3740 x86_iommu->type = TYPE_INTEL; 3741 3742 if (!vtd_decide_config(s, errp)) { 3743 return; 3744 } 3745 3746 QLIST_INIT(&s->vtd_as_with_notifiers); 3747 qemu_mutex_init(&s->iommu_lock); 3748 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3749 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3750 "intel_iommu", DMAR_REG_SIZE); 3751 3752 /* Create the shared memory regions by all devices */ 3753 memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", 3754 UINT64_MAX); 3755 memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, 3756 s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); 3757 memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), 3758 "vtd-sys-alias", get_system_memory(), 0, 3759 memory_region_size(get_system_memory())); 3760 memory_region_add_subregion_overlap(&s->mr_nodmar, 0, 3761 &s->mr_sys_alias, 0); 3762 memory_region_add_subregion_overlap(&s->mr_nodmar, 3763 VTD_INTERRUPT_ADDR_FIRST, 3764 &s->mr_ir, 1); 3765 3766 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3767 /* No corresponding destroy */ 3768 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3769 g_free, g_free); 3770 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3771 g_free, g_free); 3772 vtd_init(s); 3773 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3774 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3775 /* Pseudo address space under root PCI bus. */ 3776 pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3777 qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); 3778 } 3779 3780 static void vtd_class_init(ObjectClass *klass, void *data) 3781 { 3782 DeviceClass *dc = DEVICE_CLASS(klass); 3783 X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass); 3784 3785 dc->reset = vtd_reset; 3786 dc->vmsd = &vtd_vmstate; 3787 dc->props = vtd_properties; 3788 dc->hotpluggable = false; 3789 x86_class->realize = vtd_realize; 3790 x86_class->int_remap = vtd_int_remap; 3791 /* Supported by the pc-q35-* machine types */ 3792 dc->user_creatable = true; 3793 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3794 dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; 3795 } 3796 3797 static const TypeInfo vtd_info = { 3798 .name = TYPE_INTEL_IOMMU_DEVICE, 3799 .parent = TYPE_X86_IOMMU_DEVICE, 3800 .instance_size = sizeof(IntelIOMMUState), 3801 .class_init = vtd_class_init, 3802 }; 3803 3804 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3805 void *data) 3806 { 3807 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3808 3809 imrc->translate = vtd_iommu_translate; 3810 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3811 imrc->replay = vtd_iommu_replay; 3812 } 3813 3814 static const TypeInfo vtd_iommu_memory_region_info = { 3815 .parent = TYPE_IOMMU_MEMORY_REGION, 3816 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3817 .class_init = vtd_iommu_memory_region_class_init, 3818 }; 3819 3820 static void vtd_register_types(void) 3821 { 3822 type_register_static(&vtd_info); 3823 type_register_static(&vtd_iommu_memory_region_info); 3824 } 3825 3826 type_init(vtd_register_types) 3827