1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qemu/main-loop.h" 25 #include "qapi/error.h" 26 #include "hw/sysbus.h" 27 #include "exec/address-spaces.h" 28 #include "intel_iommu_internal.h" 29 #include "hw/pci/pci.h" 30 #include "hw/pci/pci_bus.h" 31 #include "hw/qdev-properties.h" 32 #include "hw/i386/pc.h" 33 #include "hw/i386/apic-msidef.h" 34 #include "hw/boards.h" 35 #include "hw/i386/x86-iommu.h" 36 #include "hw/pci-host/q35.h" 37 #include "sysemu/kvm.h" 38 #include "sysemu/sysemu.h" 39 #include "hw/i386/apic_internal.h" 40 #include "kvm_i386.h" 41 #include "migration/vmstate.h" 42 #include "trace.h" 43 44 /* context entry operations */ 45 #define VTD_CE_GET_RID2PASID(ce) \ 46 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) 47 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ 48 ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) 49 50 /* pe operations */ 51 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) 52 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) 53 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, 
is_write) {\ 54 if (ret_fr) { \ 55 ret_fr = -ret_fr; \ 56 if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ 57 trace_vtd_fault_disabled(); \ 58 } else { \ 59 vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ 60 } \ 61 goto error; \ 62 } \ 63 } 64 65 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 66 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 67 68 static void vtd_panic_require_caching_mode(void) 69 { 70 error_report("We need to set caching-mode=on for intel-iommu to enable " 71 "device assignment with IOMMU protection."); 72 exit(1); 73 } 74 75 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 76 uint64_t wmask, uint64_t w1cmask) 77 { 78 stq_le_p(&s->csr[addr], val); 79 stq_le_p(&s->wmask[addr], wmask); 80 stq_le_p(&s->w1cmask[addr], w1cmask); 81 } 82 83 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 84 { 85 stq_le_p(&s->womask[addr], mask); 86 } 87 88 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 89 uint32_t wmask, uint32_t w1cmask) 90 { 91 stl_le_p(&s->csr[addr], val); 92 stl_le_p(&s->wmask[addr], wmask); 93 stl_le_p(&s->w1cmask[addr], w1cmask); 94 } 95 96 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 97 { 98 stl_le_p(&s->womask[addr], mask); 99 } 100 101 /* "External" get/set operations */ 102 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 103 { 104 uint64_t oldval = ldq_le_p(&s->csr[addr]); 105 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 106 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 107 stq_le_p(&s->csr[addr], 108 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 109 } 110 111 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 112 { 113 uint32_t oldval = ldl_le_p(&s->csr[addr]); 114 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 115 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 116 stl_le_p(&s->csr[addr], 117 ((oldval & 
~wmask) | (val & wmask)) & ~(w1cmask & val)); 118 } 119 120 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 121 { 122 uint64_t val = ldq_le_p(&s->csr[addr]); 123 uint64_t womask = ldq_le_p(&s->womask[addr]); 124 return val & ~womask; 125 } 126 127 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 128 { 129 uint32_t val = ldl_le_p(&s->csr[addr]); 130 uint32_t womask = ldl_le_p(&s->womask[addr]); 131 return val & ~womask; 132 } 133 134 /* "Internal" get/set operations */ 135 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 136 { 137 return ldq_le_p(&s->csr[addr]); 138 } 139 140 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 141 { 142 return ldl_le_p(&s->csr[addr]); 143 } 144 145 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 146 { 147 stq_le_p(&s->csr[addr], val); 148 } 149 150 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 151 uint32_t clear, uint32_t mask) 152 { 153 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 154 stl_le_p(&s->csr[addr], new_val); 155 return new_val; 156 } 157 158 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 159 uint64_t clear, uint64_t mask) 160 { 161 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 162 stq_le_p(&s->csr[addr], new_val); 163 return new_val; 164 } 165 166 static inline void vtd_iommu_lock(IntelIOMMUState *s) 167 { 168 qemu_mutex_lock(&s->iommu_lock); 169 } 170 171 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 172 { 173 qemu_mutex_unlock(&s->iommu_lock); 174 } 175 176 static void vtd_update_scalable_state(IntelIOMMUState *s) 177 { 178 uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 179 180 if (s->scalable_mode) { 181 s->root_scalable = val & VTD_RTADDR_SMT; 182 } 183 } 184 185 /* Whether the address space needs to notify new mappings */ 186 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 187 { 188 return as->notifier_flags & 
IOMMU_NOTIFIER_MAP; 189 } 190 191 /* GHashTable functions */ 192 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 193 { 194 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 195 } 196 197 static guint vtd_uint64_hash(gconstpointer v) 198 { 199 return (guint)*(const uint64_t *)v; 200 } 201 202 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 203 gpointer user_data) 204 { 205 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 206 uint16_t domain_id = *(uint16_t *)user_data; 207 return entry->domain_id == domain_id; 208 } 209 210 /* The shift of an addr for a certain level of paging structure */ 211 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 212 { 213 assert(level != 0); 214 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 215 } 216 217 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 218 { 219 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 220 } 221 222 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 223 gpointer user_data) 224 { 225 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 226 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 227 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 228 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 229 return (entry->domain_id == info->domain_id) && 230 (((entry->gfn & info->mask) == gfn) || 231 (entry->gfn == gfn_tlb)); 232 } 233 234 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 235 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 
236 */ 237 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 238 { 239 VTDAddressSpace *vtd_as; 240 VTDBus *vtd_bus; 241 GHashTableIter bus_it; 242 uint32_t devfn_it; 243 244 trace_vtd_context_cache_reset(); 245 246 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 247 248 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 249 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 250 vtd_as = vtd_bus->dev_as[devfn_it]; 251 if (!vtd_as) { 252 continue; 253 } 254 vtd_as->context_cache_entry.context_cache_gen = 0; 255 } 256 } 257 s->context_cache_gen = 1; 258 } 259 260 /* Must be called with IOMMU lock held. */ 261 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 262 { 263 assert(s->iotlb); 264 g_hash_table_remove_all(s->iotlb); 265 } 266 267 static void vtd_reset_iotlb(IntelIOMMUState *s) 268 { 269 vtd_iommu_lock(s); 270 vtd_reset_iotlb_locked(s); 271 vtd_iommu_unlock(s); 272 } 273 274 static void vtd_reset_caches(IntelIOMMUState *s) 275 { 276 vtd_iommu_lock(s); 277 vtd_reset_iotlb_locked(s); 278 vtd_reset_context_cache_locked(s); 279 vtd_iommu_unlock(s); 280 } 281 282 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 283 uint32_t level) 284 { 285 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 286 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 287 } 288 289 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 290 { 291 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 292 } 293 294 /* Must be called with IOMMU lock held */ 295 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 296 hwaddr addr) 297 { 298 VTDIOTLBEntry *entry; 299 uint64_t key; 300 int level; 301 302 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 303 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 304 source_id, level); 305 entry = g_hash_table_lookup(s->iotlb, &key); 306 if (entry) { 307 goto out; 308 } 309 } 310 311 out: 312 return entry; 313 
} 314 315 /* Must be with IOMMU lock held */ 316 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 317 uint16_t domain_id, hwaddr addr, uint64_t slpte, 318 uint8_t access_flags, uint32_t level) 319 { 320 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 321 uint64_t *key = g_malloc(sizeof(*key)); 322 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 323 324 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 325 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 326 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 327 vtd_reset_iotlb_locked(s); 328 } 329 330 entry->gfn = gfn; 331 entry->domain_id = domain_id; 332 entry->slpte = slpte; 333 entry->access_flags = access_flags; 334 entry->mask = vtd_slpt_level_page_mask(level); 335 *key = vtd_get_iotlb_key(gfn, source_id, level); 336 g_hash_table_replace(s->iotlb, key, entry); 337 } 338 339 /* Given the reg addr of both the message data and address, generate an 340 * interrupt via MSI. 341 */ 342 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, 343 hwaddr mesg_data_reg) 344 { 345 MSIMessage msi; 346 347 assert(mesg_data_reg < DMAR_REG_SIZE); 348 assert(mesg_addr_reg < DMAR_REG_SIZE); 349 350 msi.address = vtd_get_long_raw(s, mesg_addr_reg); 351 msi.data = vtd_get_long_raw(s, mesg_data_reg); 352 353 trace_vtd_irq_generate(msi.address, msi.data); 354 355 apic_get_class()->send_msi(&msi); 356 } 357 358 /* Generate a fault event to software via MSI if conditions are met. 359 * Notice that the value of FSTS_REG being passed to it should be the one 360 * before any update. 
361 */ 362 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) 363 { 364 if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || 365 pre_fsts & VTD_FSTS_IQE) { 366 error_report_once("There are previous interrupt conditions " 367 "to be serviced by software, fault event " 368 "is not generated"); 369 return; 370 } 371 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); 372 if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { 373 error_report_once("Interrupt Mask set, irq is not generated"); 374 } else { 375 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 376 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 377 } 378 } 379 380 /* Check if the Fault (F) field of the Fault Recording Register referenced by 381 * @index is Set. 382 */ 383 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index) 384 { 385 /* Each reg is 128-bit */ 386 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 387 addr += 8; /* Access the high 64-bit half */ 388 389 assert(index < DMAR_FRCD_REG_NR); 390 391 return vtd_get_quad_raw(s, addr) & VTD_FRCD_F; 392 } 393 394 /* Update the PPF field of Fault Status Register. 395 * Should be called whenever change the F field of any fault recording 396 * registers. 
397 */ 398 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 399 { 400 uint32_t i; 401 uint32_t ppf_mask = 0; 402 403 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 404 if (vtd_is_frcd_set(s, i)) { 405 ppf_mask = VTD_FSTS_PPF; 406 break; 407 } 408 } 409 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 410 trace_vtd_fsts_ppf(!!ppf_mask); 411 } 412 413 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 414 { 415 /* Each reg is 128-bit */ 416 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 417 addr += 8; /* Access the high 64-bit half */ 418 419 assert(index < DMAR_FRCD_REG_NR); 420 421 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 422 vtd_update_fsts_ppf(s); 423 } 424 425 /* Must not update F field now, should be done later */ 426 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 427 uint16_t source_id, hwaddr addr, 428 VTDFaultReason fault, bool is_write) 429 { 430 uint64_t hi = 0, lo; 431 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 432 433 assert(index < DMAR_FRCD_REG_NR); 434 435 lo = VTD_FRCD_FI(addr); 436 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 437 if (!is_write) { 438 hi |= VTD_FRCD_T; 439 } 440 vtd_set_quad_raw(s, frcd_reg_addr, lo); 441 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 442 443 trace_vtd_frr_new(index, hi, lo); 444 } 445 446 /* Try to collapse multiple pending faults from the same requester */ 447 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 448 { 449 uint32_t i; 450 uint64_t frcd_reg; 451 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 452 453 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 454 frcd_reg = vtd_get_quad_raw(s, addr); 455 if ((frcd_reg & VTD_FRCD_F) && 456 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 457 return true; 458 } 459 addr += 16; /* 128-bit for each */ 460 } 461 return false; 462 } 463 464 /* Log and report an DMAR (address translation) fault to software */ 465 static void 
vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 466 hwaddr addr, VTDFaultReason fault, 467 bool is_write) 468 { 469 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 470 471 assert(fault < VTD_FR_MAX); 472 473 if (fault == VTD_FR_RESERVED_ERR) { 474 /* This is not a normal fault reason case. Drop it. */ 475 return; 476 } 477 478 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 479 480 if (fsts_reg & VTD_FSTS_PFO) { 481 error_report_once("New fault is not recorded due to " 482 "Primary Fault Overflow"); 483 return; 484 } 485 486 if (vtd_try_collapse_fault(s, source_id)) { 487 error_report_once("New fault is not recorded due to " 488 "compression of faults"); 489 return; 490 } 491 492 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 493 error_report_once("Next Fault Recording Reg is used, " 494 "new fault is not recorded, set PFO field"); 495 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 496 return; 497 } 498 499 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 500 501 if (fsts_reg & VTD_FSTS_PPF) { 502 error_report_once("There are pending faults already, " 503 "fault event is not generated"); 504 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 505 s->next_frcd_reg++; 506 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 507 s->next_frcd_reg = 0; 508 } 509 } else { 510 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 511 VTD_FSTS_FRI(s->next_frcd_reg)); 512 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 513 s->next_frcd_reg++; 514 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 515 s->next_frcd_reg = 0; 516 } 517 /* This case actually cause the PPF to be Set. 518 * So generate fault event (interrupt). 519 */ 520 vtd_generate_fault_event(s, fsts_reg); 521 } 522 } 523 524 /* Handle Invalidation Queue Errors of queued invalidation interface error 525 * conditions. 
526 */ 527 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 528 { 529 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 530 531 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 532 vtd_generate_fault_event(s, fsts_reg); 533 } 534 535 /* Set the IWC field and try to generate an invalidation completion interrupt */ 536 static void vtd_generate_completion_event(IntelIOMMUState *s) 537 { 538 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 539 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 540 return; 541 } 542 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 543 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 544 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 545 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 546 "new event not generated"); 547 return; 548 } else { 549 /* Generate the interrupt event */ 550 trace_vtd_inv_desc_wait_irq("Generating complete event"); 551 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 552 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 553 } 554 } 555 556 static inline bool vtd_root_entry_present(IntelIOMMUState *s, 557 VTDRootEntry *re, 558 uint8_t devfn) 559 { 560 if (s->root_scalable && devfn > UINT8_MAX / 2) { 561 return re->hi & VTD_ROOT_ENTRY_P; 562 } 563 564 return re->lo & VTD_ROOT_ENTRY_P; 565 } 566 567 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 568 VTDRootEntry *re) 569 { 570 dma_addr_t addr; 571 572 addr = s->root + index * sizeof(*re); 573 if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { 574 re->lo = 0; 575 return -VTD_FR_ROOT_TABLE_INV; 576 } 577 re->lo = le64_to_cpu(re->lo); 578 re->hi = le64_to_cpu(re->hi); 579 return 0; 580 } 581 582 static inline bool vtd_ce_present(VTDContextEntry *context) 583 { 584 return context->lo & VTD_CONTEXT_ENTRY_P; 585 } 586 587 static int vtd_get_context_entry_from_root(IntelIOMMUState *s, 588 VTDRootEntry *re, 589 uint8_t index, 590 
VTDContextEntry *ce) 591 { 592 dma_addr_t addr, ce_size; 593 594 /* we have checked that root entry is present */ 595 ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE : 596 VTD_CTX_ENTRY_LEGACY_SIZE; 597 598 if (s->root_scalable && index > UINT8_MAX / 2) { 599 index = index & (~VTD_DEVFN_CHECK_MASK); 600 addr = re->hi & VTD_ROOT_ENTRY_CTP; 601 } else { 602 addr = re->lo & VTD_ROOT_ENTRY_CTP; 603 } 604 605 addr = addr + index * ce_size; 606 if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { 607 return -VTD_FR_CONTEXT_TABLE_INV; 608 } 609 610 ce->lo = le64_to_cpu(ce->lo); 611 ce->hi = le64_to_cpu(ce->hi); 612 if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { 613 ce->val[2] = le64_to_cpu(ce->val[2]); 614 ce->val[3] = le64_to_cpu(ce->val[3]); 615 } 616 return 0; 617 } 618 619 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) 620 { 621 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; 622 } 623 624 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) 625 { 626 return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); 627 } 628 629 /* Whether the pte indicates the address of the page frame */ 630 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) 631 { 632 return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); 633 } 634 635 /* Get the content of a spte located in @base_addr[@index] */ 636 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) 637 { 638 uint64_t slpte; 639 640 assert(index < VTD_SL_PT_ENTRY_NR); 641 642 if (dma_memory_read(&address_space_memory, 643 base_addr + index * sizeof(slpte), &slpte, 644 sizeof(slpte))) { 645 slpte = (uint64_t)-1; 646 return slpte; 647 } 648 slpte = le64_to_cpu(slpte); 649 return slpte; 650 } 651 652 /* Given an iova and the level of paging structure, return the offset 653 * of current level. 
654 */ 655 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) 656 { 657 return (iova >> vtd_slpt_level_shift(level)) & 658 ((1ULL << VTD_SL_LEVEL_BITS) - 1); 659 } 660 661 /* Check Capability Register to see if the @level of page-table is supported */ 662 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) 663 { 664 return VTD_CAP_SAGAW_MASK & s->cap & 665 (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); 666 } 667 668 /* Return true if check passed, otherwise false */ 669 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, 670 VTDPASIDEntry *pe) 671 { 672 switch (VTD_PE_GET_TYPE(pe)) { 673 case VTD_SM_PASID_ENTRY_FLT: 674 case VTD_SM_PASID_ENTRY_SLT: 675 case VTD_SM_PASID_ENTRY_NESTED: 676 break; 677 case VTD_SM_PASID_ENTRY_PT: 678 if (!x86_iommu->pt_supported) { 679 return false; 680 } 681 break; 682 default: 683 /* Unknwon type */ 684 return false; 685 } 686 return true; 687 } 688 689 static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, 690 uint32_t pasid, 691 VTDPASIDDirEntry *pdire) 692 { 693 uint32_t index; 694 dma_addr_t addr, entry_size; 695 696 index = VTD_PASID_DIR_INDEX(pasid); 697 entry_size = VTD_PASID_DIR_ENTRY_SIZE; 698 addr = pasid_dir_base + index * entry_size; 699 if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { 700 return -VTD_FR_PASID_TABLE_INV; 701 } 702 703 return 0; 704 } 705 706 static int vtd_get_pasid_entry(IntelIOMMUState *s, 707 uint32_t pasid, 708 VTDPASIDDirEntry *pdire, 709 VTDPASIDEntry *pe) 710 { 711 uint32_t index; 712 dma_addr_t addr, entry_size; 713 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 714 715 index = VTD_PASID_TABLE_INDEX(pasid); 716 entry_size = VTD_PASID_ENTRY_SIZE; 717 addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; 718 addr = addr + index * entry_size; 719 if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { 720 return -VTD_FR_PASID_TABLE_INV; 721 } 722 723 /* Do translation type check */ 724 if 
(!vtd_pe_type_check(x86_iommu, pe)) { 725 return -VTD_FR_PASID_TABLE_INV; 726 } 727 728 if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { 729 return -VTD_FR_PASID_TABLE_INV; 730 } 731 732 return 0; 733 } 734 735 static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s, 736 dma_addr_t pasid_dir_base, 737 uint32_t pasid, 738 VTDPASIDEntry *pe) 739 { 740 int ret; 741 VTDPASIDDirEntry pdire; 742 743 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 744 if (ret) { 745 return ret; 746 } 747 748 ret = vtd_get_pasid_entry(s, pasid, &pdire, pe); 749 if (ret) { 750 return ret; 751 } 752 753 return ret; 754 } 755 756 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, 757 VTDContextEntry *ce, 758 VTDPASIDEntry *pe) 759 { 760 uint32_t pasid; 761 dma_addr_t pasid_dir_base; 762 int ret = 0; 763 764 pasid = VTD_CE_GET_RID2PASID(ce); 765 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 766 ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe); 767 768 return ret; 769 } 770 771 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, 772 VTDContextEntry *ce, 773 bool *pe_fpd_set) 774 { 775 int ret; 776 uint32_t pasid; 777 dma_addr_t pasid_dir_base; 778 VTDPASIDDirEntry pdire; 779 VTDPASIDEntry pe; 780 781 pasid = VTD_CE_GET_RID2PASID(ce); 782 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 783 784 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 785 if (ret) { 786 return ret; 787 } 788 789 if (pdire.val & VTD_PASID_DIR_FPD) { 790 *pe_fpd_set = true; 791 return 0; 792 } 793 794 ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe); 795 if (ret) { 796 return ret; 797 } 798 799 if (pe.val[0] & VTD_PASID_ENTRY_FPD) { 800 *pe_fpd_set = true; 801 } 802 803 return 0; 804 } 805 806 /* Get the page-table level that hardware should use for the second-level 807 * page-table walk from the Address Width field of context-entry. 
808 */ 809 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) 810 { 811 return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); 812 } 813 814 static uint32_t vtd_get_iova_level(IntelIOMMUState *s, 815 VTDContextEntry *ce) 816 { 817 VTDPASIDEntry pe; 818 819 if (s->root_scalable) { 820 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 821 return VTD_PE_GET_LEVEL(&pe); 822 } 823 824 return vtd_ce_get_level(ce); 825 } 826 827 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) 828 { 829 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; 830 } 831 832 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, 833 VTDContextEntry *ce) 834 { 835 VTDPASIDEntry pe; 836 837 if (s->root_scalable) { 838 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 839 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; 840 } 841 842 return vtd_ce_get_agaw(ce); 843 } 844 845 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) 846 { 847 return ce->lo & VTD_CONTEXT_ENTRY_TT; 848 } 849 850 /* Only for Legacy Mode. Return true if check passed, otherwise false */ 851 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, 852 VTDContextEntry *ce) 853 { 854 switch (vtd_ce_get_type(ce)) { 855 case VTD_CONTEXT_TT_MULTI_LEVEL: 856 /* Always supported */ 857 break; 858 case VTD_CONTEXT_TT_DEV_IOTLB: 859 if (!x86_iommu->dt_supported) { 860 error_report_once("%s: DT specified but not supported", __func__); 861 return false; 862 } 863 break; 864 case VTD_CONTEXT_TT_PASS_THROUGH: 865 if (!x86_iommu->pt_supported) { 866 error_report_once("%s: PT specified but not supported", __func__); 867 return false; 868 } 869 break; 870 default: 871 /* Unknown type */ 872 error_report_once("%s: unknown ce type: %"PRIu32, __func__, 873 vtd_ce_get_type(ce)); 874 return false; 875 } 876 return true; 877 } 878 879 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, 880 VTDContextEntry *ce, uint8_t aw) 881 { 882 uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); 883 return 1ULL << MIN(ce_agaw, aw); 884 } 885 886 /* 
Return true if IOVA passes range check, otherwise false. */ 887 static inline bool vtd_iova_range_check(IntelIOMMUState *s, 888 uint64_t iova, VTDContextEntry *ce, 889 uint8_t aw) 890 { 891 /* 892 * Check if @iova is above 2^X-1, where X is the minimum of MGAW 893 * in CAP_REG and AW in context-entry. 894 */ 895 return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); 896 } 897 898 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, 899 VTDContextEntry *ce) 900 { 901 VTDPASIDEntry pe; 902 903 if (s->root_scalable) { 904 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 905 return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; 906 } 907 908 return vtd_ce_get_slpt_base(ce); 909 } 910 911 /* 912 * Rsvd field masks for spte: 913 * vtd_spte_rsvd 4k pages 914 * vtd_spte_rsvd_large large pages 915 */ 916 static uint64_t vtd_spte_rsvd[5]; 917 static uint64_t vtd_spte_rsvd_large[5]; 918 919 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 920 { 921 uint64_t rsvd_mask = vtd_spte_rsvd[level]; 922 923 if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && 924 (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { 925 /* large page */ 926 rsvd_mask = vtd_spte_rsvd_large[level]; 927 } 928 929 return slpte & rsvd_mask; 930 } 931 932 /* Find the VTD address space associated with a given bus number */ 933 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 934 { 935 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 936 if (!vtd_bus) { 937 /* 938 * Iterate over the registered buses to find the one which 939 * currently hold this bus number, and update the bus_num 940 * lookup table: 941 */ 942 GHashTableIter iter; 943 944 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 945 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 946 if (pci_bus_num(vtd_bus->bus) == bus_num) { 947 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 948 return vtd_bus; 949 } 950 } 951 } 952 return vtd_bus; 953 } 954 955 /* Given the @iova, get relevant @slptep. 
@slpte_level will be the last level 956 * of the translation, can be used for deciding the size of large page. 957 */ 958 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, 959 uint64_t iova, bool is_write, 960 uint64_t *slptep, uint32_t *slpte_level, 961 bool *reads, bool *writes, uint8_t aw_bits) 962 { 963 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 964 uint32_t level = vtd_get_iova_level(s, ce); 965 uint32_t offset; 966 uint64_t slpte; 967 uint64_t access_right_check; 968 969 if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { 970 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 971 __func__, iova); 972 return -VTD_FR_ADDR_BEYOND_MGAW; 973 } 974 975 /* FIXME: what is the Atomics request here? */ 976 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 977 978 while (true) { 979 offset = vtd_iova_level_offset(iova, level); 980 slpte = vtd_get_slpte(addr, offset); 981 982 if (slpte == (uint64_t)-1) { 983 error_report_once("%s: detected read error on DMAR slpte " 984 "(iova=0x%" PRIx64 ")", __func__, iova); 985 if (level == vtd_get_iova_level(s, ce)) { 986 /* Invalid programming of context-entry */ 987 return -VTD_FR_CONTEXT_ENTRY_INV; 988 } else { 989 return -VTD_FR_PAGING_ENTRY_INV; 990 } 991 } 992 *reads = (*reads) && (slpte & VTD_SL_R); 993 *writes = (*writes) && (slpte & VTD_SL_W); 994 if (!(slpte & access_right_check)) { 995 error_report_once("%s: detected slpte permission error " 996 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 997 "slpte=0x%" PRIx64 ", write=%d)", __func__, 998 iova, level, slpte, is_write); 999 return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; 1000 } 1001 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1002 error_report_once("%s: detected splte reserve non-zero " 1003 "iova=0x%" PRIx64 ", level=0x%" PRIx32 1004 "slpte=0x%" PRIx64 ")", __func__, iova, 1005 level, slpte); 1006 return -VTD_FR_PAGING_ENTRY_RSVD; 1007 } 1008 1009 if (vtd_is_last_slpte(slpte, level)) { 1010 *slptep = slpte; 1011 *slpte_level = level; 1012 return 0; 1013 } 1014 addr = vtd_get_slpte_addr(slpte, aw_bits); 1015 level--; 1016 } 1017 } 1018 1019 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); 1020 1021 /** 1022 * Constant information used during page walking 1023 * 1024 * @hook_fn: hook func to be called when detected page 1025 * @private: private data to be passed into hook func 1026 * @notify_unmap: whether we should notify invalid entries 1027 * @as: VT-d address space of the device 1028 * @aw: maximum address width 1029 * @domain: domain ID of the page walk 1030 */ 1031 typedef struct { 1032 VTDAddressSpace *as; 1033 vtd_page_walk_hook hook_fn; 1034 void *private; 1035 bool notify_unmap; 1036 uint8_t aw; 1037 uint16_t domain_id; 1038 } vtd_page_walk_info; 1039 1040 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) 1041 { 1042 VTDAddressSpace *as = info->as; 1043 vtd_page_walk_hook hook_fn = info->hook_fn; 1044 void *private = info->private; 1045 DMAMap target = { 1046 .iova = entry->iova, 1047 .size = entry->addr_mask, 1048 .translated_addr = entry->translated_addr, 1049 .perm = entry->perm, 1050 }; 1051 DMAMap *mapped = iova_tree_find(as->iova_tree, &target); 1052 1053 if (entry->perm == IOMMU_NONE && !info->notify_unmap) { 1054 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1055 return 0; 1056 } 1057 1058 assert(hook_fn); 1059 1060 /* Update local IOVA mapped ranges */ 1061 if (entry->perm) { 1062 if (mapped) { 1063 /* If it's exactly the same translation, skip */ 1064 if (!memcmp(mapped, &target, sizeof(target))) { 1065 
trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, 1066 entry->translated_addr); 1067 return 0; 1068 } else { 1069 /* 1070 * Translation changed. Normally this should not 1071 * happen, but it can happen when with buggy guest 1072 * OSes. Note that there will be a small window that 1073 * we don't have map at all. But that's the best 1074 * effort we can do. The ideal way to emulate this is 1075 * atomically modify the PTE to follow what has 1076 * changed, but we can't. One example is that vfio 1077 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no 1078 * interface to modify a mapping (meanwhile it seems 1079 * meaningless to even provide one). Anyway, let's 1080 * mark this as a TODO in case one day we'll have 1081 * a better solution. 1082 */ 1083 IOMMUAccessFlags cache_perm = entry->perm; 1084 int ret; 1085 1086 /* Emulate an UNMAP */ 1087 entry->perm = IOMMU_NONE; 1088 trace_vtd_page_walk_one(info->domain_id, 1089 entry->iova, 1090 entry->translated_addr, 1091 entry->addr_mask, 1092 entry->perm); 1093 ret = hook_fn(entry, private); 1094 if (ret) { 1095 return ret; 1096 } 1097 /* Drop any existing mapping */ 1098 iova_tree_remove(as->iova_tree, &target); 1099 /* Recover the correct permission */ 1100 entry->perm = cache_perm; 1101 } 1102 } 1103 iova_tree_insert(as->iova_tree, &target); 1104 } else { 1105 if (!mapped) { 1106 /* Skip since we didn't map this range at all */ 1107 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1108 return 0; 1109 } 1110 iova_tree_remove(as->iova_tree, &target); 1111 } 1112 1113 trace_vtd_page_walk_one(info->domain_id, entry->iova, 1114 entry->translated_addr, entry->addr_mask, 1115 entry->perm); 1116 return hook_fn(entry, private); 1117 } 1118 1119 /** 1120 * vtd_page_walk_level - walk over specific level for IOVA range 1121 * 1122 * @addr: base GPA addr to start the walk 1123 * @start: IOVA range start address 1124 * @end: IOVA range end address (start <= addr < end) 1125 * @read: whether parent 
level has read permission 1126 * @write: whether parent level has write permission 1127 * @info: constant information for the page walk 1128 */ 1129 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 1130 uint64_t end, uint32_t level, bool read, 1131 bool write, vtd_page_walk_info *info) 1132 { 1133 bool read_cur, write_cur, entry_valid; 1134 uint32_t offset; 1135 uint64_t slpte; 1136 uint64_t subpage_size, subpage_mask; 1137 IOMMUTLBEntry entry; 1138 uint64_t iova = start; 1139 uint64_t iova_next; 1140 int ret = 0; 1141 1142 trace_vtd_page_walk_level(addr, level, start, end); 1143 1144 subpage_size = 1ULL << vtd_slpt_level_shift(level); 1145 subpage_mask = vtd_slpt_level_page_mask(level); 1146 1147 while (iova < end) { 1148 iova_next = (iova & subpage_mask) + subpage_size; 1149 1150 offset = vtd_iova_level_offset(iova, level); 1151 slpte = vtd_get_slpte(addr, offset); 1152 1153 if (slpte == (uint64_t)-1) { 1154 trace_vtd_page_walk_skip_read(iova, iova_next); 1155 goto next; 1156 } 1157 1158 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1159 trace_vtd_page_walk_skip_reserve(iova, iova_next); 1160 goto next; 1161 } 1162 1163 /* Permissions are stacked with parents' */ 1164 read_cur = read && (slpte & VTD_SL_R); 1165 write_cur = write && (slpte & VTD_SL_W); 1166 1167 /* 1168 * As long as we have either read/write permission, this is a 1169 * valid entry. The rule works for both page entries and page 1170 * table entries. 1171 */ 1172 entry_valid = read_cur | write_cur; 1173 1174 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 1175 /* 1176 * This is a valid PDE (or even bigger than PDE). We need 1177 * to walk one further level. 
1178 */ 1179 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 1180 iova, MIN(iova_next, end), level - 1, 1181 read_cur, write_cur, info); 1182 } else { 1183 /* 1184 * This means we are either: 1185 * 1186 * (1) the real page entry (either 4K page, or huge page) 1187 * (2) the whole range is invalid 1188 * 1189 * In either case, we send an IOTLB notification down. 1190 */ 1191 entry.target_as = &address_space_memory; 1192 entry.iova = iova & subpage_mask; 1193 entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 1194 entry.addr_mask = ~subpage_mask; 1195 /* NOTE: this is only meaningful if entry_valid == true */ 1196 entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 1197 ret = vtd_page_walk_one(&entry, info); 1198 } 1199 1200 if (ret < 0) { 1201 return ret; 1202 } 1203 1204 next: 1205 iova = iova_next; 1206 } 1207 1208 return 0; 1209 } 1210 1211 /** 1212 * vtd_page_walk - walk specific IOVA range, and call the hook 1213 * 1214 * @s: intel iommu state 1215 * @ce: context entry to walk upon 1216 * @start: IOVA address to start the walk 1217 * @end: IOVA range end address (start <= addr < end) 1218 * @info: page walking information struct 1219 */ 1220 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, 1221 uint64_t start, uint64_t end, 1222 vtd_page_walk_info *info) 1223 { 1224 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1225 uint32_t level = vtd_get_iova_level(s, ce); 1226 1227 if (!vtd_iova_range_check(s, start, ce, info->aw)) { 1228 return -VTD_FR_ADDR_BEYOND_MGAW; 1229 } 1230 1231 if (!vtd_iova_range_check(s, end, ce, info->aw)) { 1232 /* Fix end so that it reaches the maximum */ 1233 end = vtd_iova_limit(s, ce, info->aw); 1234 } 1235 1236 return vtd_page_walk_level(addr, start, end, level, true, true, info); 1237 } 1238 1239 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, 1240 VTDRootEntry *re) 1241 { 1242 /* Legacy Mode reserved bits check */ 1243 if (!s->root_scalable && 1244 (re->hi || (re->lo & 
VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1245 goto rsvd_err; 1246 1247 /* Scalable Mode reserved bits check */ 1248 if (s->root_scalable && 1249 ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || 1250 (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1251 goto rsvd_err; 1252 1253 return 0; 1254 1255 rsvd_err: 1256 error_report_once("%s: invalid root entry: hi=0x%"PRIx64 1257 ", lo=0x%"PRIx64, 1258 __func__, re->hi, re->lo); 1259 return -VTD_FR_ROOT_ENTRY_RSVD; 1260 } 1261 1262 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, 1263 VTDContextEntry *ce) 1264 { 1265 if (!s->root_scalable && 1266 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || 1267 ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1268 error_report_once("%s: invalid context entry: hi=%"PRIx64 1269 ", lo=%"PRIx64" (reserved nonzero)", 1270 __func__, ce->hi, ce->lo); 1271 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1272 } 1273 1274 if (s->root_scalable && 1275 (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || 1276 ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || 1277 ce->val[2] || 1278 ce->val[3])) { 1279 error_report_once("%s: invalid context entry: val[3]=%"PRIx64 1280 ", val[2]=%"PRIx64 1281 ", val[1]=%"PRIx64 1282 ", val[0]=%"PRIx64" (reserved nonzero)", 1283 __func__, ce->val[3], ce->val[2], 1284 ce->val[1], ce->val[0]); 1285 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1286 } 1287 1288 return 0; 1289 } 1290 1291 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, 1292 VTDContextEntry *ce) 1293 { 1294 VTDPASIDEntry pe; 1295 1296 /* 1297 * Make sure in Scalable Mode, a present context entry 1298 * has valid rid2pasid setting, which includes valid 1299 * rid2pasid field and corresponding pasid entry setting 1300 */ 1301 return vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1302 } 1303 1304 /* Map a device to its corresponding domain (context-entry) */ 1305 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 1306 uint8_t devfn, VTDContextEntry *ce) 1307 { 1308 VTDRootEntry re; 1309 int ret_fr; 1310 
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 1311 1312 ret_fr = vtd_get_root_entry(s, bus_num, &re); 1313 if (ret_fr) { 1314 return ret_fr; 1315 } 1316 1317 if (!vtd_root_entry_present(s, &re, devfn)) { 1318 /* Not error - it's okay we don't have root entry. */ 1319 trace_vtd_re_not_present(bus_num); 1320 return -VTD_FR_ROOT_ENTRY_P; 1321 } 1322 1323 ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); 1324 if (ret_fr) { 1325 return ret_fr; 1326 } 1327 1328 ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); 1329 if (ret_fr) { 1330 return ret_fr; 1331 } 1332 1333 if (!vtd_ce_present(ce)) { 1334 /* Not error - it's okay we don't have context entry. */ 1335 trace_vtd_ce_not_present(bus_num, devfn); 1336 return -VTD_FR_CONTEXT_ENTRY_P; 1337 } 1338 1339 ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); 1340 if (ret_fr) { 1341 return ret_fr; 1342 } 1343 1344 /* Check if the programming of context-entry is valid */ 1345 if (!s->root_scalable && 1346 !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1347 error_report_once("%s: invalid context entry: hi=%"PRIx64 1348 ", lo=%"PRIx64" (level %d not supported)", 1349 __func__, ce->hi, ce->lo, 1350 vtd_ce_get_level(ce)); 1351 return -VTD_FR_CONTEXT_ENTRY_INV; 1352 } 1353 1354 if (!s->root_scalable) { 1355 /* Do translation type check */ 1356 if (!vtd_ce_type_check(x86_iommu, ce)) { 1357 /* Errors dumped in vtd_ce_type_check() */ 1358 return -VTD_FR_CONTEXT_ENTRY_INV; 1359 } 1360 } else { 1361 /* 1362 * Check if the programming of context-entry.rid2pasid 1363 * and corresponding pasid setting is valid, and thus 1364 * avoids to check pasid entry fetching result in future 1365 * helper function calling. 
1366 */ 1367 ret_fr = vtd_ce_rid2pasid_check(s, ce); 1368 if (ret_fr) { 1369 return ret_fr; 1370 } 1371 } 1372 1373 return 0; 1374 } 1375 1376 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, 1377 void *private) 1378 { 1379 memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); 1380 return 0; 1381 } 1382 1383 static uint16_t vtd_get_domain_id(IntelIOMMUState *s, 1384 VTDContextEntry *ce) 1385 { 1386 VTDPASIDEntry pe; 1387 1388 if (s->root_scalable) { 1389 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1390 return VTD_SM_PASID_ENTRY_DID(pe.val[1]); 1391 } 1392 1393 return VTD_CONTEXT_ENTRY_DID(ce->hi); 1394 } 1395 1396 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1397 VTDContextEntry *ce, 1398 hwaddr addr, hwaddr size) 1399 { 1400 IntelIOMMUState *s = vtd_as->iommu_state; 1401 vtd_page_walk_info info = { 1402 .hook_fn = vtd_sync_shadow_page_hook, 1403 .private = (void *)&vtd_as->iommu, 1404 .notify_unmap = true, 1405 .aw = s->aw_bits, 1406 .as = vtd_as, 1407 .domain_id = vtd_get_domain_id(s, ce), 1408 }; 1409 1410 return vtd_page_walk(s, ce, addr, addr + size, &info); 1411 } 1412 1413 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1414 { 1415 int ret; 1416 VTDContextEntry ce; 1417 IOMMUNotifier *n; 1418 1419 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1420 pci_bus_num(vtd_as->bus), 1421 vtd_as->devfn, &ce); 1422 if (ret) { 1423 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1424 /* 1425 * It's a valid scenario to have a context entry that is 1426 * not present. For example, when a device is removed 1427 * from an existing domain then the context entry will be 1428 * zeroed by the guest before it was put into another 1429 * domain. When this happens, instead of synchronizing 1430 * the shadow pages we should invalidate all existing 1431 * mappings and notify the backends. 
1432 */ 1433 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 1434 vtd_address_space_unmap(vtd_as, n); 1435 } 1436 ret = 0; 1437 } 1438 return ret; 1439 } 1440 1441 return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX); 1442 } 1443 1444 /* 1445 * Check if specific device is configed to bypass address 1446 * translation for DMA requests. In Scalable Mode, bypass 1447 * 1st-level translation or 2nd-level translation, it depends 1448 * on PGTT setting. 1449 */ 1450 static bool vtd_dev_pt_enabled(VTDAddressSpace *as) 1451 { 1452 IntelIOMMUState *s; 1453 VTDContextEntry ce; 1454 VTDPASIDEntry pe; 1455 int ret; 1456 1457 assert(as); 1458 1459 s = as->iommu_state; 1460 ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), 1461 as->devfn, &ce); 1462 if (ret) { 1463 /* 1464 * Possibly failed to parse the context entry for some reason 1465 * (e.g., during init, or any guest configuration errors on 1466 * context entries). We should assume PT not enabled for 1467 * safety. 1468 */ 1469 return false; 1470 } 1471 1472 if (s->root_scalable) { 1473 ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); 1474 if (ret) { 1475 error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, 1476 __func__, ret); 1477 return false; 1478 } 1479 return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); 1480 } 1481 1482 return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); 1483 } 1484 1485 /* Return whether the device is using IOMMU translation. 
*/ 1486 static bool vtd_switch_address_space(VTDAddressSpace *as) 1487 { 1488 bool use_iommu; 1489 /* Whether we need to take the BQL on our own */ 1490 bool take_bql = !qemu_mutex_iothread_locked(); 1491 1492 assert(as); 1493 1494 use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); 1495 1496 trace_vtd_switch_address_space(pci_bus_num(as->bus), 1497 VTD_PCI_SLOT(as->devfn), 1498 VTD_PCI_FUNC(as->devfn), 1499 use_iommu); 1500 1501 /* 1502 * It's possible that we reach here without BQL, e.g., when called 1503 * from vtd_pt_enable_fast_path(). However the memory APIs need 1504 * it. We'd better make sure we have had it already, or, take it. 1505 */ 1506 if (take_bql) { 1507 qemu_mutex_lock_iothread(); 1508 } 1509 1510 /* Turn off first then on the other */ 1511 if (use_iommu) { 1512 memory_region_set_enabled(&as->nodmar, false); 1513 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1514 } else { 1515 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1516 memory_region_set_enabled(&as->nodmar, true); 1517 } 1518 1519 if (take_bql) { 1520 qemu_mutex_unlock_iothread(); 1521 } 1522 1523 return use_iommu; 1524 } 1525 1526 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1527 { 1528 GHashTableIter iter; 1529 VTDBus *vtd_bus; 1530 int i; 1531 1532 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1533 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1534 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1535 if (!vtd_bus->dev_as[i]) { 1536 continue; 1537 } 1538 vtd_switch_address_space(vtd_bus->dev_as[i]); 1539 } 1540 } 1541 } 1542 1543 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1544 { 1545 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1546 } 1547 1548 static const bool vtd_qualified_faults[] = { 1549 [VTD_FR_RESERVED] = false, 1550 [VTD_FR_ROOT_ENTRY_P] = false, 1551 [VTD_FR_CONTEXT_ENTRY_P] = true, 1552 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1553 [VTD_FR_ADDR_BEYOND_MGAW] = true, 
1554 [VTD_FR_WRITE] = true, 1555 [VTD_FR_READ] = true, 1556 [VTD_FR_PAGING_ENTRY_INV] = true, 1557 [VTD_FR_ROOT_TABLE_INV] = false, 1558 [VTD_FR_CONTEXT_TABLE_INV] = false, 1559 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1560 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1561 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1562 [VTD_FR_PASID_TABLE_INV] = false, 1563 [VTD_FR_RESERVED_ERR] = false, 1564 [VTD_FR_MAX] = false, 1565 }; 1566 1567 /* To see if a fault condition is "qualified", which is reported to software 1568 * only if the FPD field in the context-entry used to process the faulting 1569 * request is 0. 1570 */ 1571 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1572 { 1573 return vtd_qualified_faults[fault]; 1574 } 1575 1576 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1577 { 1578 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1579 } 1580 1581 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1582 { 1583 VTDBus *vtd_bus; 1584 VTDAddressSpace *vtd_as; 1585 bool success = false; 1586 1587 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1588 if (!vtd_bus) { 1589 goto out; 1590 } 1591 1592 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1593 if (!vtd_as) { 1594 goto out; 1595 } 1596 1597 if (vtd_switch_address_space(vtd_as) == false) { 1598 /* We switched off IOMMU region successfully. */ 1599 success = true; 1600 } 1601 1602 out: 1603 trace_vtd_pt_enable_fast_path(source_id, success); 1604 } 1605 1606 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1607 * translation. 1608 * 1609 * Called from RCU critical section. 1610 * 1611 * @bus_num: The bus number 1612 * @devfn: The devfn, which is the combined of device and function number 1613 * @is_write: The access is a write operation 1614 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1615 * 1616 * Returns true if translation is successful, otherwise false. 
1617 */ 1618 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, 1619 uint8_t devfn, hwaddr addr, bool is_write, 1620 IOMMUTLBEntry *entry) 1621 { 1622 IntelIOMMUState *s = vtd_as->iommu_state; 1623 VTDContextEntry ce; 1624 uint8_t bus_num = pci_bus_num(bus); 1625 VTDContextCacheEntry *cc_entry; 1626 uint64_t slpte, page_mask; 1627 uint32_t level; 1628 uint16_t source_id = vtd_make_source_id(bus_num, devfn); 1629 int ret_fr; 1630 bool is_fpd_set = false; 1631 bool reads = true; 1632 bool writes = true; 1633 uint8_t access_flags; 1634 VTDIOTLBEntry *iotlb_entry; 1635 1636 /* 1637 * We have standalone memory region for interrupt addresses, we 1638 * should never receive translation requests in this region. 1639 */ 1640 assert(!vtd_is_interrupt_addr(addr)); 1641 1642 vtd_iommu_lock(s); 1643 1644 cc_entry = &vtd_as->context_cache_entry; 1645 1646 /* Try to fetch slpte form IOTLB */ 1647 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); 1648 if (iotlb_entry) { 1649 trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, 1650 iotlb_entry->domain_id); 1651 slpte = iotlb_entry->slpte; 1652 access_flags = iotlb_entry->access_flags; 1653 page_mask = iotlb_entry->mask; 1654 goto out; 1655 } 1656 1657 /* Try to fetch context-entry from cache first */ 1658 if (cc_entry->context_cache_gen == s->context_cache_gen) { 1659 trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, 1660 cc_entry->context_entry.lo, 1661 cc_entry->context_cache_gen); 1662 ce = cc_entry->context_entry; 1663 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1664 if (!is_fpd_set && s->root_scalable) { 1665 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1666 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1667 } 1668 } else { 1669 ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); 1670 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1671 if (!ret_fr && !is_fpd_set && s->root_scalable) { 1672 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 
1673 } 1674 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1675 /* Update context-cache */ 1676 trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, 1677 cc_entry->context_cache_gen, 1678 s->context_cache_gen); 1679 cc_entry->context_entry = ce; 1680 cc_entry->context_cache_gen = s->context_cache_gen; 1681 } 1682 1683 /* 1684 * We don't need to translate for pass-through context entries. 1685 * Also, let's ignore IOTLB caching as well for PT devices. 1686 */ 1687 if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { 1688 entry->iova = addr & VTD_PAGE_MASK_4K; 1689 entry->translated_addr = entry->iova; 1690 entry->addr_mask = ~VTD_PAGE_MASK_4K; 1691 entry->perm = IOMMU_RW; 1692 trace_vtd_translate_pt(source_id, entry->iova); 1693 1694 /* 1695 * When this happens, it means firstly caching-mode is not 1696 * enabled, and this is the first passthrough translation for 1697 * the device. Let's enable the fast path for passthrough. 1698 * 1699 * When passthrough is disabled again for the device, we can 1700 * capture it via the context entry invalidation, then the 1701 * IOMMU region can be swapped back. 
1702 */ 1703 vtd_pt_enable_fast_path(s, source_id); 1704 vtd_iommu_unlock(s); 1705 return true; 1706 } 1707 1708 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, 1709 &reads, &writes, s->aw_bits); 1710 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1711 1712 page_mask = vtd_slpt_level_page_mask(level); 1713 access_flags = IOMMU_ACCESS_FLAG(reads, writes); 1714 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, 1715 access_flags, level); 1716 out: 1717 vtd_iommu_unlock(s); 1718 entry->iova = addr & page_mask; 1719 entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; 1720 entry->addr_mask = ~page_mask; 1721 entry->perm = access_flags; 1722 return true; 1723 1724 error: 1725 vtd_iommu_unlock(s); 1726 entry->iova = 0; 1727 entry->translated_addr = 0; 1728 entry->addr_mask = 0; 1729 entry->perm = IOMMU_NONE; 1730 return false; 1731 } 1732 1733 static void vtd_root_table_setup(IntelIOMMUState *s) 1734 { 1735 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 1736 s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); 1737 1738 vtd_update_scalable_state(s); 1739 1740 trace_vtd_reg_dmar_root(s->root, s->root_scalable); 1741 } 1742 1743 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, 1744 uint32_t index, uint32_t mask) 1745 { 1746 x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); 1747 } 1748 1749 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) 1750 { 1751 uint64_t value = 0; 1752 value = vtd_get_quad_raw(s, DMAR_IRTA_REG); 1753 s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); 1754 s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); 1755 s->intr_eime = value & VTD_IRTA_EIME; 1756 1757 /* Notify global invalidation */ 1758 vtd_iec_notify_all(s, true, 0, 0); 1759 1760 trace_vtd_reg_ir_root(s->intr_root, s->intr_size); 1761 } 1762 1763 static void vtd_iommu_replay_all(IntelIOMMUState *s) 1764 { 1765 VTDAddressSpace *vtd_as; 1766 1767 
QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1768 vtd_sync_shadow_page_table(vtd_as); 1769 } 1770 } 1771 1772 static void vtd_context_global_invalidate(IntelIOMMUState *s) 1773 { 1774 trace_vtd_inv_desc_cc_global(); 1775 /* Protects context cache */ 1776 vtd_iommu_lock(s); 1777 s->context_cache_gen++; 1778 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { 1779 vtd_reset_context_cache_locked(s); 1780 } 1781 vtd_iommu_unlock(s); 1782 vtd_address_space_refresh_all(s); 1783 /* 1784 * From VT-d spec 6.5.2.1, a global context entry invalidation 1785 * should be followed by a IOTLB global invalidation, so we should 1786 * be safe even without this. Hoewever, let's replay the region as 1787 * well to be safer, and go back here when we need finer tunes for 1788 * VT-d emulation codes. 1789 */ 1790 vtd_iommu_replay_all(s); 1791 } 1792 1793 /* Do a context-cache device-selective invalidation. 1794 * @func_mask: FM field after shifting 1795 */ 1796 static void vtd_context_device_invalidate(IntelIOMMUState *s, 1797 uint16_t source_id, 1798 uint16_t func_mask) 1799 { 1800 uint16_t mask; 1801 VTDBus *vtd_bus; 1802 VTDAddressSpace *vtd_as; 1803 uint8_t bus_n, devfn; 1804 uint16_t devfn_it; 1805 1806 trace_vtd_inv_desc_cc_devices(source_id, func_mask); 1807 1808 switch (func_mask & 3) { 1809 case 0: 1810 mask = 0; /* No bits in the SID field masked */ 1811 break; 1812 case 1: 1813 mask = 4; /* Mask bit 2 in the SID field */ 1814 break; 1815 case 2: 1816 mask = 6; /* Mask bit 2:1 in the SID field */ 1817 break; 1818 case 3: 1819 mask = 7; /* Mask bit 2:0 in the SID field */ 1820 break; 1821 } 1822 mask = ~mask; 1823 1824 bus_n = VTD_SID_TO_BUS(source_id); 1825 vtd_bus = vtd_find_as_from_bus_num(s, bus_n); 1826 if (vtd_bus) { 1827 devfn = VTD_SID_TO_DEVFN(source_id); 1828 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 1829 vtd_as = vtd_bus->dev_as[devfn_it]; 1830 if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { 1831 
trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), 1832 VTD_PCI_FUNC(devfn_it)); 1833 vtd_iommu_lock(s); 1834 vtd_as->context_cache_entry.context_cache_gen = 0; 1835 vtd_iommu_unlock(s); 1836 /* 1837 * Do switch address space when needed, in case if the 1838 * device passthrough bit is switched. 1839 */ 1840 vtd_switch_address_space(vtd_as); 1841 /* 1842 * So a device is moving out of (or moving into) a 1843 * domain, resync the shadow page table. 1844 * This won't bring bad even if we have no such 1845 * notifier registered - the IOMMU notification 1846 * framework will skip MAP notifications if that 1847 * happened. 1848 */ 1849 vtd_sync_shadow_page_table(vtd_as); 1850 } 1851 } 1852 } 1853 } 1854 1855 /* Context-cache invalidation 1856 * Returns the Context Actual Invalidation Granularity. 1857 * @val: the content of the CCMD_REG 1858 */ 1859 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1860 { 1861 uint64_t caig; 1862 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1863 1864 switch (type) { 1865 case VTD_CCMD_DOMAIN_INVL: 1866 /* Fall through */ 1867 case VTD_CCMD_GLOBAL_INVL: 1868 caig = VTD_CCMD_GLOBAL_INVL_A; 1869 vtd_context_global_invalidate(s); 1870 break; 1871 1872 case VTD_CCMD_DEVICE_INVL: 1873 caig = VTD_CCMD_DEVICE_INVL_A; 1874 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1875 break; 1876 1877 default: 1878 error_report_once("%s: invalid context: 0x%" PRIx64, 1879 __func__, val); 1880 caig = 0; 1881 } 1882 return caig; 1883 } 1884 1885 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1886 { 1887 trace_vtd_inv_desc_iotlb_global(); 1888 vtd_reset_iotlb(s); 1889 vtd_iommu_replay_all(s); 1890 } 1891 1892 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1893 { 1894 VTDContextEntry ce; 1895 VTDAddressSpace *vtd_as; 1896 1897 trace_vtd_inv_desc_iotlb_domain(domain_id); 1898 1899 vtd_iommu_lock(s); 1900 g_hash_table_foreach_remove(s->iotlb, 
vtd_hash_remove_by_domain, 1901 &domain_id); 1902 vtd_iommu_unlock(s); 1903 1904 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1905 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1906 vtd_as->devfn, &ce) && 1907 domain_id == vtd_get_domain_id(s, &ce)) { 1908 vtd_sync_shadow_page_table(vtd_as); 1909 } 1910 } 1911 } 1912 1913 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1914 uint16_t domain_id, hwaddr addr, 1915 uint8_t am) 1916 { 1917 VTDAddressSpace *vtd_as; 1918 VTDContextEntry ce; 1919 int ret; 1920 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1921 1922 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1923 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1924 vtd_as->devfn, &ce); 1925 if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { 1926 if (vtd_as_has_map_notifier(vtd_as)) { 1927 /* 1928 * As long as we have MAP notifications registered in 1929 * any of our IOMMU notifiers, we need to sync the 1930 * shadow page table. 1931 */ 1932 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 1933 } else { 1934 /* 1935 * For UNMAP-only notifiers, we don't need to walk the 1936 * page tables. We just deliver the PSI down to 1937 * invalidate caches. 
1938 */ 1939 IOMMUTLBEntry entry = { 1940 .target_as = &address_space_memory, 1941 .iova = addr, 1942 .translated_addr = 0, 1943 .addr_mask = size - 1, 1944 .perm = IOMMU_NONE, 1945 }; 1946 memory_region_notify_iommu(&vtd_as->iommu, 0, entry); 1947 } 1948 } 1949 } 1950 } 1951 1952 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 1953 hwaddr addr, uint8_t am) 1954 { 1955 VTDIOTLBPageInvInfo info; 1956 1957 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 1958 1959 assert(am <= VTD_MAMV); 1960 info.domain_id = domain_id; 1961 info.addr = addr; 1962 info.mask = ~((1 << am) - 1); 1963 vtd_iommu_lock(s); 1964 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 1965 vtd_iommu_unlock(s); 1966 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 1967 } 1968 1969 /* Flush IOTLB 1970 * Returns the IOTLB Actual Invalidation Granularity. 1971 * @val: the content of the IOTLB_REG 1972 */ 1973 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 1974 { 1975 uint64_t iaig; 1976 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 1977 uint16_t domain_id; 1978 hwaddr addr; 1979 uint8_t am; 1980 1981 switch (type) { 1982 case VTD_TLB_GLOBAL_FLUSH: 1983 iaig = VTD_TLB_GLOBAL_FLUSH_A; 1984 vtd_iotlb_global_invalidate(s); 1985 break; 1986 1987 case VTD_TLB_DSI_FLUSH: 1988 domain_id = VTD_TLB_DID(val); 1989 iaig = VTD_TLB_DSI_FLUSH_A; 1990 vtd_iotlb_domain_invalidate(s, domain_id); 1991 break; 1992 1993 case VTD_TLB_PSI_FLUSH: 1994 domain_id = VTD_TLB_DID(val); 1995 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 1996 am = VTD_IVA_AM(addr); 1997 addr = VTD_IVA_ADDR(addr); 1998 if (am > VTD_MAMV) { 1999 error_report_once("%s: address mask overflow: 0x%" PRIx64, 2000 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 2001 iaig = 0; 2002 break; 2003 } 2004 iaig = VTD_TLB_PSI_FLUSH_A; 2005 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2006 break; 2007 2008 default: 2009 error_report_once("%s: invalid granularity: 0x%" PRIx64, 2010 
__func__, val); 2011 iaig = 0; 2012 } 2013 return iaig; 2014 } 2015 2016 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 2017 2018 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 2019 { 2020 return s->qi_enabled && (s->iq_tail == s->iq_head) && 2021 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 2022 } 2023 2024 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 2025 { 2026 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 2027 2028 trace_vtd_inv_qi_enable(en); 2029 2030 if (en) { 2031 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 2032 /* 2^(x+8) entries */ 2033 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); 2034 s->qi_enabled = true; 2035 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 2036 /* Ok - report back to driver */ 2037 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 2038 2039 if (s->iq_tail != 0) { 2040 /* 2041 * This is a spec violation but Windows guests are known to set up 2042 * Queued Invalidation this way so we allow the write and process 2043 * Invalidation Descriptors right away. 
2044 */ 2045 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 2046 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2047 vtd_fetch_inv_desc(s); 2048 } 2049 } 2050 } else { 2051 if (vtd_queued_inv_disable_check(s)) { 2052 /* disable Queued Invalidation */ 2053 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 2054 s->iq_head = 0; 2055 s->qi_enabled = false; 2056 /* Ok - report back to driver */ 2057 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 2058 } else { 2059 error_report_once("%s: detected improper state when disable QI " 2060 "(head=0x%x, tail=0x%x, last_type=%d)", 2061 __func__, 2062 s->iq_head, s->iq_tail, s->iq_last_desc_type); 2063 } 2064 } 2065 } 2066 2067 /* Set Root Table Pointer */ 2068 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 2069 { 2070 vtd_root_table_setup(s); 2071 /* Ok - report back to driver */ 2072 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 2073 vtd_reset_caches(s); 2074 vtd_address_space_refresh_all(s); 2075 } 2076 2077 /* Set Interrupt Remap Table Pointer */ 2078 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 2079 { 2080 vtd_interrupt_remap_table_setup(s); 2081 /* Ok - report back to driver */ 2082 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 2083 } 2084 2085 /* Handle Translation Enable/Disable */ 2086 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 2087 { 2088 if (s->dmar_enabled == en) { 2089 return; 2090 } 2091 2092 trace_vtd_dmar_enable(en); 2093 2094 if (en) { 2095 s->dmar_enabled = true; 2096 /* Ok - report back to driver */ 2097 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 2098 } else { 2099 s->dmar_enabled = false; 2100 2101 /* Clear the index of Fault Recording Register */ 2102 s->next_frcd_reg = 0; 2103 /* Ok - report back to driver */ 2104 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 2105 } 2106 2107 vtd_reset_caches(s); 2108 vtd_address_space_refresh_all(s); 2109 } 2110 2111 /* Handle Interrupt Remap Enable/Disable */ 2112 
static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 2113 { 2114 trace_vtd_ir_enable(en); 2115 2116 if (en) { 2117 s->intr_enabled = true; 2118 /* Ok - report back to driver */ 2119 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 2120 } else { 2121 s->intr_enabled = false; 2122 /* Ok - report back to driver */ 2123 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 2124 } 2125 } 2126 2127 /* Handle write to Global Command Register */ 2128 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 2129 { 2130 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 2131 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 2132 uint32_t changed = status ^ val; 2133 2134 trace_vtd_reg_write_gcmd(status, val); 2135 if (changed & VTD_GCMD_TE) { 2136 /* Translation enable/disable */ 2137 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 2138 } 2139 if (val & VTD_GCMD_SRTP) { 2140 /* Set/update the root-table pointer */ 2141 vtd_handle_gcmd_srtp(s); 2142 } 2143 if (changed & VTD_GCMD_QIE) { 2144 /* Queued Invalidation Enable */ 2145 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 2146 } 2147 if (val & VTD_GCMD_SIRTP) { 2148 /* Set/update the interrupt remapping root-table pointer */ 2149 vtd_handle_gcmd_sirtp(s); 2150 } 2151 if (changed & VTD_GCMD_IRE) { 2152 /* Interrupt remap enable/disable */ 2153 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 2154 } 2155 } 2156 2157 /* Handle write to Context Command Register */ 2158 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 2159 { 2160 uint64_t ret; 2161 uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG); 2162 2163 /* Context-cache invalidation request */ 2164 if (val & VTD_CCMD_ICC) { 2165 if (s->qi_enabled) { 2166 error_report_once("Queued Invalidation enabled, " 2167 "should not use register-based invalidation"); 2168 return; 2169 } 2170 ret = vtd_context_cache_invalidate(s, val); 2171 /* Invalidation completed. 
Change something to show */ 2172 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 2173 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 2174 ret); 2175 } 2176 } 2177 2178 /* Handle write to IOTLB Invalidation Register */ 2179 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 2180 { 2181 uint64_t ret; 2182 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 2183 2184 /* IOTLB invalidation request */ 2185 if (val & VTD_TLB_IVT) { 2186 if (s->qi_enabled) { 2187 error_report_once("Queued Invalidation enabled, " 2188 "should not use register-based invalidation"); 2189 return; 2190 } 2191 ret = vtd_iotlb_flush(s, val); 2192 /* Invalidation completed. Change something to show */ 2193 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 2194 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 2195 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 2196 } 2197 } 2198 2199 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 2200 static bool vtd_get_inv_desc(IntelIOMMUState *s, 2201 VTDInvDesc *inv_desc) 2202 { 2203 dma_addr_t base_addr = s->iq; 2204 uint32_t offset = s->iq_head; 2205 uint32_t dw = s->iq_dw ? 
32 : 16; 2206 dma_addr_t addr = base_addr + offset * dw; 2207 2208 if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { 2209 error_report_once("Read INV DESC failed."); 2210 return false; 2211 } 2212 inv_desc->lo = le64_to_cpu(inv_desc->lo); 2213 inv_desc->hi = le64_to_cpu(inv_desc->hi); 2214 if (dw == 32) { 2215 inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); 2216 inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); 2217 } 2218 return true; 2219 } 2220 2221 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2222 { 2223 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 2224 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 2225 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2226 " (reserved nonzero)", __func__, inv_desc->hi, 2227 inv_desc->lo); 2228 return false; 2229 } 2230 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 2231 /* Status Write */ 2232 uint32_t status_data = (uint32_t)(inv_desc->lo >> 2233 VTD_INV_DESC_WAIT_DATA_SHIFT); 2234 2235 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 2236 2237 /* FIXME: need to be masked with HAW? 
*/ 2238 dma_addr_t status_addr = inv_desc->hi; 2239 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 2240 status_data = cpu_to_le32(status_data); 2241 if (dma_memory_write(&address_space_memory, status_addr, &status_data, 2242 sizeof(status_data))) { 2243 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 2244 return false; 2245 } 2246 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 2247 /* Interrupt flag */ 2248 vtd_generate_completion_event(s); 2249 } else { 2250 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2251 " (unknown type)", __func__, inv_desc->hi, 2252 inv_desc->lo); 2253 return false; 2254 } 2255 return true; 2256 } 2257 2258 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 2259 VTDInvDesc *inv_desc) 2260 { 2261 uint16_t sid, fmask; 2262 2263 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 2264 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2265 " (reserved nonzero)", __func__, inv_desc->hi, 2266 inv_desc->lo); 2267 return false; 2268 } 2269 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 2270 case VTD_INV_DESC_CC_DOMAIN: 2271 trace_vtd_inv_desc_cc_domain( 2272 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 2273 /* Fall through */ 2274 case VTD_INV_DESC_CC_GLOBAL: 2275 vtd_context_global_invalidate(s); 2276 break; 2277 2278 case VTD_INV_DESC_CC_DEVICE: 2279 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 2280 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 2281 vtd_context_device_invalidate(s, sid, fmask); 2282 break; 2283 2284 default: 2285 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2286 " (invalid type)", __func__, inv_desc->hi, 2287 inv_desc->lo); 2288 return false; 2289 } 2290 return true; 2291 } 2292 2293 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2294 { 2295 uint16_t domain_id; 2296 uint8_t am; 2297 hwaddr addr; 2298 2299 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 2300 (inv_desc->hi & 
VTD_INV_DESC_IOTLB_RSVD_HI)) { 2301 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2302 ", lo=0x%"PRIx64" (reserved bits unzero)\n", 2303 __func__, inv_desc->hi, inv_desc->lo); 2304 return false; 2305 } 2306 2307 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 2308 case VTD_INV_DESC_IOTLB_GLOBAL: 2309 vtd_iotlb_global_invalidate(s); 2310 break; 2311 2312 case VTD_INV_DESC_IOTLB_DOMAIN: 2313 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2314 vtd_iotlb_domain_invalidate(s, domain_id); 2315 break; 2316 2317 case VTD_INV_DESC_IOTLB_PAGE: 2318 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2319 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 2320 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 2321 if (am > VTD_MAMV) { 2322 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2323 ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n", 2324 __func__, inv_desc->hi, inv_desc->lo, 2325 am, (unsigned)VTD_MAMV); 2326 return false; 2327 } 2328 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2329 break; 2330 2331 default: 2332 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2333 ", lo=0x%"PRIx64" (type mismatch: 0x%llx)\n", 2334 __func__, inv_desc->hi, inv_desc->lo, 2335 inv_desc->lo & VTD_INV_DESC_IOTLB_G); 2336 return false; 2337 } 2338 return true; 2339 } 2340 2341 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 2342 VTDInvDesc *inv_desc) 2343 { 2344 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 2345 inv_desc->iec.index, 2346 inv_desc->iec.index_mask); 2347 2348 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 2349 inv_desc->iec.index, 2350 inv_desc->iec.index_mask); 2351 return true; 2352 } 2353 2354 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 2355 VTDInvDesc *inv_desc) 2356 { 2357 VTDAddressSpace *vtd_dev_as; 2358 IOMMUTLBEntry entry; 2359 struct VTDBus *vtd_bus; 2360 hwaddr addr; 2361 uint64_t sz; 2362 uint16_t sid; 2363 uint8_t devfn; 2364 bool size; 2365 uint8_t bus_num; 2366 2367 addr = 
VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2368 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2369 devfn = sid & 0xff; 2370 bus_num = sid >> 8; 2371 size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2372 2373 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2374 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2375 error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 2376 ", lo=%"PRIx64" (reserved nonzero)", __func__, 2377 inv_desc->hi, inv_desc->lo); 2378 return false; 2379 } 2380 2381 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2382 if (!vtd_bus) { 2383 goto done; 2384 } 2385 2386 vtd_dev_as = vtd_bus->dev_as[devfn]; 2387 if (!vtd_dev_as) { 2388 goto done; 2389 } 2390 2391 /* According to ATS spec table 2.4: 2392 * S = 0, bits 15:12 = xxxx range size: 4K 2393 * S = 1, bits 15:12 = xxx0 range size: 8K 2394 * S = 1, bits 15:12 = xx01 range size: 16K 2395 * S = 1, bits 15:12 = x011 range size: 32K 2396 * S = 1, bits 15:12 = 0111 range size: 64K 2397 * ... 2398 */ 2399 if (size) { 2400 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2401 addr &= ~(sz - 1); 2402 } else { 2403 sz = VTD_PAGE_SIZE; 2404 } 2405 2406 entry.target_as = &vtd_dev_as->as; 2407 entry.addr_mask = sz - 1; 2408 entry.iova = addr; 2409 entry.perm = IOMMU_NONE; 2410 entry.translated_addr = 0; 2411 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); 2412 2413 done: 2414 return true; 2415 } 2416 2417 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2418 { 2419 VTDInvDesc inv_desc; 2420 uint8_t desc_type; 2421 2422 trace_vtd_inv_qi_head(s->iq_head); 2423 if (!vtd_get_inv_desc(s, &inv_desc)) { 2424 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2425 return false; 2426 } 2427 2428 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2429 /* FIXME: should update at first or at last? 
*/ 2430 s->iq_last_desc_type = desc_type; 2431 2432 switch (desc_type) { 2433 case VTD_INV_DESC_CC: 2434 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2435 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2436 return false; 2437 } 2438 break; 2439 2440 case VTD_INV_DESC_IOTLB: 2441 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2442 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2443 return false; 2444 } 2445 break; 2446 2447 /* 2448 * TODO: the entity of below two cases will be implemented in future series. 2449 * To make guest (which integrates scalable mode support patch set in 2450 * iommu driver) work, just return true is enough so far. 2451 */ 2452 case VTD_INV_DESC_PC: 2453 break; 2454 2455 case VTD_INV_DESC_PIOTLB: 2456 break; 2457 2458 case VTD_INV_DESC_WAIT: 2459 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2460 if (!vtd_process_wait_desc(s, &inv_desc)) { 2461 return false; 2462 } 2463 break; 2464 2465 case VTD_INV_DESC_IEC: 2466 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2467 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2468 return false; 2469 } 2470 break; 2471 2472 case VTD_INV_DESC_DEVICE: 2473 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2474 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2475 return false; 2476 } 2477 break; 2478 2479 default: 2480 error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 2481 " (unknown type)", __func__, inv_desc.hi, 2482 inv_desc.lo); 2483 return false; 2484 } 2485 s->iq_head++; 2486 if (s->iq_head == s->iq_size) { 2487 s->iq_head = 0; 2488 } 2489 return true; 2490 } 2491 2492 /* Try to fetch and process more Invalidation Descriptors */ 2493 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2494 { 2495 trace_vtd_inv_qi_fetch(); 2496 2497 if (s->iq_tail >= s->iq_size) { 2498 /* Detects an invalid Tail pointer */ 2499 error_report_once("%s: detected invalid QI tail " 2500 "(tail=0x%x, size=0x%x)", 2501 __func__, s->iq_tail, s->iq_size); 2502 
vtd_handle_inv_queue_error(s); 2503 return; 2504 } 2505 while (s->iq_head != s->iq_tail) { 2506 if (!vtd_process_inv_desc(s)) { 2507 /* Invalidation Queue Errors */ 2508 vtd_handle_inv_queue_error(s); 2509 break; 2510 } 2511 /* Must update the IQH_REG in time */ 2512 vtd_set_quad_raw(s, DMAR_IQH_REG, 2513 (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) & 2514 VTD_IQH_QH_MASK); 2515 } 2516 } 2517 2518 /* Handle write to Invalidation Queue Tail Register */ 2519 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2520 { 2521 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2522 2523 if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { 2524 error_report_once("%s: RSV bit is set: val=0x%"PRIx64, 2525 __func__, val); 2526 return; 2527 } 2528 s->iq_tail = VTD_IQT_QT(s->iq_dw, val); 2529 trace_vtd_inv_qi_tail(s->iq_tail); 2530 2531 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2532 /* Process Invalidation Queue here */ 2533 vtd_fetch_inv_desc(s); 2534 } 2535 } 2536 2537 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2538 { 2539 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2540 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2541 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2542 2543 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2544 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2545 trace_vtd_fsts_clear_ip(); 2546 } 2547 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2548 * Descriptors if there are any when Queued Invalidation is enabled? 2549 */ 2550 } 2551 2552 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2553 { 2554 uint32_t fectl_reg; 2555 /* FIXME: when software clears the IM field, check the IP field. But do we 2556 * need to compare the old value and the new value to conclude that 2557 * software clears the IM field? Or just check if the IM field is zero? 
2558 */ 2559 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2560 2561 trace_vtd_reg_write_fectl(fectl_reg); 2562 2563 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2564 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2565 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2566 } 2567 } 2568 2569 static void vtd_handle_ics_write(IntelIOMMUState *s) 2570 { 2571 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2572 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2573 2574 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2575 trace_vtd_reg_ics_clear_ip(); 2576 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2577 } 2578 } 2579 2580 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2581 { 2582 uint32_t iectl_reg; 2583 /* FIXME: when software clears the IM field, check the IP field. But do we 2584 * need to compare the old value and the new value to conclude that 2585 * software clears the IM field? Or just check if the IM field is zero? 
2586 */ 2587 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2588 2589 trace_vtd_reg_write_iectl(iectl_reg); 2590 2591 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2592 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2593 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2594 } 2595 } 2596 2597 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2598 { 2599 IntelIOMMUState *s = opaque; 2600 uint64_t val; 2601 2602 trace_vtd_reg_read(addr, size); 2603 2604 if (addr + size > DMAR_REG_SIZE) { 2605 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2606 " size=0x%u", __func__, addr, size); 2607 return (uint64_t)-1; 2608 } 2609 2610 switch (addr) { 2611 /* Root Table Address Register, 64-bit */ 2612 case DMAR_RTADDR_REG: 2613 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 2614 if (size == 4) { 2615 val = val & ((1ULL << 32) - 1); 2616 } 2617 break; 2618 2619 case DMAR_RTADDR_REG_HI: 2620 assert(size == 4); 2621 val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32; 2622 break; 2623 2624 /* Invalidation Queue Address Register, 64-bit */ 2625 case DMAR_IQA_REG: 2626 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2627 if (size == 4) { 2628 val = val & ((1ULL << 32) - 1); 2629 } 2630 break; 2631 2632 case DMAR_IQA_REG_HI: 2633 assert(size == 4); 2634 val = s->iq >> 32; 2635 break; 2636 2637 default: 2638 if (size == 4) { 2639 val = vtd_get_long(s, addr); 2640 } else { 2641 val = vtd_get_quad(s, addr); 2642 } 2643 } 2644 2645 return val; 2646 } 2647 2648 static void vtd_mem_write(void *opaque, hwaddr addr, 2649 uint64_t val, unsigned size) 2650 { 2651 IntelIOMMUState *s = opaque; 2652 2653 trace_vtd_reg_write(addr, size, val); 2654 2655 if (addr + size > DMAR_REG_SIZE) { 2656 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2657 " size=0x%u", __func__, addr, size); 2658 return; 2659 } 2660 2661 switch (addr) { 2662 /* Global Command Register, 32-bit */ 2663 case DMAR_GCMD_REG: 2664 
vtd_set_long(s, addr, val); 2665 vtd_handle_gcmd_write(s); 2666 break; 2667 2668 /* Context Command Register, 64-bit */ 2669 case DMAR_CCMD_REG: 2670 if (size == 4) { 2671 vtd_set_long(s, addr, val); 2672 } else { 2673 vtd_set_quad(s, addr, val); 2674 vtd_handle_ccmd_write(s); 2675 } 2676 break; 2677 2678 case DMAR_CCMD_REG_HI: 2679 assert(size == 4); 2680 vtd_set_long(s, addr, val); 2681 vtd_handle_ccmd_write(s); 2682 break; 2683 2684 /* IOTLB Invalidation Register, 64-bit */ 2685 case DMAR_IOTLB_REG: 2686 if (size == 4) { 2687 vtd_set_long(s, addr, val); 2688 } else { 2689 vtd_set_quad(s, addr, val); 2690 vtd_handle_iotlb_write(s); 2691 } 2692 break; 2693 2694 case DMAR_IOTLB_REG_HI: 2695 assert(size == 4); 2696 vtd_set_long(s, addr, val); 2697 vtd_handle_iotlb_write(s); 2698 break; 2699 2700 /* Invalidate Address Register, 64-bit */ 2701 case DMAR_IVA_REG: 2702 if (size == 4) { 2703 vtd_set_long(s, addr, val); 2704 } else { 2705 vtd_set_quad(s, addr, val); 2706 } 2707 break; 2708 2709 case DMAR_IVA_REG_HI: 2710 assert(size == 4); 2711 vtd_set_long(s, addr, val); 2712 break; 2713 2714 /* Fault Status Register, 32-bit */ 2715 case DMAR_FSTS_REG: 2716 assert(size == 4); 2717 vtd_set_long(s, addr, val); 2718 vtd_handle_fsts_write(s); 2719 break; 2720 2721 /* Fault Event Control Register, 32-bit */ 2722 case DMAR_FECTL_REG: 2723 assert(size == 4); 2724 vtd_set_long(s, addr, val); 2725 vtd_handle_fectl_write(s); 2726 break; 2727 2728 /* Fault Event Data Register, 32-bit */ 2729 case DMAR_FEDATA_REG: 2730 assert(size == 4); 2731 vtd_set_long(s, addr, val); 2732 break; 2733 2734 /* Fault Event Address Register, 32-bit */ 2735 case DMAR_FEADDR_REG: 2736 if (size == 4) { 2737 vtd_set_long(s, addr, val); 2738 } else { 2739 /* 2740 * While the register is 32-bit only, some guests (Xen...) write to 2741 * it with 64-bit. 
2742 */ 2743 vtd_set_quad(s, addr, val); 2744 } 2745 break; 2746 2747 /* Fault Event Upper Address Register, 32-bit */ 2748 case DMAR_FEUADDR_REG: 2749 assert(size == 4); 2750 vtd_set_long(s, addr, val); 2751 break; 2752 2753 /* Protected Memory Enable Register, 32-bit */ 2754 case DMAR_PMEN_REG: 2755 assert(size == 4); 2756 vtd_set_long(s, addr, val); 2757 break; 2758 2759 /* Root Table Address Register, 64-bit */ 2760 case DMAR_RTADDR_REG: 2761 if (size == 4) { 2762 vtd_set_long(s, addr, val); 2763 } else { 2764 vtd_set_quad(s, addr, val); 2765 } 2766 break; 2767 2768 case DMAR_RTADDR_REG_HI: 2769 assert(size == 4); 2770 vtd_set_long(s, addr, val); 2771 break; 2772 2773 /* Invalidation Queue Tail Register, 64-bit */ 2774 case DMAR_IQT_REG: 2775 if (size == 4) { 2776 vtd_set_long(s, addr, val); 2777 } else { 2778 vtd_set_quad(s, addr, val); 2779 } 2780 vtd_handle_iqt_write(s); 2781 break; 2782 2783 case DMAR_IQT_REG_HI: 2784 assert(size == 4); 2785 vtd_set_long(s, addr, val); 2786 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2787 break; 2788 2789 /* Invalidation Queue Address Register, 64-bit */ 2790 case DMAR_IQA_REG: 2791 if (size == 4) { 2792 vtd_set_long(s, addr, val); 2793 } else { 2794 vtd_set_quad(s, addr, val); 2795 } 2796 if (s->ecap & VTD_ECAP_SMTS && 2797 val & VTD_IQA_DW_MASK) { 2798 s->iq_dw = true; 2799 } else { 2800 s->iq_dw = false; 2801 } 2802 break; 2803 2804 case DMAR_IQA_REG_HI: 2805 assert(size == 4); 2806 vtd_set_long(s, addr, val); 2807 break; 2808 2809 /* Invalidation Completion Status Register, 32-bit */ 2810 case DMAR_ICS_REG: 2811 assert(size == 4); 2812 vtd_set_long(s, addr, val); 2813 vtd_handle_ics_write(s); 2814 break; 2815 2816 /* Invalidation Event Control Register, 32-bit */ 2817 case DMAR_IECTL_REG: 2818 assert(size == 4); 2819 vtd_set_long(s, addr, val); 2820 vtd_handle_iectl_write(s); 2821 break; 2822 2823 /* Invalidation Event Data Register, 32-bit */ 2824 case DMAR_IEDATA_REG: 2825 assert(size == 4); 2826 vtd_set_long(s, 
addr, val); 2827 break; 2828 2829 /* Invalidation Event Address Register, 32-bit */ 2830 case DMAR_IEADDR_REG: 2831 assert(size == 4); 2832 vtd_set_long(s, addr, val); 2833 break; 2834 2835 /* Invalidation Event Upper Address Register, 32-bit */ 2836 case DMAR_IEUADDR_REG: 2837 assert(size == 4); 2838 vtd_set_long(s, addr, val); 2839 break; 2840 2841 /* Fault Recording Registers, 128-bit */ 2842 case DMAR_FRCD_REG_0_0: 2843 if (size == 4) { 2844 vtd_set_long(s, addr, val); 2845 } else { 2846 vtd_set_quad(s, addr, val); 2847 } 2848 break; 2849 2850 case DMAR_FRCD_REG_0_1: 2851 assert(size == 4); 2852 vtd_set_long(s, addr, val); 2853 break; 2854 2855 case DMAR_FRCD_REG_0_2: 2856 if (size == 4) { 2857 vtd_set_long(s, addr, val); 2858 } else { 2859 vtd_set_quad(s, addr, val); 2860 /* May clear bit 127 (Fault), update PPF */ 2861 vtd_update_fsts_ppf(s); 2862 } 2863 break; 2864 2865 case DMAR_FRCD_REG_0_3: 2866 assert(size == 4); 2867 vtd_set_long(s, addr, val); 2868 /* May clear bit 127 (Fault), update PPF */ 2869 vtd_update_fsts_ppf(s); 2870 break; 2871 2872 case DMAR_IRTA_REG: 2873 if (size == 4) { 2874 vtd_set_long(s, addr, val); 2875 } else { 2876 vtd_set_quad(s, addr, val); 2877 } 2878 break; 2879 2880 case DMAR_IRTA_REG_HI: 2881 assert(size == 4); 2882 vtd_set_long(s, addr, val); 2883 break; 2884 2885 default: 2886 if (size == 4) { 2887 vtd_set_long(s, addr, val); 2888 } else { 2889 vtd_set_quad(s, addr, val); 2890 } 2891 } 2892 } 2893 2894 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2895 IOMMUAccessFlags flag, int iommu_idx) 2896 { 2897 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2898 IntelIOMMUState *s = vtd_as->iommu_state; 2899 IOMMUTLBEntry iotlb = { 2900 /* We'll fill in the rest later. 
*/ 2901 .target_as = &address_space_memory, 2902 }; 2903 bool success; 2904 2905 if (likely(s->dmar_enabled)) { 2906 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2907 addr, flag & IOMMU_WO, &iotlb); 2908 } else { 2909 /* DMAR disabled, passthrough, use 4k-page*/ 2910 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2911 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2912 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2913 iotlb.perm = IOMMU_RW; 2914 success = true; 2915 } 2916 2917 if (likely(success)) { 2918 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 2919 VTD_PCI_SLOT(vtd_as->devfn), 2920 VTD_PCI_FUNC(vtd_as->devfn), 2921 iotlb.iova, iotlb.translated_addr, 2922 iotlb.addr_mask); 2923 } else { 2924 error_report_once("%s: detected translation failure " 2925 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 2926 __func__, pci_bus_num(vtd_as->bus), 2927 VTD_PCI_SLOT(vtd_as->devfn), 2928 VTD_PCI_FUNC(vtd_as->devfn), 2929 addr); 2930 } 2931 2932 return iotlb; 2933 } 2934 2935 static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 2936 IOMMUNotifierFlag old, 2937 IOMMUNotifierFlag new, 2938 Error **errp) 2939 { 2940 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2941 IntelIOMMUState *s = vtd_as->iommu_state; 2942 2943 /* Update per-address-space notifier flags */ 2944 vtd_as->notifier_flags = new; 2945 2946 if (old == IOMMU_NOTIFIER_NONE) { 2947 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 2948 } else if (new == IOMMU_NOTIFIER_NONE) { 2949 QLIST_REMOVE(vtd_as, next); 2950 } 2951 return 0; 2952 } 2953 2954 static int vtd_post_load(void *opaque, int version_id) 2955 { 2956 IntelIOMMUState *iommu = opaque; 2957 2958 /* 2959 * Memory regions are dynamically turned on/off depending on 2960 * context entry configurations from the guest. After migration, 2961 * we need to make sure the memory regions are still correct. 
2962 */ 2963 vtd_switch_address_space_all(iommu); 2964 2965 /* 2966 * We don't need to migrate the root_scalable because we can 2967 * simply do the calculation after the loading is complete. We 2968 * can actually do similar things with root, dmar_enabled, etc. 2969 * however since we've had them already so we'd better keep them 2970 * for compatibility of migration. 2971 */ 2972 vtd_update_scalable_state(iommu); 2973 2974 return 0; 2975 } 2976 2977 static const VMStateDescription vtd_vmstate = { 2978 .name = "iommu-intel", 2979 .version_id = 1, 2980 .minimum_version_id = 1, 2981 .priority = MIG_PRI_IOMMU, 2982 .post_load = vtd_post_load, 2983 .fields = (VMStateField[]) { 2984 VMSTATE_UINT64(root, IntelIOMMUState), 2985 VMSTATE_UINT64(intr_root, IntelIOMMUState), 2986 VMSTATE_UINT64(iq, IntelIOMMUState), 2987 VMSTATE_UINT32(intr_size, IntelIOMMUState), 2988 VMSTATE_UINT16(iq_head, IntelIOMMUState), 2989 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 2990 VMSTATE_UINT16(iq_size, IntelIOMMUState), 2991 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 2992 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 2993 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 2994 VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ 2995 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 2996 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 2997 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 2998 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 2999 VMSTATE_END_OF_LIST() 3000 } 3001 }; 3002 3003 static const MemoryRegionOps vtd_mem_ops = { 3004 .read = vtd_mem_read, 3005 .write = vtd_mem_write, 3006 .endianness = DEVICE_LITTLE_ENDIAN, 3007 .impl = { 3008 .min_access_size = 4, 3009 .max_access_size = 8, 3010 }, 3011 .valid = { 3012 .min_access_size = 4, 3013 .max_access_size = 8, 3014 }, 3015 }; 3016 3017 static Property vtd_properties[] = { 3018 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 3019 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 3020 ON_OFF_AUTO_AUTO), 
3021 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 3022 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, 3023 VTD_HOST_ADDRESS_WIDTH), 3024 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 3025 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), 3026 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), 3027 DEFINE_PROP_END_OF_LIST(), 3028 }; 3029 3030 /* Read IRTE entry with specific index */ 3031 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 3032 VTD_IR_TableEntry *entry, uint16_t sid) 3033 { 3034 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 3035 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 3036 dma_addr_t addr = 0x00; 3037 uint16_t mask, source_id; 3038 uint8_t bus, bus_max, bus_min; 3039 3040 addr = iommu->intr_root + index * sizeof(*entry); 3041 if (dma_memory_read(&address_space_memory, addr, entry, 3042 sizeof(*entry))) { 3043 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 3044 __func__, index, addr); 3045 return -VTD_FR_IR_ROOT_INVAL; 3046 } 3047 3048 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 3049 le64_to_cpu(entry->data[0])); 3050 3051 if (!entry->irte.present) { 3052 error_report_once("%s: detected non-present IRTE " 3053 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3054 __func__, index, le64_to_cpu(entry->data[1]), 3055 le64_to_cpu(entry->data[0])); 3056 return -VTD_FR_IR_ENTRY_P; 3057 } 3058 3059 if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || 3060 entry->irte.__reserved_2) { 3061 error_report_once("%s: detected non-zero reserved IRTE " 3062 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3063 __func__, index, le64_to_cpu(entry->data[1]), 3064 le64_to_cpu(entry->data[0])); 3065 return -VTD_FR_IR_IRTE_RSVD; 3066 } 3067 3068 if (sid != X86_IOMMU_SID_INVALID) { 3069 /* Validate IRTE SID */ 3070 source_id = le32_to_cpu(entry->irte.source_id); 3071 switch (entry->irte.sid_vtype) { 3072 case VTD_SVT_NONE: 
3073 break; 3074 3075 case VTD_SVT_ALL: 3076 mask = vtd_svt_mask[entry->irte.sid_q]; 3077 if ((source_id & mask) != (sid & mask)) { 3078 error_report_once("%s: invalid IRTE SID " 3079 "(index=%u, sid=%u, source_id=%u)", 3080 __func__, index, sid, source_id); 3081 return -VTD_FR_IR_SID_ERR; 3082 } 3083 break; 3084 3085 case VTD_SVT_BUS: 3086 bus_max = source_id >> 8; 3087 bus_min = source_id & 0xff; 3088 bus = sid >> 8; 3089 if (bus > bus_max || bus < bus_min) { 3090 error_report_once("%s: invalid SVT_BUS " 3091 "(index=%u, bus=%u, min=%u, max=%u)", 3092 __func__, index, bus, bus_min, bus_max); 3093 return -VTD_FR_IR_SID_ERR; 3094 } 3095 break; 3096 3097 default: 3098 error_report_once("%s: detected invalid IRTE SVT " 3099 "(index=%u, type=%d)", __func__, 3100 index, entry->irte.sid_vtype); 3101 /* Take this as verification failure. */ 3102 return -VTD_FR_IR_SID_ERR; 3103 break; 3104 } 3105 } 3106 3107 return 0; 3108 } 3109 3110 /* Fetch IRQ information of specific IR index */ 3111 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 3112 X86IOMMUIrq *irq, uint16_t sid) 3113 { 3114 VTD_IR_TableEntry irte = {}; 3115 int ret = 0; 3116 3117 ret = vtd_irte_get(iommu, index, &irte, sid); 3118 if (ret) { 3119 return ret; 3120 } 3121 3122 irq->trigger_mode = irte.irte.trigger_mode; 3123 irq->vector = irte.irte.vector; 3124 irq->delivery_mode = irte.irte.delivery_mode; 3125 irq->dest = le32_to_cpu(irte.irte.dest_id); 3126 if (!iommu->intr_eime) { 3127 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 3128 #define VTD_IR_APIC_DEST_SHIFT (8) 3129 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 3130 VTD_IR_APIC_DEST_SHIFT; 3131 } 3132 irq->dest_mode = irte.irte.dest_mode; 3133 irq->redir_hint = irte.irte.redir_hint; 3134 3135 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 3136 irq->delivery_mode, irq->dest, irq->dest_mode); 3137 3138 return 0; 3139 } 3140 3141 /* Interrupt remapping for MSI/MSI-X entry */ 3142 static int 
vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 3143 MSIMessage *origin, 3144 MSIMessage *translated, 3145 uint16_t sid) 3146 { 3147 int ret = 0; 3148 VTD_IR_MSIAddress addr; 3149 uint16_t index; 3150 X86IOMMUIrq irq = {}; 3151 3152 assert(origin && translated); 3153 3154 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 3155 3156 if (!iommu || !iommu->intr_enabled) { 3157 memcpy(translated, origin, sizeof(*origin)); 3158 goto out; 3159 } 3160 3161 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 3162 error_report_once("%s: MSI address high 32 bits non-zero detected: " 3163 "address=0x%" PRIx64, __func__, origin->address); 3164 return -VTD_FR_IR_REQ_RSVD; 3165 } 3166 3167 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 3168 if (addr.addr.__head != 0xfee) { 3169 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 3170 __func__, addr.data); 3171 return -VTD_FR_IR_REQ_RSVD; 3172 } 3173 3174 /* This is compatible mode. */ 3175 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 3176 memcpy(translated, origin, sizeof(*origin)); 3177 goto out; 3178 } 3179 3180 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 3181 3182 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 3183 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 3184 3185 if (addr.addr.sub_valid) { 3186 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 3187 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 3188 } 3189 3190 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 3191 if (ret) { 3192 return ret; 3193 } 3194 3195 if (addr.addr.sub_valid) { 3196 trace_vtd_ir_remap_type("MSI"); 3197 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 3198 error_report_once("%s: invalid IR MSI " 3199 "(sid=%u, address=0x%" PRIx64 3200 ", data=0x%" PRIx32 ")", 3201 __func__, sid, origin->address, origin->data); 3202 return -VTD_FR_IR_REQ_RSVD; 3203 } 3204 } else { 3205 uint8_t vector = origin->data & 0xff; 3206 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 3207 3208 
trace_vtd_ir_remap_type("IOAPIC"); 3209 /* IOAPIC entry vector should be aligned with IRTE vector 3210 * (see vt-d spec 5.1.5.1). */ 3211 if (vector != irq.vector) { 3212 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 3213 } 3214 3215 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 3216 * (see vt-d spec 5.1.5.1). */ 3217 if (trigger_mode != irq.trigger_mode) { 3218 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 3219 irq.trigger_mode); 3220 } 3221 } 3222 3223 /* 3224 * We'd better keep the last two bits, assuming that guest OS 3225 * might modify it. Keep it does not hurt after all. 3226 */ 3227 irq.msi_addr_last_bits = addr.addr.__not_care; 3228 3229 /* Translate X86IOMMUIrq to MSI message */ 3230 x86_iommu_irq_to_msi_message(&irq, translated); 3231 3232 out: 3233 trace_vtd_ir_remap_msi(origin->address, origin->data, 3234 translated->address, translated->data); 3235 return 0; 3236 } 3237 3238 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 3239 MSIMessage *dst, uint16_t sid) 3240 { 3241 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 3242 src, dst, sid); 3243 } 3244 3245 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 3246 uint64_t *data, unsigned size, 3247 MemTxAttrs attrs) 3248 { 3249 return MEMTX_OK; 3250 } 3251 3252 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 3253 uint64_t value, unsigned size, 3254 MemTxAttrs attrs) 3255 { 3256 int ret = 0; 3257 MSIMessage from = {}, to = {}; 3258 uint16_t sid = X86_IOMMU_SID_INVALID; 3259 3260 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 3261 from.data = (uint32_t) value; 3262 3263 if (!attrs.unspecified) { 3264 /* We have explicit Source ID */ 3265 sid = attrs.requester_id; 3266 } 3267 3268 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 3269 if (ret) { 3270 /* TODO: report error */ 3271 /* Drop this interrupt */ 3272 return MEMTX_ERROR; 3273 } 3274 3275 apic_get_class()->send_msi(&to); 3276 3277 return 
MEMTX_OK; 3278 } 3279 3280 static const MemoryRegionOps vtd_mem_ir_ops = { 3281 .read_with_attrs = vtd_mem_ir_read, 3282 .write_with_attrs = vtd_mem_ir_write, 3283 .endianness = DEVICE_LITTLE_ENDIAN, 3284 .impl = { 3285 .min_access_size = 4, 3286 .max_access_size = 4, 3287 }, 3288 .valid = { 3289 .min_access_size = 4, 3290 .max_access_size = 4, 3291 }, 3292 }; 3293 3294 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 3295 { 3296 uintptr_t key = (uintptr_t)bus; 3297 VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); 3298 VTDAddressSpace *vtd_dev_as; 3299 char name[128]; 3300 3301 if (!vtd_bus) { 3302 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 3303 *new_key = (uintptr_t)bus; 3304 /* No corresponding free() */ 3305 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 3306 PCI_DEVFN_MAX); 3307 vtd_bus->bus = bus; 3308 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 3309 } 3310 3311 vtd_dev_as = vtd_bus->dev_as[devfn]; 3312 3313 if (!vtd_dev_as) { 3314 snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), 3315 PCI_FUNC(devfn)); 3316 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 3317 3318 vtd_dev_as->bus = bus; 3319 vtd_dev_as->devfn = (uint8_t)devfn; 3320 vtd_dev_as->iommu_state = s; 3321 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 3322 vtd_dev_as->iova_tree = iova_tree_new(); 3323 3324 memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); 3325 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); 3326 3327 /* 3328 * Build the DMAR-disabled container with aliases to the 3329 * shared MRs. Note that aliasing to a shared memory region 3330 * could help the memory API to detect same FlatViews so we 3331 * can have devices to share the same FlatView when DMAR is 3332 * disabled (either by not providing "intel_iommu=on" or with 3333 * "iommu=pt"). 
It will greatly reduce the total number of 3334 * FlatViews of the system hence VM runs faster. 3335 */ 3336 memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), 3337 "vtd-nodmar", &s->mr_nodmar, 0, 3338 memory_region_size(&s->mr_nodmar)); 3339 3340 /* 3341 * Build the per-device DMAR-enabled container. 3342 * 3343 * TODO: currently we have per-device IOMMU memory region only 3344 * because we have per-device IOMMU notifiers for devices. If 3345 * one day we can abstract the IOMMU notifiers out of the 3346 * memory regions then we can also share the same memory 3347 * region here just like what we've done above with the nodmar 3348 * region. 3349 */ 3350 strcat(name, "-dmar"); 3351 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 3352 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 3353 name, UINT64_MAX); 3354 memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", 3355 &s->mr_ir, 0, memory_region_size(&s->mr_ir)); 3356 memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), 3357 VTD_INTERRUPT_ADDR_FIRST, 3358 &vtd_dev_as->iommu_ir, 1); 3359 3360 /* 3361 * Hook both the containers under the root container, we 3362 * switch between DMAR & noDMAR by enable/disable 3363 * corresponding sub-containers 3364 */ 3365 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3366 MEMORY_REGION(&vtd_dev_as->iommu), 3367 0); 3368 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3369 &vtd_dev_as->nodmar, 0); 3370 3371 vtd_switch_address_space(vtd_dev_as); 3372 } 3373 return vtd_dev_as; 3374 } 3375 3376 static uint64_t get_naturally_aligned_size(uint64_t start, 3377 uint64_t size, int gaw) 3378 { 3379 uint64_t max_mask = 1ULL << gaw; 3380 uint64_t alignment = start ? 
start & -start : max_mask; 3381 3382 alignment = MIN(alignment, max_mask); 3383 size = MIN(size, max_mask); 3384 3385 if (alignment <= size) { 3386 /* Increase the alignment of start */ 3387 return alignment; 3388 } else { 3389 /* Find the largest page mask from size */ 3390 return 1ULL << (63 - clz64(size)); 3391 } 3392 } 3393 3394 /* Unmap the whole range in the notifier's scope. */ 3395 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) 3396 { 3397 hwaddr size, remain; 3398 hwaddr start = n->start; 3399 hwaddr end = n->end; 3400 IntelIOMMUState *s = as->iommu_state; 3401 DMAMap map; 3402 3403 /* 3404 * Note: all the codes in this function has a assumption that IOVA 3405 * bits are no more than VTD_MGAW bits (which is restricted by 3406 * VT-d spec), otherwise we need to consider overflow of 64 bits. 3407 */ 3408 3409 if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) { 3410 /* 3411 * Don't need to unmap regions that is bigger than the whole 3412 * VT-d supported address space size 3413 */ 3414 end = VTD_ADDRESS_SIZE(s->aw_bits) - 1; 3415 } 3416 3417 assert(start <= end); 3418 size = remain = end - start + 1; 3419 3420 while (remain >= VTD_PAGE_SIZE) { 3421 IOMMUTLBEntry entry; 3422 uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits); 3423 3424 assert(mask); 3425 3426 entry.iova = start; 3427 entry.addr_mask = mask - 1; 3428 entry.target_as = &address_space_memory; 3429 entry.perm = IOMMU_NONE; 3430 /* This field is meaningless for unmap */ 3431 entry.translated_addr = 0; 3432 3433 memory_region_notify_one(n, &entry); 3434 3435 start += mask; 3436 remain -= mask; 3437 } 3438 3439 assert(!remain); 3440 3441 trace_vtd_as_unmap_whole(pci_bus_num(as->bus), 3442 VTD_PCI_SLOT(as->devfn), 3443 VTD_PCI_FUNC(as->devfn), 3444 n->start, size); 3445 3446 map.iova = n->start; 3447 map.size = size; 3448 iova_tree_remove(as->iova_tree, &map); 3449 } 3450 3451 static void vtd_address_space_unmap_all(IntelIOMMUState *s) 3452 { 3453 
VTDAddressSpace *vtd_as; 3454 IOMMUNotifier *n; 3455 3456 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 3457 IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { 3458 vtd_address_space_unmap(vtd_as, n); 3459 } 3460 } 3461 } 3462 3463 static void vtd_address_space_refresh_all(IntelIOMMUState *s) 3464 { 3465 vtd_address_space_unmap_all(s); 3466 vtd_switch_address_space_all(s); 3467 } 3468 3469 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) 3470 { 3471 memory_region_notify_one((IOMMUNotifier *)private, entry); 3472 return 0; 3473 } 3474 3475 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) 3476 { 3477 VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu); 3478 IntelIOMMUState *s = vtd_as->iommu_state; 3479 uint8_t bus_n = pci_bus_num(vtd_as->bus); 3480 VTDContextEntry ce; 3481 3482 /* 3483 * The replay can be triggered by either a invalidation or a newly 3484 * created entry. No matter what, we release existing mappings 3485 * (it means flushing caches for UNMAP-only registers). 3486 */ 3487 vtd_address_space_unmap(vtd_as, n); 3488 3489 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { 3490 trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : 3491 "legacy mode", 3492 bus_n, PCI_SLOT(vtd_as->devfn), 3493 PCI_FUNC(vtd_as->devfn), 3494 vtd_get_domain_id(s, &ce), 3495 ce.hi, ce.lo); 3496 if (vtd_as_has_map_notifier(vtd_as)) { 3497 /* This is required only for MAP typed notifiers */ 3498 vtd_page_walk_info info = { 3499 .hook_fn = vtd_replay_hook, 3500 .private = (void *)n, 3501 .notify_unmap = false, 3502 .aw = s->aw_bits, 3503 .as = vtd_as, 3504 .domain_id = vtd_get_domain_id(s, &ce), 3505 }; 3506 3507 vtd_page_walk(s, &ce, 0, ~0ULL, &info); 3508 } 3509 } else { 3510 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), 3511 PCI_FUNC(vtd_as->devfn)); 3512 } 3513 3514 return; 3515 } 3516 3517 /* Do the initialization. 
It is also called on reset, so be careful when adding
 * new initialization here.
 *
 * Clears the register file and its mask arrays, resets the software
 * state, derives CAP/ECAP from the device configuration and finally
 * declares every register with its default value and write semantics.
 */
static void vtd_init(IntelIOMMUState *s)
{
    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
    uint64_t cap_bits;
    uint64_t ecap_bits;

    /* Register values plus the writable / write-1-to-clear / write-only masks */
    memset(s->csr, 0, DMAR_REG_SIZE);
    memset(s->wmask, 0, DMAR_REG_SIZE);
    memset(s->w1cmask, 0, DMAR_REG_SIZE);
    memset(s->womask, 0, DMAR_REG_SIZE);

    /* Translation and interrupt remapping start out disabled */
    s->root = 0;
    s->root_scalable = false;
    s->dmar_enabled = false;
    s->intr_enabled = false;

    /* Invalidation queue is empty and switched off */
    s->iq_head = 0;
    s->iq_tail = 0;
    s->iq = 0;
    s->iq_size = 0;
    s->qi_enabled = false;
    s->iq_last_desc_type = VTD_INV_DESC_NONE;
    s->iq_dw = false;

    s->next_frcd_reg = 0;

    /* Capability register: fixed features first, then the optional ones */
    cap_bits = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
               VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
               VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
    if (s->dma_drain) {
        cap_bits |= VTD_CAP_DRAIN;
    }
    if (s->aw_bits == VTD_HOST_AW_48BIT) {
        cap_bits |= VTD_CAP_SAGAW_48bit;
    }

    ecap_bits = VTD_ECAP_QI | VTD_ECAP_IRO;

    /*
     * Reserved-bit masks of second-level page table entries, indexed by
     * level; index 0 is unused and rejects everything.
     */
    vtd_spte_rsvd[0] = ~0ULL;
    vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
                                                  x86_iommu->dt_supported);
    vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
    vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
    vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);

    /* The same for large-page entries, which only exist at levels 2 and 3 */
    vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
                                                    x86_iommu->dt_supported);
    vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
                                                    x86_iommu->dt_supported);

    if (x86_iommu_ir_supported(x86_iommu)) {
        ecap_bits |= VTD_ECAP_IR | VTD_ECAP_MHMV;
        if (s->intr_eim == ON_OFF_AUTO_ON) {
            ecap_bits |= VTD_ECAP_EIM;
        }
        /* "auto" must have been resolved to on/off before we get here */
        assert(s->intr_eim != ON_OFF_AUTO_AUTO);
    }

    if (x86_iommu->dt_supported) {
        ecap_bits |= VTD_ECAP_DT;
    }

    if (x86_iommu->pt_supported) {
        ecap_bits |= VTD_ECAP_PT;
    }

    if (s->caching_mode) {
        cap_bits |= VTD_CAP_CM;
    }

    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
    if (s->scalable_mode) {
        ecap_bits |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
    }

    s->cap = cap_bits;
    s->ecap = ecap_bits;

    vtd_reset_caches(s);

    /*
     * Define registers with default values and bit semantics.
     * Arguments are: register, reset value, writable mask, W1C mask.
     */
    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);

    /* Advanced Fault Logging not supported */
    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);

    /*
     * Treated as RsvdZ when EIM in ECAP_REG is not supported; the
     * writable variant would be
     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
     */
    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);

    /*
     * Treated as RO for implementations that report the PLMR and PHMR
     * fields as Clear in the CAP_REG; the writable variant would be
     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
     */
    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);

    /* Invalidation queue and invalidation event registers */
    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);

    /* IOTLB registers */
    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0);
    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);

    /* Fault Recording Registers, 128-bit */
    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);

    /*
     * Interrupt remapping registers.
     */
    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
}

/* Should not reset address_spaces when reset because devices will still use
 * the address space they got at first (won't ask the bus again).
3649 */ 3650 static void vtd_reset(DeviceState *dev) 3651 { 3652 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3653 3654 vtd_init(s); 3655 vtd_address_space_refresh_all(s); 3656 } 3657 3658 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3659 { 3660 IntelIOMMUState *s = opaque; 3661 VTDAddressSpace *vtd_as; 3662 3663 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3664 3665 vtd_as = vtd_find_add_as(s, bus, devfn); 3666 return &vtd_as->as; 3667 } 3668 3669 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3670 { 3671 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3672 3673 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { 3674 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3675 return false; 3676 } 3677 3678 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3679 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3680 && x86_iommu_ir_supported(x86_iommu) ? 3681 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3682 } 3683 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3684 if (!kvm_irqchip_in_kernel()) { 3685 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3686 return false; 3687 } 3688 if (!kvm_enable_x2apic()) { 3689 error_setg(errp, "eim=on requires support on the KVM side" 3690 "(X2APIC_API, first shipped in v4.7)"); 3691 return false; 3692 } 3693 } 3694 3695 /* Currently only address widths supported are 39 and 48 bits */ 3696 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3697 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3698 error_setg(errp, "Supported values for x-aw-bits are: %d, %d", 3699 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3700 return false; 3701 } 3702 3703 if (s->scalable_mode && !s->dma_drain) { 3704 error_setg(errp, "Need to set dma_drain for scalable mode"); 3705 return false; 3706 } 3707 3708 return true; 3709 } 3710 3711 static int vtd_machine_done_notify_one(Object *child, void *unused) 3712 { 3713 IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); 
3714 3715 /* 3716 * We hard-coded here because vfio-pci is the only special case 3717 * here. Let's be more elegant in the future when we can, but so 3718 * far there seems to be no better way. 3719 */ 3720 if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { 3721 vtd_panic_require_caching_mode(); 3722 } 3723 3724 return 0; 3725 } 3726 3727 static void vtd_machine_done_hook(Notifier *notifier, void *unused) 3728 { 3729 object_child_foreach_recursive(object_get_root(), 3730 vtd_machine_done_notify_one, NULL); 3731 } 3732 3733 static Notifier vtd_machine_done_notify = { 3734 .notify = vtd_machine_done_hook, 3735 }; 3736 3737 static void vtd_realize(DeviceState *dev, Error **errp) 3738 { 3739 MachineState *ms = MACHINE(qdev_get_machine()); 3740 PCMachineState *pcms = PC_MACHINE(ms); 3741 X86MachineState *x86ms = X86_MACHINE(ms); 3742 PCIBus *bus = pcms->bus; 3743 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3744 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); 3745 3746 x86_iommu->type = TYPE_INTEL; 3747 3748 if (!vtd_decide_config(s, errp)) { 3749 return; 3750 } 3751 3752 QLIST_INIT(&s->vtd_as_with_notifiers); 3753 qemu_mutex_init(&s->iommu_lock); 3754 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3755 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3756 "intel_iommu", DMAR_REG_SIZE); 3757 3758 /* Create the shared memory regions by all devices */ 3759 memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", 3760 UINT64_MAX); 3761 memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, 3762 s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); 3763 memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), 3764 "vtd-sys-alias", get_system_memory(), 0, 3765 memory_region_size(get_system_memory())); 3766 memory_region_add_subregion_overlap(&s->mr_nodmar, 0, 3767 &s->mr_sys_alias, 0); 3768 memory_region_add_subregion_overlap(&s->mr_nodmar, 3769 VTD_INTERRUPT_ADDR_FIRST, 3770 &s->mr_ir, 1); 3771 3772 
sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3773 /* No corresponding destroy */ 3774 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3775 g_free, g_free); 3776 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3777 g_free, g_free); 3778 vtd_init(s); 3779 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3780 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3781 /* Pseudo address space under root PCI bus. */ 3782 x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3783 qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); 3784 } 3785 3786 static void vtd_class_init(ObjectClass *klass, void *data) 3787 { 3788 DeviceClass *dc = DEVICE_CLASS(klass); 3789 X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass); 3790 3791 dc->reset = vtd_reset; 3792 dc->vmsd = &vtd_vmstate; 3793 dc->props = vtd_properties; 3794 dc->hotpluggable = false; 3795 x86_class->realize = vtd_realize; 3796 x86_class->int_remap = vtd_int_remap; 3797 /* Supported by the pc-q35-* machine types */ 3798 dc->user_creatable = true; 3799 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3800 dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; 3801 } 3802 3803 static const TypeInfo vtd_info = { 3804 .name = TYPE_INTEL_IOMMU_DEVICE, 3805 .parent = TYPE_X86_IOMMU_DEVICE, 3806 .instance_size = sizeof(IntelIOMMUState), 3807 .class_init = vtd_class_init, 3808 }; 3809 3810 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3811 void *data) 3812 { 3813 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3814 3815 imrc->translate = vtd_iommu_translate; 3816 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3817 imrc->replay = vtd_iommu_replay; 3818 } 3819 3820 static const TypeInfo vtd_iommu_memory_region_info = { 3821 .parent = TYPE_IOMMU_MEMORY_REGION, 3822 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3823 .class_init = vtd_iommu_memory_region_class_init, 3824 }; 3825 3826 static void 
vtd_register_types(void) 3827 { 3828 type_register_static(&vtd_info); 3829 type_register_static(&vtd_iommu_memory_region_info); 3830 } 3831 3832 type_init(vtd_register_types) 3833