1 /* 2 * QEMU emulation of an Intel IOMMU (VT-d) 3 * (DMA Remapping device) 4 * 5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com> 6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, see <http://www.gnu.org/licenses/>. 20 */ 21 22 #include "qemu/osdep.h" 23 #include "qemu/error-report.h" 24 #include "qemu/main-loop.h" 25 #include "qapi/error.h" 26 #include "hw/sysbus.h" 27 #include "exec/address-spaces.h" 28 #include "intel_iommu_internal.h" 29 #include "hw/pci/pci.h" 30 #include "hw/pci/pci_bus.h" 31 #include "hw/i386/pc.h" 32 #include "hw/i386/apic-msidef.h" 33 #include "hw/boards.h" 34 #include "hw/i386/x86-iommu.h" 35 #include "hw/pci-host/q35.h" 36 #include "sysemu/kvm.h" 37 #include "hw/i386/apic_internal.h" 38 #include "kvm_i386.h" 39 #include "migration/vmstate.h" 40 #include "trace.h" 41 42 /* context entry operations */ 43 #define VTD_CE_GET_RID2PASID(ce) \ 44 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) 45 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ 46 ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) 47 48 /* pe operations */ 49 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) 50 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) 51 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ 52 if (ret_fr) { \ 53 ret_fr = -ret_fr; \ 54 if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ 55 trace_vtd_fault_disabled(); \ 56 } else { \ 57 vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ 58 } \ 59 goto error; \ 60 } \ 61 } 62 63 static void vtd_address_space_refresh_all(IntelIOMMUState *s); 64 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); 65 66 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, 67 uint64_t wmask, uint64_t w1cmask) 68 { 69 stq_le_p(&s->csr[addr], val); 70 stq_le_p(&s->wmask[addr], wmask); 71 stq_le_p(&s->w1cmask[addr], w1cmask); 72 } 73 74 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask) 75 { 76 stq_le_p(&s->womask[addr], mask); 77 } 78 79 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val, 80 uint32_t wmask, uint32_t w1cmask) 81 { 82 stl_le_p(&s->csr[addr], val); 83 stl_le_p(&s->wmask[addr], wmask); 84 stl_le_p(&s->w1cmask[addr], w1cmask); 85 } 86 87 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask) 88 { 89 stl_le_p(&s->womask[addr], mask); 90 } 91 92 /* "External" get/set operations */ 93 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val) 94 { 95 uint64_t oldval = ldq_le_p(&s->csr[addr]); 96 uint64_t wmask = ldq_le_p(&s->wmask[addr]); 97 uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); 98 stq_le_p(&s->csr[addr], 99 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 100 } 101 102 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val) 103 { 104 uint32_t oldval = 
ldl_le_p(&s->csr[addr]); 105 uint32_t wmask = ldl_le_p(&s->wmask[addr]); 106 uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); 107 stl_le_p(&s->csr[addr], 108 ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val)); 109 } 110 111 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr) 112 { 113 uint64_t val = ldq_le_p(&s->csr[addr]); 114 uint64_t womask = ldq_le_p(&s->womask[addr]); 115 return val & ~womask; 116 } 117 118 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr) 119 { 120 uint32_t val = ldl_le_p(&s->csr[addr]); 121 uint32_t womask = ldl_le_p(&s->womask[addr]); 122 return val & ~womask; 123 } 124 125 /* "Internal" get/set operations */ 126 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr) 127 { 128 return ldq_le_p(&s->csr[addr]); 129 } 130 131 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr) 132 { 133 return ldl_le_p(&s->csr[addr]); 134 } 135 136 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val) 137 { 138 stq_le_p(&s->csr[addr], val); 139 } 140 141 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, 142 uint32_t clear, uint32_t mask) 143 { 144 uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask; 145 stl_le_p(&s->csr[addr], new_val); 146 return new_val; 147 } 148 149 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, 150 uint64_t clear, uint64_t mask) 151 { 152 uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask; 153 stq_le_p(&s->csr[addr], new_val); 154 return new_val; 155 } 156 157 static inline void vtd_iommu_lock(IntelIOMMUState *s) 158 { 159 qemu_mutex_lock(&s->iommu_lock); 160 } 161 162 static inline void vtd_iommu_unlock(IntelIOMMUState *s) 163 { 164 qemu_mutex_unlock(&s->iommu_lock); 165 } 166 167 static void vtd_update_scalable_state(IntelIOMMUState *s) 168 { 169 uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 170 171 if (s->scalable_mode) { 172 s->root_scalable = val & VTD_RTADDR_SMT; 173 } 174 } 175 176 /* Whether the address space needs to notify new mappings */ 177 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) 178 { 179 return as->notifier_flags & IOMMU_NOTIFIER_MAP; 180 } 181 182 /* GHashTable functions */ 183 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) 184 { 185 return *((const uint64_t *)v1) == *((const uint64_t *)v2); 186 } 187 188 static guint vtd_uint64_hash(gconstpointer v) 189 { 190 return (guint)*(const uint64_t *)v; 191 } 192 193 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, 194 gpointer user_data) 195 { 196 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 197 uint16_t domain_id = *(uint16_t *)user_data; 198 return entry->domain_id == domain_id; 199 } 200 201 /* The shift of an addr for a certain level of paging structure */ 202 static inline uint32_t vtd_slpt_level_shift(uint32_t level) 203 { 204 assert(level != 0); 205 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; 206 } 207 208 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) 209 { 210 return ~((1ULL << vtd_slpt_level_shift(level)) - 1); 211 } 212 213 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, 214 gpointer user_data) 215 { 216 VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; 217 VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; 218 uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; 219 uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; 220 return (entry->domain_id == info->domain_id) && 221 (((entry->gfn & 
info->mask) == gfn) || 222 (entry->gfn == gfn_tlb)); 223 } 224 225 /* Reset all the gen of VTDAddressSpace to zero and set the gen of 226 * IntelIOMMUState to 1. Must be called with IOMMU lock held. 227 */ 228 static void vtd_reset_context_cache_locked(IntelIOMMUState *s) 229 { 230 VTDAddressSpace *vtd_as; 231 VTDBus *vtd_bus; 232 GHashTableIter bus_it; 233 uint32_t devfn_it; 234 235 trace_vtd_context_cache_reset(); 236 237 g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); 238 239 while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { 240 for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { 241 vtd_as = vtd_bus->dev_as[devfn_it]; 242 if (!vtd_as) { 243 continue; 244 } 245 vtd_as->context_cache_entry.context_cache_gen = 0; 246 } 247 } 248 s->context_cache_gen = 1; 249 } 250 251 /* Must be called with IOMMU lock held. */ 252 static void vtd_reset_iotlb_locked(IntelIOMMUState *s) 253 { 254 assert(s->iotlb); 255 g_hash_table_remove_all(s->iotlb); 256 } 257 258 static void vtd_reset_iotlb(IntelIOMMUState *s) 259 { 260 vtd_iommu_lock(s); 261 vtd_reset_iotlb_locked(s); 262 vtd_iommu_unlock(s); 263 } 264 265 static void vtd_reset_caches(IntelIOMMUState *s) 266 { 267 vtd_iommu_lock(s); 268 vtd_reset_iotlb_locked(s); 269 vtd_reset_context_cache_locked(s); 270 vtd_iommu_unlock(s); 271 } 272 273 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, 274 uint32_t level) 275 { 276 return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | 277 ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); 278 } 279 280 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) 281 { 282 return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; 283 } 284 285 /* Must be called with IOMMU lock held */ 286 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, 287 hwaddr addr) 288 { 289 VTDIOTLBEntry *entry; 290 uint64_t key; 291 int level; 292 293 for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { 294 key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), 295 source_id, level); 296 entry = g_hash_table_lookup(s->iotlb, &key); 297 if (entry) { 298 goto out; 299 } 300 } 301 302 out: 303 return entry; 304 } 305 306 /* Must be with IOMMU lock held */ 307 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, 308 uint16_t domain_id, hwaddr addr, uint64_t slpte, 309 uint8_t access_flags, uint32_t level) 310 { 311 VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); 312 uint64_t *key = g_malloc(sizeof(*key)); 313 uint64_t gfn = vtd_get_iotlb_gfn(addr, level); 314 315 trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); 316 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { 317 trace_vtd_iotlb_reset("iotlb exceeds size limit"); 318 vtd_reset_iotlb_locked(s); 319 } 320 321 entry->gfn = gfn; 322 entry->domain_id = domain_id; 323 entry->slpte = slpte; 324 entry->access_flags = access_flags; 325 entry->mask = vtd_slpt_level_page_mask(level); 326 *key = vtd_get_iotlb_key(gfn, source_id, level); 327 g_hash_table_replace(s->iotlb, key, entry); 328 } 329 330 /* Given the reg addr of both the message data and address, generate an 331 * interrupt via MSI. 
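 *
 * For example, the fault event path composes the MSI from the Fault Event
 * Address/Data registers, and the invalidation completion path does the
 * same with the IE registers (both call sites appear further down):
 *
 *     vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
 *     vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);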
 */
static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
                                   hwaddr mesg_data_reg)
{
    MSIMessage msi;

    assert(mesg_data_reg < DMAR_REG_SIZE);
    assert(mesg_addr_reg < DMAR_REG_SIZE);

    msi.address = vtd_get_long_raw(s, mesg_addr_reg);
    msi.data = vtd_get_long_raw(s, mesg_data_reg);

    trace_vtd_irq_generate(msi.address, msi.data);

    apic_get_class()->send_msi(&msi);
}

/* Generate a fault event to software via MSI if conditions are met.
 * Notice that the value of FSTS_REG being passed to it should be the one
 * before any update.
 */
static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
{
    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
        pre_fsts & VTD_FSTS_IQE) {
        error_report_once("There are previous interrupt conditions "
                          "to be serviced by software, fault event "
                          "is not generated");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
        error_report_once("Interrupt Mask set, irq is not generated");
    } else {
        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
    }
}

/* Check if the Fault (F) field of the Fault Recording Register referenced by
 * @index is Set.
 */
static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
}

/* Update the PPF field of Fault Status Register.
 * Should be called whenever the F field of any fault recording
 * register is changed.
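 *
 * PPF is maintained as the logical OR of the F bits of all fault
 * recording registers; it is recomputed from scratch below rather than
 * tracked incrementally.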
388 */ 389 static void vtd_update_fsts_ppf(IntelIOMMUState *s) 390 { 391 uint32_t i; 392 uint32_t ppf_mask = 0; 393 394 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 395 if (vtd_is_frcd_set(s, i)) { 396 ppf_mask = VTD_FSTS_PPF; 397 break; 398 } 399 } 400 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); 401 trace_vtd_fsts_ppf(!!ppf_mask); 402 } 403 404 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) 405 { 406 /* Each reg is 128-bit */ 407 hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 408 addr += 8; /* Access the high 64-bit half */ 409 410 assert(index < DMAR_FRCD_REG_NR); 411 412 vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F); 413 vtd_update_fsts_ppf(s); 414 } 415 416 /* Must not update F field now, should be done later */ 417 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, 418 uint16_t source_id, hwaddr addr, 419 VTDFaultReason fault, bool is_write) 420 { 421 uint64_t hi = 0, lo; 422 hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); 423 424 assert(index < DMAR_FRCD_REG_NR); 425 426 lo = VTD_FRCD_FI(addr); 427 hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); 428 if (!is_write) { 429 hi |= VTD_FRCD_T; 430 } 431 vtd_set_quad_raw(s, frcd_reg_addr, lo); 432 vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); 433 434 trace_vtd_frr_new(index, hi, lo); 435 } 436 437 /* Try to collapse multiple pending faults from the same requester */ 438 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) 439 { 440 uint32_t i; 441 uint64_t frcd_reg; 442 hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */ 443 444 for (i = 0; i < DMAR_FRCD_REG_NR; i++) { 445 frcd_reg = vtd_get_quad_raw(s, addr); 446 if ((frcd_reg & VTD_FRCD_F) && 447 ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { 448 return true; 449 } 450 addr += 16; /* 128-bit for each */ 451 } 452 return false; 453 } 454 455 /* Log and report an DMAR (address translation) fault to software */ 456 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, 457 hwaddr addr, VTDFaultReason fault, 458 bool is_write) 459 { 460 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 461 462 assert(fault < VTD_FR_MAX); 463 464 if (fault == VTD_FR_RESERVED_ERR) { 465 /* This is not a normal fault reason case. Drop it. 
*/ 466 return; 467 } 468 469 trace_vtd_dmar_fault(source_id, fault, addr, is_write); 470 471 if (fsts_reg & VTD_FSTS_PFO) { 472 error_report_once("New fault is not recorded due to " 473 "Primary Fault Overflow"); 474 return; 475 } 476 477 if (vtd_try_collapse_fault(s, source_id)) { 478 error_report_once("New fault is not recorded due to " 479 "compression of faults"); 480 return; 481 } 482 483 if (vtd_is_frcd_set(s, s->next_frcd_reg)) { 484 error_report_once("Next Fault Recording Reg is used, " 485 "new fault is not recorded, set PFO field"); 486 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); 487 return; 488 } 489 490 vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); 491 492 if (fsts_reg & VTD_FSTS_PPF) { 493 error_report_once("There are pending faults already, " 494 "fault event is not generated"); 495 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); 496 s->next_frcd_reg++; 497 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 498 s->next_frcd_reg = 0; 499 } 500 } else { 501 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK, 502 VTD_FSTS_FRI(s->next_frcd_reg)); 503 vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */ 504 s->next_frcd_reg++; 505 if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { 506 s->next_frcd_reg = 0; 507 } 508 /* This case actually cause the PPF to be Set. 509 * So generate fault event (interrupt). 510 */ 511 vtd_generate_fault_event(s, fsts_reg); 512 } 513 } 514 515 /* Handle Invalidation Queue Errors of queued invalidation interface error 516 * conditions. 517 */ 518 static void vtd_handle_inv_queue_error(IntelIOMMUState *s) 519 { 520 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 521 522 vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE); 523 vtd_generate_fault_event(s, fsts_reg); 524 } 525 526 /* Set the IWC field and try to generate an invalidation completion interrupt */ 527 static void vtd_generate_completion_event(IntelIOMMUState *s) 528 { 529 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) { 530 trace_vtd_inv_desc_wait_irq("One pending, skip current"); 531 return; 532 } 533 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC); 534 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP); 535 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) { 536 trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, " 537 "new event not generated"); 538 return; 539 } else { 540 /* Generate the interrupt event */ 541 trace_vtd_inv_desc_wait_irq("Generating complete event"); 542 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 543 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 544 } 545 } 546 547 static inline bool vtd_root_entry_present(IntelIOMMUState *s, 548 VTDRootEntry *re, 549 uint8_t devfn) 550 { 551 if (s->root_scalable && devfn > UINT8_MAX / 2) { 552 return re->hi & VTD_ROOT_ENTRY_P; 553 } 554 555 return re->lo & VTD_ROOT_ENTRY_P; 556 } 557 558 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, 559 VTDRootEntry *re) 560 { 561 dma_addr_t addr; 562 563 addr = s->root + index * sizeof(*re); 564 if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { 565 re->lo = 0; 566 return -VTD_FR_ROOT_TABLE_INV; 567 } 568 re->lo = le64_to_cpu(re->lo); 569 re->hi = le64_to_cpu(re->hi); 570 return 0; 571 } 572 573 static inline bool vtd_ce_present(VTDContextEntry *context) 574 { 575 return context->lo & VTD_CONTEXT_ENTRY_P; 576 } 577 578 static int vtd_get_context_entry_from_root(IntelIOMMUState *s, 579 VTDRootEntry *re, 580 uint8_t index, 581 
VTDContextEntry *ce) 582 { 583 dma_addr_t addr, ce_size; 584 585 /* we have checked that root entry is present */ 586 ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE : 587 VTD_CTX_ENTRY_LEGACY_SIZE; 588 589 if (s->root_scalable && index > UINT8_MAX / 2) { 590 index = index & (~VTD_DEVFN_CHECK_MASK); 591 addr = re->hi & VTD_ROOT_ENTRY_CTP; 592 } else { 593 addr = re->lo & VTD_ROOT_ENTRY_CTP; 594 } 595 596 addr = addr + index * ce_size; 597 if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { 598 return -VTD_FR_CONTEXT_TABLE_INV; 599 } 600 601 ce->lo = le64_to_cpu(ce->lo); 602 ce->hi = le64_to_cpu(ce->hi); 603 if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) { 604 ce->val[2] = le64_to_cpu(ce->val[2]); 605 ce->val[3] = le64_to_cpu(ce->val[3]); 606 } 607 return 0; 608 } 609 610 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) 611 { 612 return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; 613 } 614 615 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) 616 { 617 return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); 618 } 619 620 /* Whether the pte indicates the address of the page frame */ 621 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) 622 { 623 return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); 624 } 625 626 /* Get the content of a spte located in @base_addr[@index] */ 627 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) 628 { 629 uint64_t slpte; 630 631 assert(index < VTD_SL_PT_ENTRY_NR); 632 633 if (dma_memory_read(&address_space_memory, 634 base_addr + index * sizeof(slpte), &slpte, 635 sizeof(slpte))) { 636 slpte = (uint64_t)-1; 637 return slpte; 638 } 639 slpte = le64_to_cpu(slpte); 640 return slpte; 641 } 642 643 /* Given an iova and the level of paging structure, return the offset 644 * of current level. 
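 *
 * With the usual 4KiB/9-bit-per-level layout (VTD_PAGE_SHIFT_4K == 12,
 * VTD_SL_LEVEL_BITS == 9) the shift is 12/21/30/39 for level 1/2/3/4.
 * A small worked example for iova == 0x40a03000:
 *
 *     level 3: (iova >> 30) & 0x1ff == 1
 *     level 2: (iova >> 21) & 0x1ff == 5
 *     level 1: (iova >> 12) & 0x1ff == 3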
645 */ 646 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) 647 { 648 return (iova >> vtd_slpt_level_shift(level)) & 649 ((1ULL << VTD_SL_LEVEL_BITS) - 1); 650 } 651 652 /* Check Capability Register to see if the @level of page-table is supported */ 653 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) 654 { 655 return VTD_CAP_SAGAW_MASK & s->cap & 656 (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); 657 } 658 659 /* Return true if check passed, otherwise false */ 660 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, 661 VTDPASIDEntry *pe) 662 { 663 switch (VTD_PE_GET_TYPE(pe)) { 664 case VTD_SM_PASID_ENTRY_FLT: 665 case VTD_SM_PASID_ENTRY_SLT: 666 case VTD_SM_PASID_ENTRY_NESTED: 667 break; 668 case VTD_SM_PASID_ENTRY_PT: 669 if (!x86_iommu->pt_supported) { 670 return false; 671 } 672 break; 673 default: 674 /* Unknwon type */ 675 return false; 676 } 677 return true; 678 } 679 680 static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base, 681 uint32_t pasid, 682 VTDPASIDDirEntry *pdire) 683 { 684 uint32_t index; 685 dma_addr_t addr, entry_size; 686 687 index = VTD_PASID_DIR_INDEX(pasid); 688 entry_size = VTD_PASID_DIR_ENTRY_SIZE; 689 addr = pasid_dir_base + index * entry_size; 690 if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { 691 return -VTD_FR_PASID_TABLE_INV; 692 } 693 694 return 0; 695 } 696 697 static int vtd_get_pasid_entry(IntelIOMMUState *s, 698 uint32_t pasid, 699 VTDPASIDDirEntry *pdire, 700 VTDPASIDEntry *pe) 701 { 702 uint32_t index; 703 dma_addr_t addr, entry_size; 704 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 705 706 index = VTD_PASID_TABLE_INDEX(pasid); 707 entry_size = VTD_PASID_ENTRY_SIZE; 708 addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK; 709 addr = addr + index * entry_size; 710 if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { 711 return -VTD_FR_PASID_TABLE_INV; 712 } 713 714 /* Do translation type check */ 715 if (!vtd_pe_type_check(x86_iommu, pe)) { 716 return -VTD_FR_PASID_TABLE_INV; 717 } 718 719 if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { 720 return -VTD_FR_PASID_TABLE_INV; 721 } 722 723 return 0; 724 } 725 726 static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s, 727 dma_addr_t pasid_dir_base, 728 uint32_t pasid, 729 VTDPASIDEntry *pe) 730 { 731 int ret; 732 VTDPASIDDirEntry pdire; 733 734 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 735 if (ret) { 736 return ret; 737 } 738 739 ret = vtd_get_pasid_entry(s, pasid, &pdire, pe); 740 if (ret) { 741 return ret; 742 } 743 744 return ret; 745 } 746 747 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, 748 VTDContextEntry *ce, 749 VTDPASIDEntry *pe) 750 { 751 uint32_t pasid; 752 dma_addr_t pasid_dir_base; 753 int ret = 0; 754 755 pasid = VTD_CE_GET_RID2PASID(ce); 756 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 757 ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe); 758 759 return ret; 760 } 761 762 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, 763 VTDContextEntry *ce, 764 bool *pe_fpd_set) 765 { 766 int ret; 767 uint32_t pasid; 768 dma_addr_t pasid_dir_base; 769 VTDPASIDDirEntry pdire; 770 VTDPASIDEntry pe; 771 772 pasid = VTD_CE_GET_RID2PASID(ce); 773 pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); 774 775 ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire); 776 if (ret) { 777 return ret; 778 } 779 780 if (pdire.val & VTD_PASID_DIR_FPD) { 781 *pe_fpd_set = true; 782 return 0; 783 } 784 785 ret = vtd_get_pasid_entry(s, pasid, &pdire, 
&pe); 786 if (ret) { 787 return ret; 788 } 789 790 if (pe.val[0] & VTD_PASID_ENTRY_FPD) { 791 *pe_fpd_set = true; 792 } 793 794 return 0; 795 } 796 797 /* Get the page-table level that hardware should use for the second-level 798 * page-table walk from the Address Width field of context-entry. 799 */ 800 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) 801 { 802 return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW); 803 } 804 805 static uint32_t vtd_get_iova_level(IntelIOMMUState *s, 806 VTDContextEntry *ce) 807 { 808 VTDPASIDEntry pe; 809 810 if (s->root_scalable) { 811 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 812 return VTD_PE_GET_LEVEL(&pe); 813 } 814 815 return vtd_ce_get_level(ce); 816 } 817 818 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) 819 { 820 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; 821 } 822 823 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, 824 VTDContextEntry *ce) 825 { 826 VTDPASIDEntry pe; 827 828 if (s->root_scalable) { 829 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 830 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; 831 } 832 833 return vtd_ce_get_agaw(ce); 834 } 835 836 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce) 837 { 838 return ce->lo & VTD_CONTEXT_ENTRY_TT; 839 } 840 841 /* Only for Legacy Mode. Return true if check passed, otherwise false */ 842 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, 843 VTDContextEntry *ce) 844 { 845 switch (vtd_ce_get_type(ce)) { 846 case VTD_CONTEXT_TT_MULTI_LEVEL: 847 /* Always supported */ 848 break; 849 case VTD_CONTEXT_TT_DEV_IOTLB: 850 if (!x86_iommu->dt_supported) { 851 error_report_once("%s: DT specified but not supported", __func__); 852 return false; 853 } 854 break; 855 case VTD_CONTEXT_TT_PASS_THROUGH: 856 if (!x86_iommu->pt_supported) { 857 error_report_once("%s: PT specified but not supported", __func__); 858 return false; 859 } 860 break; 861 default: 862 /* Unknown type */ 863 error_report_once("%s: unknown ce type: %"PRIu32, __func__, 864 vtd_ce_get_type(ce)); 865 return false; 866 } 867 return true; 868 } 869 870 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, 871 VTDContextEntry *ce, uint8_t aw) 872 { 873 uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); 874 return 1ULL << MIN(ce_agaw, aw); 875 } 876 877 /* Return true if IOVA passes range check, otherwise false. */ 878 static inline bool vtd_iova_range_check(IntelIOMMUState *s, 879 uint64_t iova, VTDContextEntry *ce, 880 uint8_t aw) 881 { 882 /* 883 * Check if @iova is above 2^X-1, where X is the minimum of MGAW 884 * in CAP_REG and AW in context-entry. 
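 *
 * For example, with aw == 48 bits reported in CAP_REG but a context
 * entry programmed with a 39-bit address width, vtd_iova_limit() is
 * 1ULL << 39, so any IOVA with bit 39 or above set fails the check.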
885 */ 886 return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); 887 } 888 889 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, 890 VTDContextEntry *ce) 891 { 892 VTDPASIDEntry pe; 893 894 if (s->root_scalable) { 895 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 896 return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; 897 } 898 899 return vtd_ce_get_slpt_base(ce); 900 } 901 902 /* 903 * Rsvd field masks for spte: 904 * Index [1] to [4] 4k pages 905 * Index [5] to [8] large pages 906 */ 907 static uint64_t vtd_paging_entry_rsvd_field[9]; 908 909 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) 910 { 911 if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) { 912 /* Maybe large page */ 913 return slpte & vtd_paging_entry_rsvd_field[level + 4]; 914 } else { 915 return slpte & vtd_paging_entry_rsvd_field[level]; 916 } 917 } 918 919 /* Find the VTD address space associated with a given bus number */ 920 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) 921 { 922 VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; 923 if (!vtd_bus) { 924 /* 925 * Iterate over the registered buses to find the one which 926 * currently hold this bus number, and update the bus_num 927 * lookup table: 928 */ 929 GHashTableIter iter; 930 931 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 932 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 933 if (pci_bus_num(vtd_bus->bus) == bus_num) { 934 s->vtd_as_by_bus_num[bus_num] = vtd_bus; 935 return vtd_bus; 936 } 937 } 938 } 939 return vtd_bus; 940 } 941 942 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level 943 * of the translation, can be used for deciding the size of large page. 944 */ 945 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, 946 uint64_t iova, bool is_write, 947 uint64_t *slptep, uint32_t *slpte_level, 948 bool *reads, bool *writes, uint8_t aw_bits) 949 { 950 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 951 uint32_t level = vtd_get_iova_level(s, ce); 952 uint32_t offset; 953 uint64_t slpte; 954 uint64_t access_right_check; 955 956 if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { 957 error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", 958 __func__, iova); 959 return -VTD_FR_ADDR_BEYOND_MGAW; 960 } 961 962 /* FIXME: what is the Atomics request here? */ 963 access_right_check = is_write ? VTD_SL_W : VTD_SL_R; 964 965 while (true) { 966 offset = vtd_iova_level_offset(iova, level); 967 slpte = vtd_get_slpte(addr, offset); 968 969 if (slpte == (uint64_t)-1) { 970 error_report_once("%s: detected read error on DMAR slpte " 971 "(iova=0x%" PRIx64 ")", __func__, iova); 972 if (level == vtd_get_iova_level(s, ce)) { 973 /* Invalid programming of context-entry */ 974 return -VTD_FR_CONTEXT_ENTRY_INV; 975 } else { 976 return -VTD_FR_PAGING_ENTRY_INV; 977 } 978 } 979 *reads = (*reads) && (slpte & VTD_SL_R); 980 *writes = (*writes) && (slpte & VTD_SL_W); 981 if (!(slpte & access_right_check)) { 982 error_report_once("%s: detected slpte permission error " 983 "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " 984 "slpte=0x%" PRIx64 ", write=%d)", __func__, 985 iova, level, slpte, is_write); 986 return is_write ? 
-VTD_FR_WRITE : -VTD_FR_READ; 987 } 988 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 989 error_report_once("%s: detected splte reserve non-zero " 990 "iova=0x%" PRIx64 ", level=0x%" PRIx32 991 "slpte=0x%" PRIx64 ")", __func__, iova, 992 level, slpte); 993 return -VTD_FR_PAGING_ENTRY_RSVD; 994 } 995 996 if (vtd_is_last_slpte(slpte, level)) { 997 *slptep = slpte; 998 *slpte_level = level; 999 return 0; 1000 } 1001 addr = vtd_get_slpte_addr(slpte, aw_bits); 1002 level--; 1003 } 1004 } 1005 1006 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); 1007 1008 /** 1009 * Constant information used during page walking 1010 * 1011 * @hook_fn: hook func to be called when detected page 1012 * @private: private data to be passed into hook func 1013 * @notify_unmap: whether we should notify invalid entries 1014 * @as: VT-d address space of the device 1015 * @aw: maximum address width 1016 * @domain: domain ID of the page walk 1017 */ 1018 typedef struct { 1019 VTDAddressSpace *as; 1020 vtd_page_walk_hook hook_fn; 1021 void *private; 1022 bool notify_unmap; 1023 uint8_t aw; 1024 uint16_t domain_id; 1025 } vtd_page_walk_info; 1026 1027 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) 1028 { 1029 VTDAddressSpace *as = info->as; 1030 vtd_page_walk_hook hook_fn = info->hook_fn; 1031 void *private = info->private; 1032 DMAMap target = { 1033 .iova = entry->iova, 1034 .size = entry->addr_mask, 1035 .translated_addr = entry->translated_addr, 1036 .perm = entry->perm, 1037 }; 1038 DMAMap *mapped = iova_tree_find(as->iova_tree, &target); 1039 1040 if (entry->perm == IOMMU_NONE && !info->notify_unmap) { 1041 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1042 return 0; 1043 } 1044 1045 assert(hook_fn); 1046 1047 /* Update local IOVA mapped ranges */ 1048 if (entry->perm) { 1049 if (mapped) { 1050 /* If it's exactly the same translation, skip */ 1051 if (!memcmp(mapped, &target, sizeof(target))) { 1052 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, 1053 entry->translated_addr); 1054 return 0; 1055 } else { 1056 /* 1057 * Translation changed. Normally this should not 1058 * happen, but it can happen when with buggy guest 1059 * OSes. Note that there will be a small window that 1060 * we don't have map at all. But that's the best 1061 * effort we can do. The ideal way to emulate this is 1062 * atomically modify the PTE to follow what has 1063 * changed, but we can't. One example is that vfio 1064 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no 1065 * interface to modify a mapping (meanwhile it seems 1066 * meaningless to even provide one). Anyway, let's 1067 * mark this as a TODO in case one day we'll have 1068 * a better solution. 
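 *
 * Concretely, the block below emulates the update as an UNMAP of
 * the old range (notify with perm == IOMMU_NONE and drop it from
 * the iova tree), then falls through to re-insert and notify the
 * new MAP with the original permission.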
1069 */ 1070 IOMMUAccessFlags cache_perm = entry->perm; 1071 int ret; 1072 1073 /* Emulate an UNMAP */ 1074 entry->perm = IOMMU_NONE; 1075 trace_vtd_page_walk_one(info->domain_id, 1076 entry->iova, 1077 entry->translated_addr, 1078 entry->addr_mask, 1079 entry->perm); 1080 ret = hook_fn(entry, private); 1081 if (ret) { 1082 return ret; 1083 } 1084 /* Drop any existing mapping */ 1085 iova_tree_remove(as->iova_tree, &target); 1086 /* Recover the correct permission */ 1087 entry->perm = cache_perm; 1088 } 1089 } 1090 iova_tree_insert(as->iova_tree, &target); 1091 } else { 1092 if (!mapped) { 1093 /* Skip since we didn't map this range at all */ 1094 trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); 1095 return 0; 1096 } 1097 iova_tree_remove(as->iova_tree, &target); 1098 } 1099 1100 trace_vtd_page_walk_one(info->domain_id, entry->iova, 1101 entry->translated_addr, entry->addr_mask, 1102 entry->perm); 1103 return hook_fn(entry, private); 1104 } 1105 1106 /** 1107 * vtd_page_walk_level - walk over specific level for IOVA range 1108 * 1109 * @addr: base GPA addr to start the walk 1110 * @start: IOVA range start address 1111 * @end: IOVA range end address (start <= addr < end) 1112 * @read: whether parent level has read permission 1113 * @write: whether parent level has write permission 1114 * @info: constant information for the page walk 1115 */ 1116 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, 1117 uint64_t end, uint32_t level, bool read, 1118 bool write, vtd_page_walk_info *info) 1119 { 1120 bool read_cur, write_cur, entry_valid; 1121 uint32_t offset; 1122 uint64_t slpte; 1123 uint64_t subpage_size, subpage_mask; 1124 IOMMUTLBEntry entry; 1125 uint64_t iova = start; 1126 uint64_t iova_next; 1127 int ret = 0; 1128 1129 trace_vtd_page_walk_level(addr, level, start, end); 1130 1131 subpage_size = 1ULL << vtd_slpt_level_shift(level); 1132 subpage_mask = vtd_slpt_level_page_mask(level); 1133 1134 while (iova < end) { 1135 iova_next = (iova & subpage_mask) + subpage_size; 1136 1137 offset = vtd_iova_level_offset(iova, level); 1138 slpte = vtd_get_slpte(addr, offset); 1139 1140 if (slpte == (uint64_t)-1) { 1141 trace_vtd_page_walk_skip_read(iova, iova_next); 1142 goto next; 1143 } 1144 1145 if (vtd_slpte_nonzero_rsvd(slpte, level)) { 1146 trace_vtd_page_walk_skip_reserve(iova, iova_next); 1147 goto next; 1148 } 1149 1150 /* Permissions are stacked with parents' */ 1151 read_cur = read && (slpte & VTD_SL_R); 1152 write_cur = write && (slpte & VTD_SL_W); 1153 1154 /* 1155 * As long as we have either read/write permission, this is a 1156 * valid entry. The rule works for both page entries and page 1157 * table entries. 1158 */ 1159 entry_valid = read_cur | write_cur; 1160 1161 if (!vtd_is_last_slpte(slpte, level) && entry_valid) { 1162 /* 1163 * This is a valid PDE (or even bigger than PDE). We need 1164 * to walk one further level. 1165 */ 1166 ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), 1167 iova, MIN(iova_next, end), level - 1, 1168 read_cur, write_cur, info); 1169 } else { 1170 /* 1171 * This means we are either: 1172 * 1173 * (1) the real page entry (either 4K page, or huge page) 1174 * (2) the whole range is invalid 1175 * 1176 * In either case, we send an IOTLB notification down. 
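 *
 * The notification always covers the whole subpage at this level:
 * for a 2MiB page at level 2, for instance, subpage_mask clears the
 * low 21 bits and entry.addr_mask below ends up as 0x1fffff.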
1177 */ 1178 entry.target_as = &address_space_memory; 1179 entry.iova = iova & subpage_mask; 1180 entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); 1181 entry.addr_mask = ~subpage_mask; 1182 /* NOTE: this is only meaningful if entry_valid == true */ 1183 entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); 1184 ret = vtd_page_walk_one(&entry, info); 1185 } 1186 1187 if (ret < 0) { 1188 return ret; 1189 } 1190 1191 next: 1192 iova = iova_next; 1193 } 1194 1195 return 0; 1196 } 1197 1198 /** 1199 * vtd_page_walk - walk specific IOVA range, and call the hook 1200 * 1201 * @s: intel iommu state 1202 * @ce: context entry to walk upon 1203 * @start: IOVA address to start the walk 1204 * @end: IOVA range end address (start <= addr < end) 1205 * @info: page walking information struct 1206 */ 1207 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, 1208 uint64_t start, uint64_t end, 1209 vtd_page_walk_info *info) 1210 { 1211 dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); 1212 uint32_t level = vtd_get_iova_level(s, ce); 1213 1214 if (!vtd_iova_range_check(s, start, ce, info->aw)) { 1215 return -VTD_FR_ADDR_BEYOND_MGAW; 1216 } 1217 1218 if (!vtd_iova_range_check(s, end, ce, info->aw)) { 1219 /* Fix end so that it reaches the maximum */ 1220 end = vtd_iova_limit(s, ce, info->aw); 1221 } 1222 1223 return vtd_page_walk_level(addr, start, end, level, true, true, info); 1224 } 1225 1226 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s, 1227 VTDRootEntry *re) 1228 { 1229 /* Legacy Mode reserved bits check */ 1230 if (!s->root_scalable && 1231 (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1232 goto rsvd_err; 1233 1234 /* Scalable Mode reserved bits check */ 1235 if (s->root_scalable && 1236 ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) || 1237 (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits)))) 1238 goto rsvd_err; 1239 1240 return 0; 1241 1242 rsvd_err: 1243 error_report_once("%s: invalid root entry: hi=0x%"PRIx64 1244 ", lo=0x%"PRIx64, 1245 __func__, re->hi, re->lo); 1246 return -VTD_FR_ROOT_ENTRY_RSVD; 1247 } 1248 1249 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s, 1250 VTDContextEntry *ce) 1251 { 1252 if (!s->root_scalable && 1253 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI || 1254 ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { 1255 error_report_once("%s: invalid context entry: hi=%"PRIx64 1256 ", lo=%"PRIx64" (reserved nonzero)", 1257 __func__, ce->hi, ce->lo); 1258 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1259 } 1260 1261 if (s->root_scalable && 1262 (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) || 1263 ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 || 1264 ce->val[2] || 1265 ce->val[3])) { 1266 error_report_once("%s: invalid context entry: val[3]=%"PRIx64 1267 ", val[2]=%"PRIx64 1268 ", val[1]=%"PRIx64 1269 ", val[0]=%"PRIx64" (reserved nonzero)", 1270 __func__, ce->val[3], ce->val[2], 1271 ce->val[1], ce->val[0]); 1272 return -VTD_FR_CONTEXT_ENTRY_RSVD; 1273 } 1274 1275 return 0; 1276 } 1277 1278 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, 1279 VTDContextEntry *ce) 1280 { 1281 VTDPASIDEntry pe; 1282 1283 /* 1284 * Make sure in Scalable Mode, a present context entry 1285 * has valid rid2pasid setting, which includes valid 1286 * rid2pasid field and corresponding pasid entry setting 1287 */ 1288 return vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1289 } 1290 1291 /* Map a device to its corresponding domain (context-entry) */ 1292 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, 1293 uint8_t devfn, 
VTDContextEntry *ce) 1294 { 1295 VTDRootEntry re; 1296 int ret_fr; 1297 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 1298 1299 ret_fr = vtd_get_root_entry(s, bus_num, &re); 1300 if (ret_fr) { 1301 return ret_fr; 1302 } 1303 1304 if (!vtd_root_entry_present(s, &re, devfn)) { 1305 /* Not error - it's okay we don't have root entry. */ 1306 trace_vtd_re_not_present(bus_num); 1307 return -VTD_FR_ROOT_ENTRY_P; 1308 } 1309 1310 ret_fr = vtd_root_entry_rsvd_bits_check(s, &re); 1311 if (ret_fr) { 1312 return ret_fr; 1313 } 1314 1315 ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce); 1316 if (ret_fr) { 1317 return ret_fr; 1318 } 1319 1320 if (!vtd_ce_present(ce)) { 1321 /* Not error - it's okay we don't have context entry. */ 1322 trace_vtd_ce_not_present(bus_num, devfn); 1323 return -VTD_FR_CONTEXT_ENTRY_P; 1324 } 1325 1326 ret_fr = vtd_context_entry_rsvd_bits_check(s, ce); 1327 if (ret_fr) { 1328 return ret_fr; 1329 } 1330 1331 /* Check if the programming of context-entry is valid */ 1332 if (!s->root_scalable && 1333 !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { 1334 error_report_once("%s: invalid context entry: hi=%"PRIx64 1335 ", lo=%"PRIx64" (level %d not supported)", 1336 __func__, ce->hi, ce->lo, 1337 vtd_ce_get_level(ce)); 1338 return -VTD_FR_CONTEXT_ENTRY_INV; 1339 } 1340 1341 if (!s->root_scalable) { 1342 /* Do translation type check */ 1343 if (!vtd_ce_type_check(x86_iommu, ce)) { 1344 /* Errors dumped in vtd_ce_type_check() */ 1345 return -VTD_FR_CONTEXT_ENTRY_INV; 1346 } 1347 } else { 1348 /* 1349 * Check if the programming of context-entry.rid2pasid 1350 * and corresponding pasid setting is valid, and thus 1351 * avoids to check pasid entry fetching result in future 1352 * helper function calling. 1353 */ 1354 ret_fr = vtd_ce_rid2pasid_check(s, ce); 1355 if (ret_fr) { 1356 return ret_fr; 1357 } 1358 } 1359 1360 return 0; 1361 } 1362 1363 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, 1364 void *private) 1365 { 1366 memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); 1367 return 0; 1368 } 1369 1370 static uint16_t vtd_get_domain_id(IntelIOMMUState *s, 1371 VTDContextEntry *ce) 1372 { 1373 VTDPASIDEntry pe; 1374 1375 if (s->root_scalable) { 1376 vtd_ce_get_rid2pasid_entry(s, ce, &pe); 1377 return VTD_SM_PASID_ENTRY_DID(pe.val[1]); 1378 } 1379 1380 return VTD_CONTEXT_ENTRY_DID(ce->hi); 1381 } 1382 1383 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, 1384 VTDContextEntry *ce, 1385 hwaddr addr, hwaddr size) 1386 { 1387 IntelIOMMUState *s = vtd_as->iommu_state; 1388 vtd_page_walk_info info = { 1389 .hook_fn = vtd_sync_shadow_page_hook, 1390 .private = (void *)&vtd_as->iommu, 1391 .notify_unmap = true, 1392 .aw = s->aw_bits, 1393 .as = vtd_as, 1394 .domain_id = vtd_get_domain_id(s, ce), 1395 }; 1396 1397 return vtd_page_walk(s, ce, addr, addr + size, &info); 1398 } 1399 1400 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) 1401 { 1402 int ret; 1403 VTDContextEntry ce; 1404 IOMMUNotifier *n; 1405 1406 ret = vtd_dev_to_context_entry(vtd_as->iommu_state, 1407 pci_bus_num(vtd_as->bus), 1408 vtd_as->devfn, &ce); 1409 if (ret) { 1410 if (ret == -VTD_FR_CONTEXT_ENTRY_P) { 1411 /* 1412 * It's a valid scenario to have a context entry that is 1413 * not present. For example, when a device is removed 1414 * from an existing domain then the context entry will be 1415 * zeroed by the guest before it was put into another 1416 * domain. 
             * When this happens, instead of synchronizing
             * the shadow pages we should invalidate all existing
             * mappings and notify the backends.
             */
            IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
                vtd_address_space_unmap(vtd_as, n);
            }
            ret = 0;
        }
        return ret;
    }

    return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
}

/*
 * Check if the specific device is configured to bypass address
 * translation for DMA requests. In Scalable Mode, whether it bypasses
 * first-level or second-level translation depends on the PGTT setting.
 */
static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
{
    IntelIOMMUState *s;
    VTDContextEntry ce;
    VTDPASIDEntry pe;
    int ret;

    assert(as);

    s = as->iommu_state;
    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
                                   as->devfn, &ce);
    if (ret) {
        /*
         * Possibly failed to parse the context entry for some reason
         * (e.g., during init, or any guest configuration errors on
         * context entries). We should assume PT not enabled for
         * safety.
         */
        return false;
    }

    if (s->root_scalable) {
        ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
        if (ret) {
            error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
                              __func__, ret);
            return false;
        }
        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
    }

    return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
}

/* Return whether the device is using IOMMU translation. */
static bool vtd_switch_address_space(VTDAddressSpace *as)
{
    bool use_iommu;
    /* Whether we need to take the BQL on our own */
    bool take_bql = !qemu_mutex_iothread_locked();

    assert(as);

    use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);

    trace_vtd_switch_address_space(pci_bus_num(as->bus),
                                   VTD_PCI_SLOT(as->devfn),
                                   VTD_PCI_FUNC(as->devfn),
                                   use_iommu);

    /*
     * It's possible that we reach here without BQL, e.g., when called
     * from vtd_pt_enable_fast_path(). However the memory APIs need
     * it. Make sure we already hold it, or take it here.
1492 */ 1493 if (take_bql) { 1494 qemu_mutex_lock_iothread(); 1495 } 1496 1497 /* Turn off first then on the other */ 1498 if (use_iommu) { 1499 memory_region_set_enabled(&as->nodmar, false); 1500 memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); 1501 } else { 1502 memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); 1503 memory_region_set_enabled(&as->nodmar, true); 1504 } 1505 1506 if (take_bql) { 1507 qemu_mutex_unlock_iothread(); 1508 } 1509 1510 return use_iommu; 1511 } 1512 1513 static void vtd_switch_address_space_all(IntelIOMMUState *s) 1514 { 1515 GHashTableIter iter; 1516 VTDBus *vtd_bus; 1517 int i; 1518 1519 g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); 1520 while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { 1521 for (i = 0; i < PCI_DEVFN_MAX; i++) { 1522 if (!vtd_bus->dev_as[i]) { 1523 continue; 1524 } 1525 vtd_switch_address_space(vtd_bus->dev_as[i]); 1526 } 1527 } 1528 } 1529 1530 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) 1531 { 1532 return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); 1533 } 1534 1535 static const bool vtd_qualified_faults[] = { 1536 [VTD_FR_RESERVED] = false, 1537 [VTD_FR_ROOT_ENTRY_P] = false, 1538 [VTD_FR_CONTEXT_ENTRY_P] = true, 1539 [VTD_FR_CONTEXT_ENTRY_INV] = true, 1540 [VTD_FR_ADDR_BEYOND_MGAW] = true, 1541 [VTD_FR_WRITE] = true, 1542 [VTD_FR_READ] = true, 1543 [VTD_FR_PAGING_ENTRY_INV] = true, 1544 [VTD_FR_ROOT_TABLE_INV] = false, 1545 [VTD_FR_CONTEXT_TABLE_INV] = false, 1546 [VTD_FR_ROOT_ENTRY_RSVD] = false, 1547 [VTD_FR_PAGING_ENTRY_RSVD] = true, 1548 [VTD_FR_CONTEXT_ENTRY_TT] = true, 1549 [VTD_FR_PASID_TABLE_INV] = false, 1550 [VTD_FR_RESERVED_ERR] = false, 1551 [VTD_FR_MAX] = false, 1552 }; 1553 1554 /* To see if a fault condition is "qualified", which is reported to software 1555 * only if the FPD field in the context-entry used to process the faulting 1556 * request is 0. 1557 */ 1558 static inline bool vtd_is_qualified_fault(VTDFaultReason fault) 1559 { 1560 return vtd_qualified_faults[fault]; 1561 } 1562 1563 static inline bool vtd_is_interrupt_addr(hwaddr addr) 1564 { 1565 return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; 1566 } 1567 1568 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) 1569 { 1570 VTDBus *vtd_bus; 1571 VTDAddressSpace *vtd_as; 1572 bool success = false; 1573 1574 vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); 1575 if (!vtd_bus) { 1576 goto out; 1577 } 1578 1579 vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; 1580 if (!vtd_as) { 1581 goto out; 1582 } 1583 1584 if (vtd_switch_address_space(vtd_as) == false) { 1585 /* We switched off IOMMU region successfully. */ 1586 success = true; 1587 } 1588 1589 out: 1590 trace_vtd_pt_enable_fast_path(source_id, success); 1591 } 1592 1593 /* Map dev to context-entry then do a paging-structures walk to do a iommu 1594 * translation. 1595 * 1596 * Called from RCU critical section. 1597 * 1598 * @bus_num: The bus number 1599 * @devfn: The devfn, which is the combined of device and function number 1600 * @is_write: The access is a write operation 1601 * @entry: IOMMUTLBEntry that contain the addr to be translated and result 1602 * 1603 * Returns true if translation is successful, otherwise false. 
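 *
 * A minimal usage sketch (assuming the caller is in an RCU critical
 * section, as the IOMMU memory region translate callback is):
 *
 *     IOMMUTLBEntry ret = { .perm = IOMMU_NONE };
 *
 *     if (vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
 *                                addr, flag & IOMMU_WO, &ret)) {
 *         // use ret.translated_addr, ret.addr_mask and ret.perm
 *     }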
1604 */ 1605 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, 1606 uint8_t devfn, hwaddr addr, bool is_write, 1607 IOMMUTLBEntry *entry) 1608 { 1609 IntelIOMMUState *s = vtd_as->iommu_state; 1610 VTDContextEntry ce; 1611 uint8_t bus_num = pci_bus_num(bus); 1612 VTDContextCacheEntry *cc_entry; 1613 uint64_t slpte, page_mask; 1614 uint32_t level; 1615 uint16_t source_id = vtd_make_source_id(bus_num, devfn); 1616 int ret_fr; 1617 bool is_fpd_set = false; 1618 bool reads = true; 1619 bool writes = true; 1620 uint8_t access_flags; 1621 VTDIOTLBEntry *iotlb_entry; 1622 1623 /* 1624 * We have standalone memory region for interrupt addresses, we 1625 * should never receive translation requests in this region. 1626 */ 1627 assert(!vtd_is_interrupt_addr(addr)); 1628 1629 vtd_iommu_lock(s); 1630 1631 cc_entry = &vtd_as->context_cache_entry; 1632 1633 /* Try to fetch slpte form IOTLB */ 1634 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); 1635 if (iotlb_entry) { 1636 trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, 1637 iotlb_entry->domain_id); 1638 slpte = iotlb_entry->slpte; 1639 access_flags = iotlb_entry->access_flags; 1640 page_mask = iotlb_entry->mask; 1641 goto out; 1642 } 1643 1644 /* Try to fetch context-entry from cache first */ 1645 if (cc_entry->context_cache_gen == s->context_cache_gen) { 1646 trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, 1647 cc_entry->context_entry.lo, 1648 cc_entry->context_cache_gen); 1649 ce = cc_entry->context_entry; 1650 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1651 if (!is_fpd_set && s->root_scalable) { 1652 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1653 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1654 } 1655 } else { 1656 ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); 1657 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; 1658 if (!ret_fr && !is_fpd_set && s->root_scalable) { 1659 ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); 1660 } 1661 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1662 /* Update context-cache */ 1663 trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, 1664 cc_entry->context_cache_gen, 1665 s->context_cache_gen); 1666 cc_entry->context_entry = ce; 1667 cc_entry->context_cache_gen = s->context_cache_gen; 1668 } 1669 1670 /* 1671 * We don't need to translate for pass-through context entries. 1672 * Also, let's ignore IOTLB caching as well for PT devices. 1673 */ 1674 if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { 1675 entry->iova = addr & VTD_PAGE_MASK_4K; 1676 entry->translated_addr = entry->iova; 1677 entry->addr_mask = ~VTD_PAGE_MASK_4K; 1678 entry->perm = IOMMU_RW; 1679 trace_vtd_translate_pt(source_id, entry->iova); 1680 1681 /* 1682 * When this happens, it means firstly caching-mode is not 1683 * enabled, and this is the first passthrough translation for 1684 * the device. Let's enable the fast path for passthrough. 1685 * 1686 * When passthrough is disabled again for the device, we can 1687 * capture it via the context entry invalidation, then the 1688 * IOMMU region can be swapped back. 
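 *
 * "Swapping" here is just the memory region toggle performed by
 * vtd_switch_address_space(): the nodmar alias is enabled and the
 * IOMMU region disabled for passthrough, or the reverse when DMAR
 * translation is re-established.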
1689 */ 1690 vtd_pt_enable_fast_path(s, source_id); 1691 vtd_iommu_unlock(s); 1692 return true; 1693 } 1694 1695 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, 1696 &reads, &writes, s->aw_bits); 1697 VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); 1698 1699 page_mask = vtd_slpt_level_page_mask(level); 1700 access_flags = IOMMU_ACCESS_FLAG(reads, writes); 1701 vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, 1702 access_flags, level); 1703 out: 1704 vtd_iommu_unlock(s); 1705 entry->iova = addr & page_mask; 1706 entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; 1707 entry->addr_mask = ~page_mask; 1708 entry->perm = access_flags; 1709 return true; 1710 1711 error: 1712 vtd_iommu_unlock(s); 1713 entry->iova = 0; 1714 entry->translated_addr = 0; 1715 entry->addr_mask = 0; 1716 entry->perm = IOMMU_NONE; 1717 return false; 1718 } 1719 1720 static void vtd_root_table_setup(IntelIOMMUState *s) 1721 { 1722 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); 1723 s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); 1724 1725 vtd_update_scalable_state(s); 1726 1727 trace_vtd_reg_dmar_root(s->root, s->root_scalable); 1728 } 1729 1730 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, 1731 uint32_t index, uint32_t mask) 1732 { 1733 x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); 1734 } 1735 1736 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) 1737 { 1738 uint64_t value = 0; 1739 value = vtd_get_quad_raw(s, DMAR_IRTA_REG); 1740 s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); 1741 s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); 1742 s->intr_eime = value & VTD_IRTA_EIME; 1743 1744 /* Notify global invalidation */ 1745 vtd_iec_notify_all(s, true, 0, 0); 1746 1747 trace_vtd_reg_ir_root(s->intr_root, s->intr_size); 1748 } 1749 1750 static void vtd_iommu_replay_all(IntelIOMMUState *s) 1751 { 1752 VTDAddressSpace *vtd_as; 1753 1754 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1755 vtd_sync_shadow_page_table(vtd_as); 1756 } 1757 } 1758 1759 static void vtd_context_global_invalidate(IntelIOMMUState *s) 1760 { 1761 trace_vtd_inv_desc_cc_global(); 1762 /* Protects context cache */ 1763 vtd_iommu_lock(s); 1764 s->context_cache_gen++; 1765 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { 1766 vtd_reset_context_cache_locked(s); 1767 } 1768 vtd_iommu_unlock(s); 1769 vtd_address_space_refresh_all(s); 1770 /* 1771 * From VT-d spec 6.5.2.1, a global context entry invalidation 1772 * should be followed by a IOTLB global invalidation, so we should 1773 * be safe even without this. Hoewever, let's replay the region as 1774 * well to be safer, and go back here when we need finer tunes for 1775 * VT-d emulation codes. 1776 */ 1777 vtd_iommu_replay_all(s); 1778 } 1779 1780 /* Do a context-cache device-selective invalidation. 
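 * A worked example: with source_id 0x0008 (bus 0, slot 1, function 0)
 * and FM == 3, bits 2:0 of the SID are masked, so the context cache
 * entries of all eight functions devfn 0x08..0x0f on bus 0 are
 * invalidated.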
 * @func_mask: FM field after shifting
 */
static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                          uint16_t source_id,
                                          uint16_t func_mask)
{
    uint16_t mask;
    VTDBus *vtd_bus;
    VTDAddressSpace *vtd_as;
    uint8_t bus_n, devfn;
    uint16_t devfn_it;

    trace_vtd_inv_desc_cc_devices(source_id, func_mask);

    switch (func_mask & 3) {
    case 0:
        mask = 0;   /* No bits in the SID field masked */
        break;
    case 1:
        mask = 4;   /* Mask bit 2 in the SID field */
        break;
    case 2:
        mask = 6;   /* Mask bits 2:1 in the SID field */
        break;
    case 3:
        mask = 7;   /* Mask bits 2:0 in the SID field */
        break;
    }
    mask = ~mask;

    bus_n = VTD_SID_TO_BUS(source_id);
    vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
    if (vtd_bus) {
        devfn = VTD_SID_TO_DEVFN(source_id);
        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
                                             VTD_PCI_FUNC(devfn_it));
                vtd_iommu_lock(s);
                vtd_as->context_cache_entry.context_cache_gen = 0;
                vtd_iommu_unlock(s);
                /*
                 * Switch the address space when needed, in case the
                 * device passthrough bit has been toggled.
                 */
                vtd_switch_address_space(vtd_as);
                /*
                 * The device is moving out of (or into) a domain, so
                 * resync the shadow page table.
                 * This does no harm even if no such notifier is
                 * registered - the IOMMU notification framework will
                 * simply skip the MAP notifications.
                 */
                vtd_sync_shadow_page_table(vtd_as);
            }
        }
    }
}

/* Context-cache invalidation
 * Returns the Context Actual Invalidation Granularity.
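 * For a global invalidation request the function returns
 * VTD_CCMD_GLOBAL_INVL_A, which the CCMD_REG write handler reports
 * back to the guest in the CAIG field; note that domain-selective
 * requests are deliberately over-invalidated as global ones below,
 * which is allowed as long as the coarser actual granularity is
 * reported.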
1844 * @val: the content of the CCMD_REG 1845 */ 1846 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) 1847 { 1848 uint64_t caig; 1849 uint64_t type = val & VTD_CCMD_CIRG_MASK; 1850 1851 switch (type) { 1852 case VTD_CCMD_DOMAIN_INVL: 1853 /* Fall through */ 1854 case VTD_CCMD_GLOBAL_INVL: 1855 caig = VTD_CCMD_GLOBAL_INVL_A; 1856 vtd_context_global_invalidate(s); 1857 break; 1858 1859 case VTD_CCMD_DEVICE_INVL: 1860 caig = VTD_CCMD_DEVICE_INVL_A; 1861 vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val)); 1862 break; 1863 1864 default: 1865 error_report_once("%s: invalid context: 0x%" PRIx64, 1866 __func__, val); 1867 caig = 0; 1868 } 1869 return caig; 1870 } 1871 1872 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) 1873 { 1874 trace_vtd_inv_desc_iotlb_global(); 1875 vtd_reset_iotlb(s); 1876 vtd_iommu_replay_all(s); 1877 } 1878 1879 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) 1880 { 1881 VTDContextEntry ce; 1882 VTDAddressSpace *vtd_as; 1883 1884 trace_vtd_inv_desc_iotlb_domain(domain_id); 1885 1886 vtd_iommu_lock(s); 1887 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, 1888 &domain_id); 1889 vtd_iommu_unlock(s); 1890 1891 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { 1892 if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1893 vtd_as->devfn, &ce) && 1894 domain_id == vtd_get_domain_id(s, &ce)) { 1895 vtd_sync_shadow_page_table(vtd_as); 1896 } 1897 } 1898 } 1899 1900 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, 1901 uint16_t domain_id, hwaddr addr, 1902 uint8_t am) 1903 { 1904 VTDAddressSpace *vtd_as; 1905 VTDContextEntry ce; 1906 int ret; 1907 hwaddr size = (1 << am) * VTD_PAGE_SIZE; 1908 1909 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { 1910 ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), 1911 vtd_as->devfn, &ce); 1912 if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { 1913 if (vtd_as_has_map_notifier(vtd_as)) { 1914 /* 1915 * As long as we have MAP notifications registered in 1916 * any of our IOMMU notifiers, we need to sync the 1917 * shadow page table. 1918 */ 1919 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); 1920 } else { 1921 /* 1922 * For UNMAP-only notifiers, we don't need to walk the 1923 * page tables. We just deliver the PSI down to 1924 * invalidate caches. 1925 */ 1926 IOMMUTLBEntry entry = { 1927 .target_as = &address_space_memory, 1928 .iova = addr, 1929 .translated_addr = 0, 1930 .addr_mask = size - 1, 1931 .perm = IOMMU_NONE, 1932 }; 1933 memory_region_notify_iommu(&vtd_as->iommu, 0, entry); 1934 } 1935 } 1936 } 1937 } 1938 1939 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, 1940 hwaddr addr, uint8_t am) 1941 { 1942 VTDIOTLBPageInvInfo info; 1943 1944 trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); 1945 1946 assert(am <= VTD_MAMV); 1947 info.domain_id = domain_id; 1948 info.addr = addr; 1949 info.mask = ~((1 << am) - 1); 1950 vtd_iommu_lock(s); 1951 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); 1952 vtd_iommu_unlock(s); 1953 vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); 1954 } 1955 1956 /* Flush IOTLB 1957 * Returns the IOTLB Actual Invalidation Granularity. 
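 * For a page-selective (PSI) request the target address and mask come
 * from DMAR_IVA_REG: an AM value of 9, for instance, invalidates a
 * 2^9 * 4KiB == 2MiB range, provided AM does not exceed VTD_MAMV.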
1958 * @val: the content of the IOTLB_REG 1959 */ 1960 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) 1961 { 1962 uint64_t iaig; 1963 uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK; 1964 uint16_t domain_id; 1965 hwaddr addr; 1966 uint8_t am; 1967 1968 switch (type) { 1969 case VTD_TLB_GLOBAL_FLUSH: 1970 iaig = VTD_TLB_GLOBAL_FLUSH_A; 1971 vtd_iotlb_global_invalidate(s); 1972 break; 1973 1974 case VTD_TLB_DSI_FLUSH: 1975 domain_id = VTD_TLB_DID(val); 1976 iaig = VTD_TLB_DSI_FLUSH_A; 1977 vtd_iotlb_domain_invalidate(s, domain_id); 1978 break; 1979 1980 case VTD_TLB_PSI_FLUSH: 1981 domain_id = VTD_TLB_DID(val); 1982 addr = vtd_get_quad_raw(s, DMAR_IVA_REG); 1983 am = VTD_IVA_AM(addr); 1984 addr = VTD_IVA_ADDR(addr); 1985 if (am > VTD_MAMV) { 1986 error_report_once("%s: address mask overflow: 0x%" PRIx64, 1987 __func__, vtd_get_quad_raw(s, DMAR_IVA_REG)); 1988 iaig = 0; 1989 break; 1990 } 1991 iaig = VTD_TLB_PSI_FLUSH_A; 1992 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 1993 break; 1994 1995 default: 1996 error_report_once("%s: invalid granularity: 0x%" PRIx64, 1997 __func__, val); 1998 iaig = 0; 1999 } 2000 return iaig; 2001 } 2002 2003 static void vtd_fetch_inv_desc(IntelIOMMUState *s); 2004 2005 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s) 2006 { 2007 return s->qi_enabled && (s->iq_tail == s->iq_head) && 2008 (s->iq_last_desc_type == VTD_INV_DESC_WAIT); 2009 } 2010 2011 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) 2012 { 2013 uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); 2014 2015 trace_vtd_inv_qi_enable(en); 2016 2017 if (en) { 2018 s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); 2019 /* 2^(x+8) entries */ 2020 s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0)); 2021 s->qi_enabled = true; 2022 trace_vtd_inv_qi_setup(s->iq, s->iq_size); 2023 /* Ok - report back to driver */ 2024 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); 2025 2026 if (s->iq_tail != 0) { 2027 /* 2028 * This is a spec violation but Windows guests are known to set up 2029 * Queued Invalidation this way so we allow the write and process 2030 * Invalidation Descriptors right away. 
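 * (In other words the guest programmed IQT before setting QIE.  The
 * descriptors between the freshly reset head and that pre-programmed
 * tail are fetched right below, provided no invalidation queue error
 * is pending in FSTS.)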
2031 */ 2032 trace_vtd_warn_invalid_qi_tail(s->iq_tail); 2033 if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2034 vtd_fetch_inv_desc(s); 2035 } 2036 } 2037 } else { 2038 if (vtd_queued_inv_disable_check(s)) { 2039 /* disable Queued Invalidation */ 2040 vtd_set_quad_raw(s, DMAR_IQH_REG, 0); 2041 s->iq_head = 0; 2042 s->qi_enabled = false; 2043 /* Ok - report back to driver */ 2044 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); 2045 } else { 2046 error_report_once("%s: detected improper state when disable QI " 2047 "(head=0x%x, tail=0x%x, last_type=%d)", 2048 __func__, 2049 s->iq_head, s->iq_tail, s->iq_last_desc_type); 2050 } 2051 } 2052 } 2053 2054 /* Set Root Table Pointer */ 2055 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) 2056 { 2057 vtd_root_table_setup(s); 2058 /* Ok - report back to driver */ 2059 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); 2060 vtd_reset_caches(s); 2061 vtd_address_space_refresh_all(s); 2062 } 2063 2064 /* Set Interrupt Remap Table Pointer */ 2065 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) 2066 { 2067 vtd_interrupt_remap_table_setup(s); 2068 /* Ok - report back to driver */ 2069 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); 2070 } 2071 2072 /* Handle Translation Enable/Disable */ 2073 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) 2074 { 2075 if (s->dmar_enabled == en) { 2076 return; 2077 } 2078 2079 trace_vtd_dmar_enable(en); 2080 2081 if (en) { 2082 s->dmar_enabled = true; 2083 /* Ok - report back to driver */ 2084 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES); 2085 } else { 2086 s->dmar_enabled = false; 2087 2088 /* Clear the index of Fault Recording Register */ 2089 s->next_frcd_reg = 0; 2090 /* Ok - report back to driver */ 2091 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); 2092 } 2093 2094 vtd_reset_caches(s); 2095 vtd_address_space_refresh_all(s); 2096 } 2097 2098 /* Handle Interrupt Remap Enable/Disable */ 2099 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) 2100 { 2101 trace_vtd_ir_enable(en); 2102 2103 if (en) { 2104 s->intr_enabled = true; 2105 /* Ok - report back to driver */ 2106 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES); 2107 } else { 2108 s->intr_enabled = false; 2109 /* Ok - report back to driver */ 2110 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0); 2111 } 2112 } 2113 2114 /* Handle write to Global Command Register */ 2115 static void vtd_handle_gcmd_write(IntelIOMMUState *s) 2116 { 2117 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); 2118 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); 2119 uint32_t changed = status ^ val; 2120 2121 trace_vtd_reg_write_gcmd(status, val); 2122 if (changed & VTD_GCMD_TE) { 2123 /* Translation enable/disable */ 2124 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); 2125 } 2126 if (val & VTD_GCMD_SRTP) { 2127 /* Set/update the root-table pointer */ 2128 vtd_handle_gcmd_srtp(s); 2129 } 2130 if (changed & VTD_GCMD_QIE) { 2131 /* Queued Invalidation Enable */ 2132 vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE); 2133 } 2134 if (val & VTD_GCMD_SIRTP) { 2135 /* Set/update the interrupt remapping root-table pointer */ 2136 vtd_handle_gcmd_sirtp(s); 2137 } 2138 if (changed & VTD_GCMD_IRE) { 2139 /* Interrupt remap enable/disable */ 2140 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); 2141 } 2142 } 2143 2144 /* Handle write to Context Command Register */ 2145 static void vtd_handle_ccmd_write(IntelIOMMUState *s) 2146 { 2147 uint64_t ret; 2148 uint64_t val = 
vtd_get_quad_raw(s, DMAR_CCMD_REG); 2149 2150 /* Context-cache invalidation request */ 2151 if (val & VTD_CCMD_ICC) { 2152 if (s->qi_enabled) { 2153 error_report_once("Queued Invalidation enabled, " 2154 "should not use register-based invalidation"); 2155 return; 2156 } 2157 ret = vtd_context_cache_invalidate(s, val); 2158 /* Invalidation completed. Change something to show */ 2159 vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); 2160 ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, 2161 ret); 2162 } 2163 } 2164 2165 /* Handle write to IOTLB Invalidation Register */ 2166 static void vtd_handle_iotlb_write(IntelIOMMUState *s) 2167 { 2168 uint64_t ret; 2169 uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG); 2170 2171 /* IOTLB invalidation request */ 2172 if (val & VTD_TLB_IVT) { 2173 if (s->qi_enabled) { 2174 error_report_once("Queued Invalidation enabled, " 2175 "should not use register-based invalidation"); 2176 return; 2177 } 2178 ret = vtd_iotlb_flush(s, val); 2179 /* Invalidation completed. Change something to show */ 2180 vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); 2181 ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, 2182 VTD_TLB_FLUSH_GRANU_MASK_A, ret); 2183 } 2184 } 2185 2186 /* Fetch an Invalidation Descriptor from the Invalidation Queue */ 2187 static bool vtd_get_inv_desc(IntelIOMMUState *s, 2188 VTDInvDesc *inv_desc) 2189 { 2190 dma_addr_t base_addr = s->iq; 2191 uint32_t offset = s->iq_head; 2192 uint32_t dw = s->iq_dw ? 32 : 16; 2193 dma_addr_t addr = base_addr + offset * dw; 2194 2195 if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { 2196 error_report_once("Read INV DESC failed."); 2197 return false; 2198 } 2199 inv_desc->lo = le64_to_cpu(inv_desc->lo); 2200 inv_desc->hi = le64_to_cpu(inv_desc->hi); 2201 if (dw == 32) { 2202 inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]); 2203 inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]); 2204 } 2205 return true; 2206 } 2207 2208 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2209 { 2210 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || 2211 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { 2212 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2213 " (reserved nonzero)", __func__, inv_desc->hi, 2214 inv_desc->lo); 2215 return false; 2216 } 2217 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { 2218 /* Status Write */ 2219 uint32_t status_data = (uint32_t)(inv_desc->lo >> 2220 VTD_INV_DESC_WAIT_DATA_SHIFT); 2221 2222 assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); 2223 2224 /* FIXME: need to be masked with HAW? 
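 * (HAW = Host Address Width.  As written, the status address below is
 * taken verbatim from the descriptor's high qword; a sketch of the
 * masking this FIXME hints at would be
 *   status_addr = inv_desc->hi & VTD_HAW_MASK(s->aw_bits)
 * where VTD_HAW_MASK() is named here purely for illustration.)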
*/ 2225 dma_addr_t status_addr = inv_desc->hi; 2226 trace_vtd_inv_desc_wait_sw(status_addr, status_data); 2227 status_data = cpu_to_le32(status_data); 2228 if (dma_memory_write(&address_space_memory, status_addr, &status_data, 2229 sizeof(status_data))) { 2230 trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); 2231 return false; 2232 } 2233 } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { 2234 /* Interrupt flag */ 2235 vtd_generate_completion_event(s); 2236 } else { 2237 error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 2238 " (unknown type)", __func__, inv_desc->hi, 2239 inv_desc->lo); 2240 return false; 2241 } 2242 return true; 2243 } 2244 2245 static bool vtd_process_context_cache_desc(IntelIOMMUState *s, 2246 VTDInvDesc *inv_desc) 2247 { 2248 uint16_t sid, fmask; 2249 2250 if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { 2251 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2252 " (reserved nonzero)", __func__, inv_desc->hi, 2253 inv_desc->lo); 2254 return false; 2255 } 2256 switch (inv_desc->lo & VTD_INV_DESC_CC_G) { 2257 case VTD_INV_DESC_CC_DOMAIN: 2258 trace_vtd_inv_desc_cc_domain( 2259 (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo)); 2260 /* Fall through */ 2261 case VTD_INV_DESC_CC_GLOBAL: 2262 vtd_context_global_invalidate(s); 2263 break; 2264 2265 case VTD_INV_DESC_CC_DEVICE: 2266 sid = VTD_INV_DESC_CC_SID(inv_desc->lo); 2267 fmask = VTD_INV_DESC_CC_FM(inv_desc->lo); 2268 vtd_context_device_invalidate(s, sid, fmask); 2269 break; 2270 2271 default: 2272 error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 2273 " (invalid type)", __func__, inv_desc->hi, 2274 inv_desc->lo); 2275 return false; 2276 } 2277 return true; 2278 } 2279 2280 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) 2281 { 2282 uint16_t domain_id; 2283 uint8_t am; 2284 hwaddr addr; 2285 2286 if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || 2287 (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { 2288 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2289 ", lo=0x%"PRIx64" (reserved bits unzero)\n", 2290 __func__, inv_desc->hi, inv_desc->lo); 2291 return false; 2292 } 2293 2294 switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { 2295 case VTD_INV_DESC_IOTLB_GLOBAL: 2296 vtd_iotlb_global_invalidate(s); 2297 break; 2298 2299 case VTD_INV_DESC_IOTLB_DOMAIN: 2300 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2301 vtd_iotlb_domain_invalidate(s, domain_id); 2302 break; 2303 2304 case VTD_INV_DESC_IOTLB_PAGE: 2305 domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); 2306 addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); 2307 am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); 2308 if (am > VTD_MAMV) { 2309 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2310 ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n", 2311 __func__, inv_desc->hi, inv_desc->lo, 2312 am, (unsigned)VTD_MAMV); 2313 return false; 2314 } 2315 vtd_iotlb_page_invalidate(s, domain_id, addr, am); 2316 break; 2317 2318 default: 2319 error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 2320 ", lo=0x%"PRIx64" (type mismatch: 0x%llx)\n", 2321 __func__, inv_desc->hi, inv_desc->lo, 2322 inv_desc->lo & VTD_INV_DESC_IOTLB_G); 2323 return false; 2324 } 2325 return true; 2326 } 2327 2328 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, 2329 VTDInvDesc *inv_desc) 2330 { 2331 trace_vtd_inv_desc_iec(inv_desc->iec.granularity, 2332 inv_desc->iec.index, 2333 inv_desc->iec.index_mask); 2334 2335 vtd_iec_notify_all(s, !inv_desc->iec.granularity, 2336 
inv_desc->iec.index, 2337 inv_desc->iec.index_mask); 2338 return true; 2339 } 2340 2341 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, 2342 VTDInvDesc *inv_desc) 2343 { 2344 VTDAddressSpace *vtd_dev_as; 2345 IOMMUTLBEntry entry; 2346 struct VTDBus *vtd_bus; 2347 hwaddr addr; 2348 uint64_t sz; 2349 uint16_t sid; 2350 uint8_t devfn; 2351 bool size; 2352 uint8_t bus_num; 2353 2354 addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); 2355 sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); 2356 devfn = sid & 0xff; 2357 bus_num = sid >> 8; 2358 size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); 2359 2360 if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || 2361 (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { 2362 error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 2363 ", lo=%"PRIx64" (reserved nonzero)", __func__, 2364 inv_desc->hi, inv_desc->lo); 2365 return false; 2366 } 2367 2368 vtd_bus = vtd_find_as_from_bus_num(s, bus_num); 2369 if (!vtd_bus) { 2370 goto done; 2371 } 2372 2373 vtd_dev_as = vtd_bus->dev_as[devfn]; 2374 if (!vtd_dev_as) { 2375 goto done; 2376 } 2377 2378 /* According to ATS spec table 2.4: 2379 * S = 0, bits 15:12 = xxxx range size: 4K 2380 * S = 1, bits 15:12 = xxx0 range size: 8K 2381 * S = 1, bits 15:12 = xx01 range size: 16K 2382 * S = 1, bits 15:12 = x011 range size: 32K 2383 * S = 1, bits 15:12 = 0111 range size: 64K 2384 * ... 2385 */ 2386 if (size) { 2387 sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); 2388 addr &= ~(sz - 1); 2389 } else { 2390 sz = VTD_PAGE_SIZE; 2391 } 2392 2393 entry.target_as = &vtd_dev_as->as; 2394 entry.addr_mask = sz - 1; 2395 entry.iova = addr; 2396 entry.perm = IOMMU_NONE; 2397 entry.translated_addr = 0; 2398 memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); 2399 2400 done: 2401 return true; 2402 } 2403 2404 static bool vtd_process_inv_desc(IntelIOMMUState *s) 2405 { 2406 VTDInvDesc inv_desc; 2407 uint8_t desc_type; 2408 2409 trace_vtd_inv_qi_head(s->iq_head); 2410 if (!vtd_get_inv_desc(s, &inv_desc)) { 2411 s->iq_last_desc_type = VTD_INV_DESC_NONE; 2412 return false; 2413 } 2414 2415 desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; 2416 /* FIXME: should update at first or at last? */ 2417 s->iq_last_desc_type = desc_type; 2418 2419 switch (desc_type) { 2420 case VTD_INV_DESC_CC: 2421 trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo); 2422 if (!vtd_process_context_cache_desc(s, &inv_desc)) { 2423 return false; 2424 } 2425 break; 2426 2427 case VTD_INV_DESC_IOTLB: 2428 trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo); 2429 if (!vtd_process_iotlb_desc(s, &inv_desc)) { 2430 return false; 2431 } 2432 break; 2433 2434 /* 2435 * TODO: the entity of below two cases will be implemented in future series. 2436 * To make guest (which integrates scalable mode support patch set in 2437 * iommu driver) work, just return true is enough so far. 
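 * (VTD_INV_DESC_PC is the PASID-cache invalidation descriptor and
 * VTD_INV_DESC_PIOTLB the PASID-based IOTLB invalidation descriptor;
 * both are currently accepted as no-ops.)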
2438 */ 2439 case VTD_INV_DESC_PC: 2440 break; 2441 2442 case VTD_INV_DESC_PIOTLB: 2443 break; 2444 2445 case VTD_INV_DESC_WAIT: 2446 trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo); 2447 if (!vtd_process_wait_desc(s, &inv_desc)) { 2448 return false; 2449 } 2450 break; 2451 2452 case VTD_INV_DESC_IEC: 2453 trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo); 2454 if (!vtd_process_inv_iec_desc(s, &inv_desc)) { 2455 return false; 2456 } 2457 break; 2458 2459 case VTD_INV_DESC_DEVICE: 2460 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); 2461 if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { 2462 return false; 2463 } 2464 break; 2465 2466 default: 2467 error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 2468 " (unknown type)", __func__, inv_desc.hi, 2469 inv_desc.lo); 2470 return false; 2471 } 2472 s->iq_head++; 2473 if (s->iq_head == s->iq_size) { 2474 s->iq_head = 0; 2475 } 2476 return true; 2477 } 2478 2479 /* Try to fetch and process more Invalidation Descriptors */ 2480 static void vtd_fetch_inv_desc(IntelIOMMUState *s) 2481 { 2482 trace_vtd_inv_qi_fetch(); 2483 2484 if (s->iq_tail >= s->iq_size) { 2485 /* Detects an invalid Tail pointer */ 2486 error_report_once("%s: detected invalid QI tail " 2487 "(tail=0x%x, size=0x%x)", 2488 __func__, s->iq_tail, s->iq_size); 2489 vtd_handle_inv_queue_error(s); 2490 return; 2491 } 2492 while (s->iq_head != s->iq_tail) { 2493 if (!vtd_process_inv_desc(s)) { 2494 /* Invalidation Queue Errors */ 2495 vtd_handle_inv_queue_error(s); 2496 break; 2497 } 2498 /* Must update the IQH_REG in time */ 2499 vtd_set_quad_raw(s, DMAR_IQH_REG, 2500 (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) & 2501 VTD_IQH_QH_MASK); 2502 } 2503 } 2504 2505 /* Handle write to Invalidation Queue Tail Register */ 2506 static void vtd_handle_iqt_write(IntelIOMMUState *s) 2507 { 2508 uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); 2509 2510 if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { 2511 error_report_once("%s: RSV bit is set: val=0x%"PRIx64, 2512 __func__, val); 2513 return; 2514 } 2515 s->iq_tail = VTD_IQT_QT(s->iq_dw, val); 2516 trace_vtd_inv_qi_tail(s->iq_tail); 2517 2518 if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { 2519 /* Process Invalidation Queue here */ 2520 vtd_fetch_inv_desc(s); 2521 } 2522 } 2523 2524 static void vtd_handle_fsts_write(IntelIOMMUState *s) 2525 { 2526 uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); 2527 uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2528 uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE; 2529 2530 if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { 2531 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2532 trace_vtd_fsts_clear_ip(); 2533 } 2534 /* FIXME: when IQE is Clear, should we try to fetch some Invalidation 2535 * Descriptors if there are any when Queued Invalidation is enabled? 2536 */ 2537 } 2538 2539 static void vtd_handle_fectl_write(IntelIOMMUState *s) 2540 { 2541 uint32_t fectl_reg; 2542 /* FIXME: when software clears the IM field, check the IP field. But do we 2543 * need to compare the old value and the new value to conclude that 2544 * software clears the IM field? Or just check if the IM field is zero? 
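 * The code below takes the simple view: whenever IM is clear and IP is
 * still set after the write, the pending fault event is injected and
 * IP is cleared, without comparing the old and new register values.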
2545 */ 2546 fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); 2547 2548 trace_vtd_reg_write_fectl(fectl_reg); 2549 2550 if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { 2551 vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); 2552 vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); 2553 } 2554 } 2555 2556 static void vtd_handle_ics_write(IntelIOMMUState *s) 2557 { 2558 uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG); 2559 uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2560 2561 if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { 2562 trace_vtd_reg_ics_clear_ip(); 2563 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2564 } 2565 } 2566 2567 static void vtd_handle_iectl_write(IntelIOMMUState *s) 2568 { 2569 uint32_t iectl_reg; 2570 /* FIXME: when software clears the IM field, check the IP field. But do we 2571 * need to compare the old value and the new value to conclude that 2572 * software clears the IM field? Or just check if the IM field is zero? 2573 */ 2574 iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); 2575 2576 trace_vtd_reg_write_iectl(iectl_reg); 2577 2578 if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { 2579 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); 2580 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); 2581 } 2582 } 2583 2584 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) 2585 { 2586 IntelIOMMUState *s = opaque; 2587 uint64_t val; 2588 2589 trace_vtd_reg_read(addr, size); 2590 2591 if (addr + size > DMAR_REG_SIZE) { 2592 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2593 " size=0x%u", __func__, addr, size); 2594 return (uint64_t)-1; 2595 } 2596 2597 switch (addr) { 2598 /* Root Table Address Register, 64-bit */ 2599 case DMAR_RTADDR_REG: 2600 if (size == 4) { 2601 val = s->root & ((1ULL << 32) - 1); 2602 } else { 2603 val = s->root; 2604 } 2605 break; 2606 2607 case DMAR_RTADDR_REG_HI: 2608 assert(size == 4); 2609 val = s->root >> 32; 2610 break; 2611 2612 /* Invalidation Queue Address Register, 64-bit */ 2613 case DMAR_IQA_REG: 2614 val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); 2615 if (size == 4) { 2616 val = val & ((1ULL << 32) - 1); 2617 } 2618 break; 2619 2620 case DMAR_IQA_REG_HI: 2621 assert(size == 4); 2622 val = s->iq >> 32; 2623 break; 2624 2625 default: 2626 if (size == 4) { 2627 val = vtd_get_long(s, addr); 2628 } else { 2629 val = vtd_get_quad(s, addr); 2630 } 2631 } 2632 2633 return val; 2634 } 2635 2636 static void vtd_mem_write(void *opaque, hwaddr addr, 2637 uint64_t val, unsigned size) 2638 { 2639 IntelIOMMUState *s = opaque; 2640 2641 trace_vtd_reg_write(addr, size, val); 2642 2643 if (addr + size > DMAR_REG_SIZE) { 2644 error_report_once("%s: MMIO over range: addr=0x%" PRIx64 2645 " size=0x%u", __func__, addr, size); 2646 return; 2647 } 2648 2649 switch (addr) { 2650 /* Global Command Register, 32-bit */ 2651 case DMAR_GCMD_REG: 2652 vtd_set_long(s, addr, val); 2653 vtd_handle_gcmd_write(s); 2654 break; 2655 2656 /* Context Command Register, 64-bit */ 2657 case DMAR_CCMD_REG: 2658 if (size == 4) { 2659 vtd_set_long(s, addr, val); 2660 } else { 2661 vtd_set_quad(s, addr, val); 2662 vtd_handle_ccmd_write(s); 2663 } 2664 break; 2665 2666 case DMAR_CCMD_REG_HI: 2667 assert(size == 4); 2668 vtd_set_long(s, addr, val); 2669 vtd_handle_ccmd_write(s); 2670 break; 2671 2672 /* IOTLB Invalidation Register, 64-bit */ 2673 case DMAR_IOTLB_REG: 2674 if (size == 4) { 2675 vtd_set_long(s, addr, val); 2676 } 
else { 2677 vtd_set_quad(s, addr, val); 2678 vtd_handle_iotlb_write(s); 2679 } 2680 break; 2681 2682 case DMAR_IOTLB_REG_HI: 2683 assert(size == 4); 2684 vtd_set_long(s, addr, val); 2685 vtd_handle_iotlb_write(s); 2686 break; 2687 2688 /* Invalidate Address Register, 64-bit */ 2689 case DMAR_IVA_REG: 2690 if (size == 4) { 2691 vtd_set_long(s, addr, val); 2692 } else { 2693 vtd_set_quad(s, addr, val); 2694 } 2695 break; 2696 2697 case DMAR_IVA_REG_HI: 2698 assert(size == 4); 2699 vtd_set_long(s, addr, val); 2700 break; 2701 2702 /* Fault Status Register, 32-bit */ 2703 case DMAR_FSTS_REG: 2704 assert(size == 4); 2705 vtd_set_long(s, addr, val); 2706 vtd_handle_fsts_write(s); 2707 break; 2708 2709 /* Fault Event Control Register, 32-bit */ 2710 case DMAR_FECTL_REG: 2711 assert(size == 4); 2712 vtd_set_long(s, addr, val); 2713 vtd_handle_fectl_write(s); 2714 break; 2715 2716 /* Fault Event Data Register, 32-bit */ 2717 case DMAR_FEDATA_REG: 2718 assert(size == 4); 2719 vtd_set_long(s, addr, val); 2720 break; 2721 2722 /* Fault Event Address Register, 32-bit */ 2723 case DMAR_FEADDR_REG: 2724 if (size == 4) { 2725 vtd_set_long(s, addr, val); 2726 } else { 2727 /* 2728 * While the register is 32-bit only, some guests (Xen...) write to 2729 * it with 64-bit. 2730 */ 2731 vtd_set_quad(s, addr, val); 2732 } 2733 break; 2734 2735 /* Fault Event Upper Address Register, 32-bit */ 2736 case DMAR_FEUADDR_REG: 2737 assert(size == 4); 2738 vtd_set_long(s, addr, val); 2739 break; 2740 2741 /* Protected Memory Enable Register, 32-bit */ 2742 case DMAR_PMEN_REG: 2743 assert(size == 4); 2744 vtd_set_long(s, addr, val); 2745 break; 2746 2747 /* Root Table Address Register, 64-bit */ 2748 case DMAR_RTADDR_REG: 2749 if (size == 4) { 2750 vtd_set_long(s, addr, val); 2751 } else { 2752 vtd_set_quad(s, addr, val); 2753 } 2754 break; 2755 2756 case DMAR_RTADDR_REG_HI: 2757 assert(size == 4); 2758 vtd_set_long(s, addr, val); 2759 break; 2760 2761 /* Invalidation Queue Tail Register, 64-bit */ 2762 case DMAR_IQT_REG: 2763 if (size == 4) { 2764 vtd_set_long(s, addr, val); 2765 } else { 2766 vtd_set_quad(s, addr, val); 2767 } 2768 vtd_handle_iqt_write(s); 2769 break; 2770 2771 case DMAR_IQT_REG_HI: 2772 assert(size == 4); 2773 vtd_set_long(s, addr, val); 2774 /* 19:63 of IQT_REG is RsvdZ, do nothing here */ 2775 break; 2776 2777 /* Invalidation Queue Address Register, 64-bit */ 2778 case DMAR_IQA_REG: 2779 if (size == 4) { 2780 vtd_set_long(s, addr, val); 2781 } else { 2782 vtd_set_quad(s, addr, val); 2783 } 2784 if (s->ecap & VTD_ECAP_SMTS && 2785 val & VTD_IQA_DW_MASK) { 2786 s->iq_dw = true; 2787 } else { 2788 s->iq_dw = false; 2789 } 2790 break; 2791 2792 case DMAR_IQA_REG_HI: 2793 assert(size == 4); 2794 vtd_set_long(s, addr, val); 2795 break; 2796 2797 /* Invalidation Completion Status Register, 32-bit */ 2798 case DMAR_ICS_REG: 2799 assert(size == 4); 2800 vtd_set_long(s, addr, val); 2801 vtd_handle_ics_write(s); 2802 break; 2803 2804 /* Invalidation Event Control Register, 32-bit */ 2805 case DMAR_IECTL_REG: 2806 assert(size == 4); 2807 vtd_set_long(s, addr, val); 2808 vtd_handle_iectl_write(s); 2809 break; 2810 2811 /* Invalidation Event Data Register, 32-bit */ 2812 case DMAR_IEDATA_REG: 2813 assert(size == 4); 2814 vtd_set_long(s, addr, val); 2815 break; 2816 2817 /* Invalidation Event Address Register, 32-bit */ 2818 case DMAR_IEADDR_REG: 2819 assert(size == 4); 2820 vtd_set_long(s, addr, val); 2821 break; 2822 2823 /* Invalidation Event Upper Address Register, 32-bit */ 2824 case DMAR_IEUADDR_REG: 2825 
assert(size == 4); 2826 vtd_set_long(s, addr, val); 2827 break; 2828 2829 /* Fault Recording Registers, 128-bit */ 2830 case DMAR_FRCD_REG_0_0: 2831 if (size == 4) { 2832 vtd_set_long(s, addr, val); 2833 } else { 2834 vtd_set_quad(s, addr, val); 2835 } 2836 break; 2837 2838 case DMAR_FRCD_REG_0_1: 2839 assert(size == 4); 2840 vtd_set_long(s, addr, val); 2841 break; 2842 2843 case DMAR_FRCD_REG_0_2: 2844 if (size == 4) { 2845 vtd_set_long(s, addr, val); 2846 } else { 2847 vtd_set_quad(s, addr, val); 2848 /* May clear bit 127 (Fault), update PPF */ 2849 vtd_update_fsts_ppf(s); 2850 } 2851 break; 2852 2853 case DMAR_FRCD_REG_0_3: 2854 assert(size == 4); 2855 vtd_set_long(s, addr, val); 2856 /* May clear bit 127 (Fault), update PPF */ 2857 vtd_update_fsts_ppf(s); 2858 break; 2859 2860 case DMAR_IRTA_REG: 2861 if (size == 4) { 2862 vtd_set_long(s, addr, val); 2863 } else { 2864 vtd_set_quad(s, addr, val); 2865 } 2866 break; 2867 2868 case DMAR_IRTA_REG_HI: 2869 assert(size == 4); 2870 vtd_set_long(s, addr, val); 2871 break; 2872 2873 default: 2874 if (size == 4) { 2875 vtd_set_long(s, addr, val); 2876 } else { 2877 vtd_set_quad(s, addr, val); 2878 } 2879 } 2880 } 2881 2882 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, 2883 IOMMUAccessFlags flag, int iommu_idx) 2884 { 2885 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2886 IntelIOMMUState *s = vtd_as->iommu_state; 2887 IOMMUTLBEntry iotlb = { 2888 /* We'll fill in the rest later. */ 2889 .target_as = &address_space_memory, 2890 }; 2891 bool success; 2892 2893 if (likely(s->dmar_enabled)) { 2894 success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, 2895 addr, flag & IOMMU_WO, &iotlb); 2896 } else { 2897 /* DMAR disabled, passthrough, use 4k-page*/ 2898 iotlb.iova = addr & VTD_PAGE_MASK_4K; 2899 iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; 2900 iotlb.addr_mask = ~VTD_PAGE_MASK_4K; 2901 iotlb.perm = IOMMU_RW; 2902 success = true; 2903 } 2904 2905 if (likely(success)) { 2906 trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), 2907 VTD_PCI_SLOT(vtd_as->devfn), 2908 VTD_PCI_FUNC(vtd_as->devfn), 2909 iotlb.iova, iotlb.translated_addr, 2910 iotlb.addr_mask); 2911 } else { 2912 error_report_once("%s: detected translation failure " 2913 "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")", 2914 __func__, pci_bus_num(vtd_as->bus), 2915 VTD_PCI_SLOT(vtd_as->devfn), 2916 VTD_PCI_FUNC(vtd_as->devfn), 2917 addr); 2918 } 2919 2920 return iotlb; 2921 } 2922 2923 static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, 2924 IOMMUNotifierFlag old, 2925 IOMMUNotifierFlag new) 2926 { 2927 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); 2928 IntelIOMMUState *s = vtd_as->iommu_state; 2929 2930 if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { 2931 error_report("We need to set caching-mode=on for intel-iommu to enable " 2932 "device assignment with IOMMU protection."); 2933 exit(1); 2934 } 2935 2936 /* Update per-address-space notifier flags */ 2937 vtd_as->notifier_flags = new; 2938 2939 if (old == IOMMU_NOTIFIER_NONE) { 2940 QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); 2941 } else if (new == IOMMU_NOTIFIER_NONE) { 2942 QLIST_REMOVE(vtd_as, next); 2943 } 2944 } 2945 2946 static int vtd_post_load(void *opaque, int version_id) 2947 { 2948 IntelIOMMUState *iommu = opaque; 2949 2950 /* 2951 * Memory regions are dynamically turned on/off depending on 2952 * context entry configurations from the guest. 
After migration, 2953 * we need to make sure the memory regions are still correct. 2954 */ 2955 vtd_switch_address_space_all(iommu); 2956 2957 /* 2958 * We don't need to migrate the root_scalable because we can 2959 * simply do the calculation after the loading is complete. We 2960 * can actually do similar things with root, dmar_enabled, etc. 2961 * however since we've had them already so we'd better keep them 2962 * for compatibility of migration. 2963 */ 2964 vtd_update_scalable_state(iommu); 2965 2966 return 0; 2967 } 2968 2969 static const VMStateDescription vtd_vmstate = { 2970 .name = "iommu-intel", 2971 .version_id = 1, 2972 .minimum_version_id = 1, 2973 .priority = MIG_PRI_IOMMU, 2974 .post_load = vtd_post_load, 2975 .fields = (VMStateField[]) { 2976 VMSTATE_UINT64(root, IntelIOMMUState), 2977 VMSTATE_UINT64(intr_root, IntelIOMMUState), 2978 VMSTATE_UINT64(iq, IntelIOMMUState), 2979 VMSTATE_UINT32(intr_size, IntelIOMMUState), 2980 VMSTATE_UINT16(iq_head, IntelIOMMUState), 2981 VMSTATE_UINT16(iq_tail, IntelIOMMUState), 2982 VMSTATE_UINT16(iq_size, IntelIOMMUState), 2983 VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState), 2984 VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE), 2985 VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState), 2986 VMSTATE_UNUSED(1), /* bool root_extended is obsolete by VT-d */ 2987 VMSTATE_BOOL(dmar_enabled, IntelIOMMUState), 2988 VMSTATE_BOOL(qi_enabled, IntelIOMMUState), 2989 VMSTATE_BOOL(intr_enabled, IntelIOMMUState), 2990 VMSTATE_BOOL(intr_eime, IntelIOMMUState), 2991 VMSTATE_END_OF_LIST() 2992 } 2993 }; 2994 2995 static const MemoryRegionOps vtd_mem_ops = { 2996 .read = vtd_mem_read, 2997 .write = vtd_mem_write, 2998 .endianness = DEVICE_LITTLE_ENDIAN, 2999 .impl = { 3000 .min_access_size = 4, 3001 .max_access_size = 8, 3002 }, 3003 .valid = { 3004 .min_access_size = 4, 3005 .max_access_size = 8, 3006 }, 3007 }; 3008 3009 static Property vtd_properties[] = { 3010 DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), 3011 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, 3012 ON_OFF_AUTO_AUTO), 3013 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), 3014 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits, 3015 VTD_HOST_ADDRESS_WIDTH), 3016 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), 3017 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), 3018 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), 3019 DEFINE_PROP_END_OF_LIST(), 3020 }; 3021 3022 /* Read IRTE entry with specific index */ 3023 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, 3024 VTD_IR_TableEntry *entry, uint16_t sid) 3025 { 3026 static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ 3027 {0xffff, 0xfffb, 0xfff9, 0xfff8}; 3028 dma_addr_t addr = 0x00; 3029 uint16_t mask, source_id; 3030 uint8_t bus, bus_max, bus_min; 3031 3032 addr = iommu->intr_root + index * sizeof(*entry); 3033 if (dma_memory_read(&address_space_memory, addr, entry, 3034 sizeof(*entry))) { 3035 error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, 3036 __func__, index, addr); 3037 return -VTD_FR_IR_ROOT_INVAL; 3038 } 3039 3040 trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), 3041 le64_to_cpu(entry->data[0])); 3042 3043 if (!entry->irte.present) { 3044 error_report_once("%s: detected non-present IRTE " 3045 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3046 __func__, index, le64_to_cpu(entry->data[1]), 3047 le64_to_cpu(entry->data[0])); 3048 return -VTD_FR_IR_ENTRY_P; 3049 } 3050 3051 if 
(entry->irte.__reserved_0 || entry->irte.__reserved_1 || 3052 entry->irte.__reserved_2) { 3053 error_report_once("%s: detected non-zero reserved IRTE " 3054 "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", 3055 __func__, index, le64_to_cpu(entry->data[1]), 3056 le64_to_cpu(entry->data[0])); 3057 return -VTD_FR_IR_IRTE_RSVD; 3058 } 3059 3060 if (sid != X86_IOMMU_SID_INVALID) { 3061 /* Validate IRTE SID */ 3062 source_id = le32_to_cpu(entry->irte.source_id); 3063 switch (entry->irte.sid_vtype) { 3064 case VTD_SVT_NONE: 3065 break; 3066 3067 case VTD_SVT_ALL: 3068 mask = vtd_svt_mask[entry->irte.sid_q]; 3069 if ((source_id & mask) != (sid & mask)) { 3070 error_report_once("%s: invalid IRTE SID " 3071 "(index=%u, sid=%u, source_id=%u)", 3072 __func__, index, sid, source_id); 3073 return -VTD_FR_IR_SID_ERR; 3074 } 3075 break; 3076 3077 case VTD_SVT_BUS: 3078 bus_max = source_id >> 8; 3079 bus_min = source_id & 0xff; 3080 bus = sid >> 8; 3081 if (bus > bus_max || bus < bus_min) { 3082 error_report_once("%s: invalid SVT_BUS " 3083 "(index=%u, bus=%u, min=%u, max=%u)", 3084 __func__, index, bus, bus_min, bus_max); 3085 return -VTD_FR_IR_SID_ERR; 3086 } 3087 break; 3088 3089 default: 3090 error_report_once("%s: detected invalid IRTE SVT " 3091 "(index=%u, type=%d)", __func__, 3092 index, entry->irte.sid_vtype); 3093 /* Take this as verification failure. */ 3094 return -VTD_FR_IR_SID_ERR; 3095 break; 3096 } 3097 } 3098 3099 return 0; 3100 } 3101 3102 /* Fetch IRQ information of specific IR index */ 3103 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, 3104 X86IOMMUIrq *irq, uint16_t sid) 3105 { 3106 VTD_IR_TableEntry irte = {}; 3107 int ret = 0; 3108 3109 ret = vtd_irte_get(iommu, index, &irte, sid); 3110 if (ret) { 3111 return ret; 3112 } 3113 3114 irq->trigger_mode = irte.irte.trigger_mode; 3115 irq->vector = irte.irte.vector; 3116 irq->delivery_mode = irte.irte.delivery_mode; 3117 irq->dest = le32_to_cpu(irte.irte.dest_id); 3118 if (!iommu->intr_eime) { 3119 #define VTD_IR_APIC_DEST_MASK (0xff00ULL) 3120 #define VTD_IR_APIC_DEST_SHIFT (8) 3121 irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> 3122 VTD_IR_APIC_DEST_SHIFT; 3123 } 3124 irq->dest_mode = irte.irte.dest_mode; 3125 irq->redir_hint = irte.irte.redir_hint; 3126 3127 trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, 3128 irq->delivery_mode, irq->dest, irq->dest_mode); 3129 3130 return 0; 3131 } 3132 3133 /* Interrupt remapping for MSI/MSI-X entry */ 3134 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, 3135 MSIMessage *origin, 3136 MSIMessage *translated, 3137 uint16_t sid) 3138 { 3139 int ret = 0; 3140 VTD_IR_MSIAddress addr; 3141 uint16_t index; 3142 X86IOMMUIrq irq = {}; 3143 3144 assert(origin && translated); 3145 3146 trace_vtd_ir_remap_msi_req(origin->address, origin->data); 3147 3148 if (!iommu || !iommu->intr_enabled) { 3149 memcpy(translated, origin, sizeof(*origin)); 3150 goto out; 3151 } 3152 3153 if (origin->address & VTD_MSI_ADDR_HI_MASK) { 3154 error_report_once("%s: MSI address high 32 bits non-zero detected: " 3155 "address=0x%" PRIx64, __func__, origin->address); 3156 return -VTD_FR_IR_REQ_RSVD; 3157 } 3158 3159 addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; 3160 if (addr.addr.__head != 0xfee) { 3161 error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, 3162 __func__, addr.data); 3163 return -VTD_FR_IR_REQ_RSVD; 3164 } 3165 3166 /* This is compatible mode. 
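 * (The Interrupt Format bit is clear, i.e. the request is not in
 * remappable format, so it is passed through to the destination
 * unchanged.)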
*/ 3167 if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { 3168 memcpy(translated, origin, sizeof(*origin)); 3169 goto out; 3170 } 3171 3172 index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); 3173 3174 #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) 3175 #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) 3176 3177 if (addr.addr.sub_valid) { 3178 /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */ 3179 index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; 3180 } 3181 3182 ret = vtd_remap_irq_get(iommu, index, &irq, sid); 3183 if (ret) { 3184 return ret; 3185 } 3186 3187 if (addr.addr.sub_valid) { 3188 trace_vtd_ir_remap_type("MSI"); 3189 if (origin->data & VTD_IR_MSI_DATA_RESERVED) { 3190 error_report_once("%s: invalid IR MSI " 3191 "(sid=%u, address=0x%" PRIx64 3192 ", data=0x%" PRIx32 ")", 3193 __func__, sid, origin->address, origin->data); 3194 return -VTD_FR_IR_REQ_RSVD; 3195 } 3196 } else { 3197 uint8_t vector = origin->data & 0xff; 3198 uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; 3199 3200 trace_vtd_ir_remap_type("IOAPIC"); 3201 /* IOAPIC entry vector should be aligned with IRTE vector 3202 * (see vt-d spec 5.1.5.1). */ 3203 if (vector != irq.vector) { 3204 trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); 3205 } 3206 3207 /* The Trigger Mode field must match the Trigger Mode in the IRTE. 3208 * (see vt-d spec 5.1.5.1). */ 3209 if (trigger_mode != irq.trigger_mode) { 3210 trace_vtd_warn_ir_trigger(sid, index, trigger_mode, 3211 irq.trigger_mode); 3212 } 3213 } 3214 3215 /* 3216 * We'd better keep the last two bits, assuming that guest OS 3217 * might modify it. Keep it does not hurt after all. 3218 */ 3219 irq.msi_addr_last_bits = addr.addr.__not_care; 3220 3221 /* Translate X86IOMMUIrq to MSI message */ 3222 x86_iommu_irq_to_msi_message(&irq, translated); 3223 3224 out: 3225 trace_vtd_ir_remap_msi(origin->address, origin->data, 3226 translated->address, translated->data); 3227 return 0; 3228 } 3229 3230 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, 3231 MSIMessage *dst, uint16_t sid) 3232 { 3233 return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), 3234 src, dst, sid); 3235 } 3236 3237 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, 3238 uint64_t *data, unsigned size, 3239 MemTxAttrs attrs) 3240 { 3241 return MEMTX_OK; 3242 } 3243 3244 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, 3245 uint64_t value, unsigned size, 3246 MemTxAttrs attrs) 3247 { 3248 int ret = 0; 3249 MSIMessage from = {}, to = {}; 3250 uint16_t sid = X86_IOMMU_SID_INVALID; 3251 3252 from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST; 3253 from.data = (uint32_t) value; 3254 3255 if (!attrs.unspecified) { 3256 /* We have explicit Source ID */ 3257 sid = attrs.requester_id; 3258 } 3259 3260 ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); 3261 if (ret) { 3262 /* TODO: report error */ 3263 /* Drop this interrupt */ 3264 return MEMTX_ERROR; 3265 } 3266 3267 apic_get_class()->send_msi(&to); 3268 3269 return MEMTX_OK; 3270 } 3271 3272 static const MemoryRegionOps vtd_mem_ir_ops = { 3273 .read_with_attrs = vtd_mem_ir_read, 3274 .write_with_attrs = vtd_mem_ir_write, 3275 .endianness = DEVICE_LITTLE_ENDIAN, 3276 .impl = { 3277 .min_access_size = 4, 3278 .max_access_size = 4, 3279 }, 3280 .valid = { 3281 .min_access_size = 4, 3282 .max_access_size = 4, 3283 }, 3284 }; 3285 3286 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) 3287 { 3288 uintptr_t key = (uintptr_t)bus; 3289 VTDBus *vtd_bus = 
g_hash_table_lookup(s->vtd_as_by_busptr, &key); 3290 VTDAddressSpace *vtd_dev_as; 3291 char name[128]; 3292 3293 if (!vtd_bus) { 3294 uintptr_t *new_key = g_malloc(sizeof(*new_key)); 3295 *new_key = (uintptr_t)bus; 3296 /* No corresponding free() */ 3297 vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ 3298 PCI_DEVFN_MAX); 3299 vtd_bus->bus = bus; 3300 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); 3301 } 3302 3303 vtd_dev_as = vtd_bus->dev_as[devfn]; 3304 3305 if (!vtd_dev_as) { 3306 snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), 3307 PCI_FUNC(devfn)); 3308 vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); 3309 3310 vtd_dev_as->bus = bus; 3311 vtd_dev_as->devfn = (uint8_t)devfn; 3312 vtd_dev_as->iommu_state = s; 3313 vtd_dev_as->context_cache_entry.context_cache_gen = 0; 3314 vtd_dev_as->iova_tree = iova_tree_new(); 3315 3316 memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX); 3317 address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root"); 3318 3319 /* 3320 * Build the DMAR-disabled container with aliases to the 3321 * shared MRs. Note that aliasing to a shared memory region 3322 * could help the memory API to detect same FlatViews so we 3323 * can have devices to share the same FlatView when DMAR is 3324 * disabled (either by not providing "intel_iommu=on" or with 3325 * "iommu=pt"). It will greatly reduce the total number of 3326 * FlatViews of the system hence VM runs faster. 3327 */ 3328 memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s), 3329 "vtd-nodmar", &s->mr_nodmar, 0, 3330 memory_region_size(&s->mr_nodmar)); 3331 3332 /* 3333 * Build the per-device DMAR-enabled container. 3334 * 3335 * TODO: currently we have per-device IOMMU memory region only 3336 * because we have per-device IOMMU notifiers for devices. If 3337 * one day we can abstract the IOMMU notifiers out of the 3338 * memory regions then we can also share the same memory 3339 * region here just like what we've done above with the nodmar 3340 * region. 3341 */ 3342 strcat(name, "-dmar"); 3343 memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu), 3344 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s), 3345 name, UINT64_MAX); 3346 memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir", 3347 &s->mr_ir, 0, memory_region_size(&s->mr_ir)); 3348 memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu), 3349 VTD_INTERRUPT_ADDR_FIRST, 3350 &vtd_dev_as->iommu_ir, 1); 3351 3352 /* 3353 * Hook both the containers under the root container, we 3354 * switch between DMAR & noDMAR by enable/disable 3355 * corresponding sub-containers 3356 */ 3357 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3358 MEMORY_REGION(&vtd_dev_as->iommu), 3359 0); 3360 memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, 3361 &vtd_dev_as->nodmar, 0); 3362 3363 vtd_switch_address_space(vtd_dev_as); 3364 } 3365 return vtd_dev_as; 3366 } 3367 3368 static uint64_t get_naturally_aligned_size(uint64_t start, 3369 uint64_t size, int gaw) 3370 { 3371 uint64_t max_mask = 1ULL << gaw; 3372 uint64_t alignment = start ? start & -start : max_mask; 3373 3374 alignment = MIN(alignment, max_mask); 3375 size = MIN(size, max_mask); 3376 3377 if (alignment <= size) { 3378 /* Increase the alignment of start */ 3379 return alignment; 3380 } else { 3381 /* Find the largest page mask from size */ 3382 return 1ULL << (63 - clz64(size)); 3383 } 3384 } 3385 3386 /* Unmap the whole range in the notifier's scope. 
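 * The range is torn down in naturally aligned power-of-two chunks via
 * get_naturally_aligned_size() above, so that every UNMAP notification
 * carries a valid addr_mask.  Worked examples (illustrative only):
 *   start = 0x3000, size = 0x10000 -> chunk of 0x1000 (alignment of start)
 *   start = 0x0,    size = 0x6000  -> chunk of 0x4000 (largest power of
 *                                     two not exceeding size)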
 */
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
{
    hwaddr size, remain;
    hwaddr start = n->start;
    hwaddr end = n->end;
    IntelIOMMUState *s = as->iommu_state;
    DMAMap map;

    /*
     * Note: all the code in this function assumes that IOVA bits are
     * no more than VTD_MGAW bits (which is restricted by the VT-d
     * spec), otherwise we would need to consider overflow of 64 bits.
     */

    if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
        /*
         * There is no need to unmap regions that are bigger than the
         * whole VT-d supported address space size
         */
        end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
    }

    assert(start <= end);
    size = remain = end - start + 1;

    while (remain >= VTD_PAGE_SIZE) {
        IOMMUTLBEntry entry;
        uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits);

        assert(mask);

        entry.iova = start;
        entry.addr_mask = mask - 1;
        entry.target_as = &address_space_memory;
        entry.perm = IOMMU_NONE;
        /* This field is meaningless for unmap */
        entry.translated_addr = 0;

        memory_region_notify_one(n, &entry);

        start += mask;
        remain -= mask;
    }

    assert(!remain);

    trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
                             VTD_PCI_SLOT(as->devfn),
                             VTD_PCI_FUNC(as->devfn),
                             n->start, size);

    map.iova = n->start;
    map.size = size;
    iova_tree_remove(as->iova_tree, &map);
}

static void vtd_address_space_unmap_all(IntelIOMMUState *s)
{
    VTDAddressSpace *vtd_as;
    IOMMUNotifier *n;

    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
        IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
            vtd_address_space_unmap(vtd_as, n);
        }
    }
}

static void vtd_address_space_refresh_all(IntelIOMMUState *s)
{
    vtd_address_space_unmap_all(s);
    vtd_switch_address_space_all(s);
}

static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
{
    memory_region_notify_one((IOMMUNotifier *)private, entry);
    return 0;
}

static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
{
    VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
    IntelIOMMUState *s = vtd_as->iommu_state;
    uint8_t bus_n = pci_bus_num(vtd_as->bus);
    VTDContextEntry ce;

    /*
     * The replay can be triggered by either an invalidation or a newly
     * created entry. No matter what, we release existing mappings
     * (which means flushing caches for UNMAP-only notifiers).
     */
    vtd_address_space_unmap(vtd_as, n);

    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
        trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
                                                     "legacy mode",
                                  bus_n, PCI_SLOT(vtd_as->devfn),
                                  PCI_FUNC(vtd_as->devfn),
                                  vtd_get_domain_id(s, &ce),
                                  ce.hi, ce.lo);
        if (vtd_as_has_map_notifier(vtd_as)) {
            /* This is required only for MAP typed notifiers */
            vtd_page_walk_info info = {
                .hook_fn = vtd_replay_hook,
                .private = (void *)n,
                .notify_unmap = false,
                .aw = s->aw_bits,
                .as = vtd_as,
                .domain_id = vtd_get_domain_id(s, &ce),
            };

            vtd_page_walk(s, &ce, 0, ~0ULL, &info);
        }
    } else {
        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                    PCI_FUNC(vtd_as->devfn));
    }

    return;
}

/* Do the initialization.
It will also be called when reset, so pay 3510 * attention when adding new initialization stuff. 3511 */ 3512 static void vtd_init(IntelIOMMUState *s) 3513 { 3514 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3515 3516 memset(s->csr, 0, DMAR_REG_SIZE); 3517 memset(s->wmask, 0, DMAR_REG_SIZE); 3518 memset(s->w1cmask, 0, DMAR_REG_SIZE); 3519 memset(s->womask, 0, DMAR_REG_SIZE); 3520 3521 s->root = 0; 3522 s->root_scalable = false; 3523 s->dmar_enabled = false; 3524 s->intr_enabled = false; 3525 s->iq_head = 0; 3526 s->iq_tail = 0; 3527 s->iq = 0; 3528 s->iq_size = 0; 3529 s->qi_enabled = false; 3530 s->iq_last_desc_type = VTD_INV_DESC_NONE; 3531 s->iq_dw = false; 3532 s->next_frcd_reg = 0; 3533 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | 3534 VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | 3535 VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); 3536 if (s->dma_drain) { 3537 s->cap |= VTD_CAP_DRAIN; 3538 } 3539 if (s->aw_bits == VTD_HOST_AW_48BIT) { 3540 s->cap |= VTD_CAP_SAGAW_48bit; 3541 } 3542 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; 3543 3544 /* 3545 * Rsvd field masks for spte 3546 */ 3547 vtd_paging_entry_rsvd_field[0] = ~0ULL; 3548 vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits); 3549 vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); 3550 vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); 3551 vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); 3552 vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits); 3553 vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); 3554 vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); 3555 vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits); 3556 3557 if (x86_iommu_ir_supported(x86_iommu)) { 3558 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; 3559 if (s->intr_eim == ON_OFF_AUTO_ON) { 3560 s->ecap |= VTD_ECAP_EIM; 3561 } 3562 assert(s->intr_eim != ON_OFF_AUTO_AUTO); 3563 } 3564 3565 if (x86_iommu->dt_supported) { 3566 s->ecap |= VTD_ECAP_DT; 3567 } 3568 3569 if (x86_iommu->pt_supported) { 3570 s->ecap |= VTD_ECAP_PT; 3571 } 3572 3573 if (s->caching_mode) { 3574 s->cap |= VTD_CAP_CM; 3575 } 3576 3577 /* TODO: read cap/ecap from host to decide which cap to be exposed. 
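 * For now a fixed set is advertised when scalable mode is requested:
 * scalable-mode translation with second-level page tables only;
 * first-level and PASID-granular features are not exposed yet (see the
 * PC/PIOTLB no-op descriptors above).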
 */
    if (s->scalable_mode) {
        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
    }

    vtd_reset_caches(s);

    /* Define registers with default values and bit semantics */
    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);

    /* Advanced Fault Logging not supported */
    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);

    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
     */
    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);

    /* Treated as RO for implementations that report the PLMR and PHMR
     * fields as Clear in the CAP_REG.
     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
     */
    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);

    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);

    /* IOTLB registers */
    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);

    /* Fault Recording Registers, 128-bit */
    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);

    /*
     * Interrupt remapping registers.
     */
    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
}

/* Should not reset address_spaces on reset, because devices will still use
 * the address space they got at first (they won't ask the bus again).
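 * vtd_address_space_refresh_all() is used instead to bring the
 * existing address spaces back in sync after vtd_init() has re-run.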
3639 */ 3640 static void vtd_reset(DeviceState *dev) 3641 { 3642 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3643 3644 vtd_init(s); 3645 vtd_address_space_refresh_all(s); 3646 } 3647 3648 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) 3649 { 3650 IntelIOMMUState *s = opaque; 3651 VTDAddressSpace *vtd_as; 3652 3653 assert(0 <= devfn && devfn < PCI_DEVFN_MAX); 3654 3655 vtd_as = vtd_find_add_as(s, bus, devfn); 3656 return &vtd_as->as; 3657 } 3658 3659 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) 3660 { 3661 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); 3662 3663 if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) { 3664 error_setg(errp, "eim=on cannot be selected without intremap=on"); 3665 return false; 3666 } 3667 3668 if (s->intr_eim == ON_OFF_AUTO_AUTO) { 3669 s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim) 3670 && x86_iommu_ir_supported(x86_iommu) ? 3671 ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 3672 } 3673 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { 3674 if (!kvm_irqchip_in_kernel()) { 3675 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); 3676 return false; 3677 } 3678 if (!kvm_enable_x2apic()) { 3679 error_setg(errp, "eim=on requires support on the KVM side" 3680 "(X2APIC_API, first shipped in v4.7)"); 3681 return false; 3682 } 3683 } 3684 3685 /* Currently only address widths supported are 39 and 48 bits */ 3686 if ((s->aw_bits != VTD_HOST_AW_39BIT) && 3687 (s->aw_bits != VTD_HOST_AW_48BIT)) { 3688 error_setg(errp, "Supported values for x-aw-bits are: %d, %d", 3689 VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); 3690 return false; 3691 } 3692 3693 if (s->scalable_mode && !s->dma_drain) { 3694 error_setg(errp, "Need to set dma_drain for scalable mode"); 3695 return false; 3696 } 3697 3698 return true; 3699 } 3700 3701 static void vtd_realize(DeviceState *dev, Error **errp) 3702 { 3703 MachineState *ms = MACHINE(qdev_get_machine()); 3704 PCMachineState *pcms = PC_MACHINE(ms); 3705 PCIBus *bus = pcms->bus; 3706 IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); 3707 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); 3708 3709 x86_iommu->type = TYPE_INTEL; 3710 3711 if (!vtd_decide_config(s, errp)) { 3712 return; 3713 } 3714 3715 QLIST_INIT(&s->vtd_as_with_notifiers); 3716 qemu_mutex_init(&s->iommu_lock); 3717 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); 3718 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, 3719 "intel_iommu", DMAR_REG_SIZE); 3720 3721 /* Create the shared memory regions by all devices */ 3722 memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", 3723 UINT64_MAX); 3724 memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops, 3725 s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE); 3726 memory_region_init_alias(&s->mr_sys_alias, OBJECT(s), 3727 "vtd-sys-alias", get_system_memory(), 0, 3728 memory_region_size(get_system_memory())); 3729 memory_region_add_subregion_overlap(&s->mr_nodmar, 0, 3730 &s->mr_sys_alias, 0); 3731 memory_region_add_subregion_overlap(&s->mr_nodmar, 3732 VTD_INTERRUPT_ADDR_FIRST, 3733 &s->mr_ir, 1); 3734 3735 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); 3736 /* No corresponding destroy */ 3737 s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3738 g_free, g_free); 3739 s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, 3740 g_free, g_free); 3741 vtd_init(s); 3742 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); 3743 pci_setup_iommu(bus, vtd_host_dma_iommu, dev); 3744 /* 
Pseudo address space under root PCI bus. */ 3745 pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); 3746 } 3747 3748 static void vtd_class_init(ObjectClass *klass, void *data) 3749 { 3750 DeviceClass *dc = DEVICE_CLASS(klass); 3751 X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass); 3752 3753 dc->reset = vtd_reset; 3754 dc->vmsd = &vtd_vmstate; 3755 dc->props = vtd_properties; 3756 dc->hotpluggable = false; 3757 x86_class->realize = vtd_realize; 3758 x86_class->int_remap = vtd_int_remap; 3759 /* Supported by the pc-q35-* machine types */ 3760 dc->user_creatable = true; 3761 set_bit(DEVICE_CATEGORY_MISC, dc->categories); 3762 dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; 3763 } 3764 3765 static const TypeInfo vtd_info = { 3766 .name = TYPE_INTEL_IOMMU_DEVICE, 3767 .parent = TYPE_X86_IOMMU_DEVICE, 3768 .instance_size = sizeof(IntelIOMMUState), 3769 .class_init = vtd_class_init, 3770 }; 3771 3772 static void vtd_iommu_memory_region_class_init(ObjectClass *klass, 3773 void *data) 3774 { 3775 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); 3776 3777 imrc->translate = vtd_iommu_translate; 3778 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed; 3779 imrc->replay = vtd_iommu_replay; 3780 } 3781 3782 static const TypeInfo vtd_iommu_memory_region_info = { 3783 .parent = TYPE_IOMMU_MEMORY_REGION, 3784 .name = TYPE_INTEL_IOMMU_MEMORY_REGION, 3785 .class_init = vtd_iommu_memory_region_class_init, 3786 }; 3787 3788 static void vtd_register_types(void) 3789 { 3790 type_register_static(&vtd_info); 3791 type_register_static(&vtd_iommu_memory_region_info); 3792 } 3793 3794 type_init(vtd_register_types) 3795
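
/*
 * Usage note (illustrative only): the device registered above is
 * normally instantiated on a q35 machine with something along the
 * lines of
 *
 *   qemu-system-x86_64 -machine q35,kernel-irqchip=split \
 *       -device intel-iommu,intremap=on,caching-mode=on ...
 *
 * kernel-irqchip=split is what vtd_decide_config() expects when eim=on
 * ends up selected, and caching-mode=on is required before MAP
 * notifiers (e.g. VFIO device assignment) can be registered, as
 * enforced in vtd_iommu_notify_flag_changed().
 */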