1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dma-iommu.h> 19 #include <linux/dmi.h> 20 #include <linux/intel-svm.h> 21 #include <linux/memory.h> 22 #include <linux/pci.h> 23 #include <linux/pci-ats.h> 24 #include <linux/spinlock.h> 25 #include <linux/syscore_ops.h> 26 #include <linux/tboot.h> 27 28 #include "iommu.h" 29 #include "../irq_remapping.h" 30 #include "../iommu-sva-lib.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
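   For example, mm_to_dma_pfn() below just shifts by (PAGE_SHIFT - VTD_PAGE_SHIFT):
   with 4KiB kernel pages that shift is zero, while a 64KiB PAGE_SIZE kernel
   would map each MM pfn onto sixteen 4KiB VT-d pfns.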
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 static void __init check_tylersburg_isoch(void); 130 static int rwbf_quirk; 131 132 /* 133 * set to 1 to panic kernel if can't successfully enable VT-d 134 * (used when kernel is launched w/ TXT) 135 */ 136 static int force_on = 0; 137 static int intel_iommu_tboot_noforce; 138 static int no_platform_optin; 139 140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 141 142 /* 143 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 144 * if marked present. 145 */ 146 static phys_addr_t root_entry_lctp(struct root_entry *re) 147 { 148 if (!(re->lo & 1)) 149 return 0; 150 151 return re->lo & VTD_PAGE_MASK; 152 } 153 154 /* 155 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 156 * if marked present. 157 */ 158 static phys_addr_t root_entry_uctp(struct root_entry *re) 159 { 160 if (!(re->hi & 1)) 161 return 0; 162 163 return re->hi & VTD_PAGE_MASK; 164 } 165 166 static inline void context_clear_pasid_enable(struct context_entry *context) 167 { 168 context->lo &= ~(1ULL << 11); 169 } 170 171 static inline bool context_pasid_enabled(struct context_entry *context) 172 { 173 return !!(context->lo & (1ULL << 11)); 174 } 175 176 static inline void context_set_copied(struct context_entry *context) 177 { 178 context->hi |= (1ull << 3); 179 } 180 181 static inline bool context_copied(struct context_entry *context) 182 { 183 return !!(context->hi & (1ULL << 3)); 184 } 185 186 static inline bool __context_present(struct context_entry *context) 187 { 188 return (context->lo & 1); 189 } 190 191 bool context_present(struct context_entry *context) 192 { 193 return context_pasid_enabled(context) ? 194 __context_present(context) : 195 __context_present(context) && !context_copied(context); 196 } 197 198 static inline void context_set_present(struct context_entry *context) 199 { 200 context->lo |= 1; 201 } 202 203 static inline void context_set_fault_enable(struct context_entry *context) 204 { 205 context->lo &= (((u64)-1) << 2) | 1; 206 } 207 208 static inline void context_set_translation_type(struct context_entry *context, 209 unsigned long value) 210 { 211 context->lo &= (((u64)-1) << 4) | 3; 212 context->lo |= (value & 3) << 2; 213 } 214 215 static inline void context_set_address_root(struct context_entry *context, 216 unsigned long value) 217 { 218 context->lo &= ~VTD_PAGE_MASK; 219 context->lo |= value & VTD_PAGE_MASK; 220 } 221 222 static inline void context_set_address_width(struct context_entry *context, 223 unsigned long value) 224 { 225 context->hi |= value & 7; 226 } 227 228 static inline void context_set_domain_id(struct context_entry *context, 229 unsigned long value) 230 { 231 context->hi |= (value & ((1 << 16) - 1)) << 8; 232 } 233 234 static inline int context_domain_id(struct context_entry *c) 235 { 236 return((c->hi >> 8) & 0xffff); 237 } 238 239 static inline void context_clear_entry(struct context_entry *context) 240 { 241 context->lo = 0; 242 context->hi = 0; 243 } 244 245 /* 246 * This domain is a statically identity mapping domain. 247 * 1. This domain creats a static 1:1 mapping to all usable memory. 248 * 2. It maps to each iommu if successful. 249 * 3. 
Each iommu mapps to this domain if successful. 250 */ 251 static struct dmar_domain *si_domain; 252 static int hw_pass_through = 1; 253 254 struct dmar_rmrr_unit { 255 struct list_head list; /* list of rmrr units */ 256 struct acpi_dmar_header *hdr; /* ACPI header */ 257 u64 base_address; /* reserved base address*/ 258 u64 end_address; /* reserved end address */ 259 struct dmar_dev_scope *devices; /* target devices */ 260 int devices_cnt; /* target device count */ 261 }; 262 263 struct dmar_atsr_unit { 264 struct list_head list; /* list of ATSR units */ 265 struct acpi_dmar_header *hdr; /* ACPI header */ 266 struct dmar_dev_scope *devices; /* target devices */ 267 int devices_cnt; /* target device count */ 268 u8 include_all:1; /* include all ports */ 269 }; 270 271 struct dmar_satc_unit { 272 struct list_head list; /* list of SATC units */ 273 struct acpi_dmar_header *hdr; /* ACPI header */ 274 struct dmar_dev_scope *devices; /* target devices */ 275 struct intel_iommu *iommu; /* the corresponding iommu */ 276 int devices_cnt; /* target device count */ 277 u8 atc_required:1; /* ATS is required */ 278 }; 279 280 static LIST_HEAD(dmar_atsr_units); 281 static LIST_HEAD(dmar_rmrr_units); 282 static LIST_HEAD(dmar_satc_units); 283 284 #define for_each_rmrr_units(rmrr) \ 285 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 286 287 static void dmar_remove_one_dev_info(struct device *dev); 288 289 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 290 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 291 292 int intel_iommu_enabled = 0; 293 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 294 295 static int dmar_map_gfx = 1; 296 static int intel_iommu_superpage = 1; 297 static int iommu_identity_mapping; 298 static int iommu_skip_te_disable; 299 300 #define IDENTMAP_GFX 2 301 #define IDENTMAP_AZALIA 4 302 303 const struct iommu_ops intel_iommu_ops; 304 305 static bool translation_pre_enabled(struct intel_iommu *iommu) 306 { 307 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 308 } 309 310 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 311 { 312 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 313 } 314 315 static void init_translation_status(struct intel_iommu *iommu) 316 { 317 u32 gsts; 318 319 gsts = readl(iommu->reg + DMAR_GSTS_REG); 320 if (gsts & DMA_GSTS_TES) 321 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 322 } 323 324 static int __init intel_iommu_setup(char *str) 325 { 326 if (!str) 327 return -EINVAL; 328 329 while (*str) { 330 if (!strncmp(str, "on", 2)) { 331 dmar_disabled = 0; 332 pr_info("IOMMU enabled\n"); 333 } else if (!strncmp(str, "off", 3)) { 334 dmar_disabled = 1; 335 no_platform_optin = 1; 336 pr_info("IOMMU disabled\n"); 337 } else if (!strncmp(str, "igfx_off", 8)) { 338 dmar_map_gfx = 0; 339 pr_info("Disable GFX device mapping\n"); 340 } else if (!strncmp(str, "forcedac", 8)) { 341 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 342 iommu_dma_forcedac = true; 343 } else if (!strncmp(str, "strict", 6)) { 344 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 345 iommu_set_dma_strict(); 346 } else if (!strncmp(str, "sp_off", 6)) { 347 pr_info("Disable supported super page\n"); 348 intel_iommu_superpage = 0; 349 } else if (!strncmp(str, "sm_on", 5)) { 350 pr_info("Enable scalable mode if hardware supports\n"); 351 intel_iommu_sm = 1; 352 } else if (!strncmp(str, "sm_off", 6)) { 353 pr_info("Scalable mode is disallowed\n"); 354 intel_iommu_sm = 0; 355 } else if (!strncmp(str, 
"tboot_noforce", 13)) { 356 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 357 intel_iommu_tboot_noforce = 1; 358 } else { 359 pr_notice("Unknown option - '%s'\n", str); 360 } 361 362 str += strcspn(str, ","); 363 while (*str == ',') 364 str++; 365 } 366 367 return 1; 368 } 369 __setup("intel_iommu=", intel_iommu_setup); 370 371 void *alloc_pgtable_page(int node) 372 { 373 struct page *page; 374 void *vaddr = NULL; 375 376 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 377 if (page) 378 vaddr = page_address(page); 379 return vaddr; 380 } 381 382 void free_pgtable_page(void *vaddr) 383 { 384 free_page((unsigned long)vaddr); 385 } 386 387 static inline int domain_type_is_si(struct dmar_domain *domain) 388 { 389 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 390 } 391 392 static inline bool domain_use_first_level(struct dmar_domain *domain) 393 { 394 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 395 } 396 397 static inline int domain_pfn_supported(struct dmar_domain *domain, 398 unsigned long pfn) 399 { 400 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 401 402 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 403 } 404 405 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 406 { 407 unsigned long sagaw; 408 int agaw; 409 410 sagaw = cap_sagaw(iommu->cap); 411 for (agaw = width_to_agaw(max_gaw); 412 agaw >= 0; agaw--) { 413 if (test_bit(agaw, &sagaw)) 414 break; 415 } 416 417 return agaw; 418 } 419 420 /* 421 * Calculate max SAGAW for each iommu. 422 */ 423 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 424 { 425 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 426 } 427 428 /* 429 * calculate agaw for each iommu. 430 * "SAGAW" may be different across iommus, use a default agaw, and 431 * get a supported less agaw for iommus that don't support the default agaw. 432 */ 433 int iommu_calculate_agaw(struct intel_iommu *iommu) 434 { 435 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 436 } 437 438 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 439 { 440 return sm_supported(iommu) ? 
441 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 442 } 443 444 static void domain_update_iommu_coherency(struct dmar_domain *domain) 445 { 446 struct iommu_domain_info *info; 447 struct dmar_drhd_unit *drhd; 448 struct intel_iommu *iommu; 449 bool found = false; 450 unsigned long i; 451 452 domain->iommu_coherency = true; 453 xa_for_each(&domain->iommu_array, i, info) { 454 found = true; 455 if (!iommu_paging_structure_coherency(info->iommu)) { 456 domain->iommu_coherency = false; 457 break; 458 } 459 } 460 if (found) 461 return; 462 463 /* No hardware attached; use lowest common denominator */ 464 rcu_read_lock(); 465 for_each_active_iommu(iommu, drhd) { 466 if (!iommu_paging_structure_coherency(iommu)) { 467 domain->iommu_coherency = false; 468 break; 469 } 470 } 471 rcu_read_unlock(); 472 } 473 474 static int domain_update_iommu_superpage(struct dmar_domain *domain, 475 struct intel_iommu *skip) 476 { 477 struct dmar_drhd_unit *drhd; 478 struct intel_iommu *iommu; 479 int mask = 0x3; 480 481 if (!intel_iommu_superpage) 482 return 0; 483 484 /* set iommu_superpage to the smallest common denominator */ 485 rcu_read_lock(); 486 for_each_active_iommu(iommu, drhd) { 487 if (iommu != skip) { 488 if (domain && domain_use_first_level(domain)) { 489 if (!cap_fl1gp_support(iommu->cap)) 490 mask = 0x1; 491 } else { 492 mask &= cap_super_page_val(iommu->cap); 493 } 494 495 if (!mask) 496 break; 497 } 498 } 499 rcu_read_unlock(); 500 501 return fls(mask); 502 } 503 504 static int domain_update_device_node(struct dmar_domain *domain) 505 { 506 struct device_domain_info *info; 507 int nid = NUMA_NO_NODE; 508 509 spin_lock(&domain->lock); 510 list_for_each_entry(info, &domain->devices, link) { 511 /* 512 * There could possibly be multiple device numa nodes as devices 513 * within the same domain may sit behind different IOMMUs. There 514 * isn't perfect answer in such situation, so we select first 515 * come first served policy. 516 */ 517 nid = dev_to_node(info->dev); 518 if (nid != NUMA_NO_NODE) 519 break; 520 } 521 spin_unlock(&domain->lock); 522 523 return nid; 524 } 525 526 static void domain_update_iotlb(struct dmar_domain *domain); 527 528 /* Return the super pagesize bitmap if supported. */ 529 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 530 { 531 unsigned long bitmap = 0; 532 533 /* 534 * 1-level super page supports page size of 2MiB, 2-level super page 535 * supports page size of both 2MiB and 1GiB. 536 */ 537 if (domain->iommu_superpage == 1) 538 bitmap |= SZ_2M; 539 else if (domain->iommu_superpage == 2) 540 bitmap |= SZ_2M | SZ_1G; 541 542 return bitmap; 543 } 544 545 /* Some capabilities may be different across iommus */ 546 static void domain_update_iommu_cap(struct dmar_domain *domain) 547 { 548 domain_update_iommu_coherency(domain); 549 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 550 551 /* 552 * If RHSA is missing, we should default to the device numa domain 553 * as fall back. 554 */ 555 if (domain->nid == NUMA_NO_NODE) 556 domain->nid = domain_update_device_node(domain); 557 558 /* 559 * First-level translation restricts the input-address to a 560 * canonical address (i.e., address bits 63:N have the same 561 * value as address bit [N-1], where N is 48-bits with 4-level 562 * paging and 57-bits with 5-level paging). Hence, skip bit 563 * [N-1]. 
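	 * For example, a first-level domain with a 48-bit guest address
	 * width gets an aperture capped at __DOMAIN_MAX_ADDR(47) below,
	 * so bit 47 is never used as an input-address bit.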
564 */ 565 if (domain_use_first_level(domain)) 566 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 567 else 568 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 569 570 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 571 domain_update_iotlb(domain); 572 } 573 574 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 575 u8 devfn, int alloc) 576 { 577 struct root_entry *root = &iommu->root_entry[bus]; 578 struct context_entry *context; 579 u64 *entry; 580 581 entry = &root->lo; 582 if (sm_supported(iommu)) { 583 if (devfn >= 0x80) { 584 devfn -= 0x80; 585 entry = &root->hi; 586 } 587 devfn *= 2; 588 } 589 if (*entry & 1) 590 context = phys_to_virt(*entry & VTD_PAGE_MASK); 591 else { 592 unsigned long phy_addr; 593 if (!alloc) 594 return NULL; 595 596 context = alloc_pgtable_page(iommu->node); 597 if (!context) 598 return NULL; 599 600 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 601 phy_addr = virt_to_phys((void *)context); 602 *entry = phy_addr | 1; 603 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 604 } 605 return &context[devfn]; 606 } 607 608 /** 609 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 610 * sub-hierarchy of a candidate PCI-PCI bridge 611 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 612 * @bridge: the candidate PCI-PCI bridge 613 * 614 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 615 */ 616 static bool 617 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 618 { 619 struct pci_dev *pdev, *pbridge; 620 621 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 622 return false; 623 624 pdev = to_pci_dev(dev); 625 pbridge = to_pci_dev(bridge); 626 627 if (pbridge->subordinate && 628 pbridge->subordinate->number <= pdev->bus->number && 629 pbridge->subordinate->busn_res.end >= pdev->bus->number) 630 return true; 631 632 return false; 633 } 634 635 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 636 { 637 struct dmar_drhd_unit *drhd; 638 u32 vtbar; 639 int rc; 640 641 /* We know that this device on this chipset has its own IOMMU. 642 * If we find it under a different IOMMU, then the BIOS is lying 643 * to us. Hope that the IOMMU for this device is actually 644 * disabled, and it needs no translation... 
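	 * The check below reads the chipset's VTBAR from config offset 0xb0
	 * and expects the DRHD matched via the scope tables to live at
	 * VTBAR + 0xa000; any other distance is flagged as a firmware bug.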
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches.
*/ 719 if (pdev && pdev->is_virtfn) 720 goto got_pdev; 721 722 if (bus && devfn) { 723 *bus = drhd->devices[i].bus; 724 *devfn = drhd->devices[i].devfn; 725 } 726 goto out; 727 } 728 729 if (is_downstream_to_pci_bridge(dev, tmp)) 730 goto got_pdev; 731 } 732 733 if (pdev && drhd->include_all) { 734 got_pdev: 735 if (bus && devfn) { 736 *bus = pdev->bus->number; 737 *devfn = pdev->devfn; 738 } 739 goto out; 740 } 741 } 742 iommu = NULL; 743 out: 744 if (iommu_is_dummy(iommu, dev)) 745 iommu = NULL; 746 747 rcu_read_unlock(); 748 749 return iommu; 750 } 751 752 static void domain_flush_cache(struct dmar_domain *domain, 753 void *addr, int size) 754 { 755 if (!domain->iommu_coherency) 756 clflush_cache_range(addr, size); 757 } 758 759 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 760 { 761 struct context_entry *context; 762 int ret = 0; 763 764 spin_lock(&iommu->lock); 765 context = iommu_context_addr(iommu, bus, devfn, 0); 766 if (context) 767 ret = context_present(context); 768 spin_unlock(&iommu->lock); 769 return ret; 770 } 771 772 static void free_context_table(struct intel_iommu *iommu) 773 { 774 struct context_entry *context; 775 int i; 776 777 if (!iommu->root_entry) 778 return; 779 780 for (i = 0; i < ROOT_ENTRY_NR; i++) { 781 context = iommu_context_addr(iommu, i, 0, 0); 782 if (context) 783 free_pgtable_page(context); 784 785 if (!sm_supported(iommu)) 786 continue; 787 788 context = iommu_context_addr(iommu, i, 0x80, 0); 789 if (context) 790 free_pgtable_page(context); 791 } 792 793 free_pgtable_page(iommu->root_entry); 794 iommu->root_entry = NULL; 795 } 796 797 #ifdef CONFIG_DMAR_DEBUG 798 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 799 { 800 struct device_domain_info *info; 801 struct dma_pte *parent, *pte; 802 struct dmar_domain *domain; 803 struct pci_dev *pdev; 804 int offset, level; 805 806 pdev = pci_get_domain_bus_and_slot(iommu->segment, bus, devfn); 807 if (!pdev) 808 return; 809 810 info = dev_iommu_priv_get(&pdev->dev); 811 if (!info || !info->domain) { 812 pr_info("device [%02x:%02x.%d] not probed\n", 813 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 814 return; 815 } 816 817 domain = info->domain; 818 level = agaw_to_level(domain->agaw); 819 parent = domain->pgd; 820 if (!parent) { 821 pr_info("no page table setup\n"); 822 return; 823 } 824 825 while (1) { 826 offset = pfn_level_offset(pfn, level); 827 pte = &parent[offset]; 828 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 829 pr_info("PTE not present at level %d\n", level); 830 break; 831 } 832 833 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 834 835 if (level == 1) 836 break; 837 838 parent = phys_to_virt(dma_pte_addr(pte)); 839 level--; 840 } 841 } 842 843 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 844 unsigned long long addr, u32 pasid) 845 { 846 struct pasid_dir_entry *dir, *pde; 847 struct pasid_entry *entries, *pte; 848 struct context_entry *ctx_entry; 849 struct root_entry *rt_entry; 850 u8 devfn = source_id & 0xff; 851 u8 bus = source_id >> 8; 852 int i, dir_index, index; 853 854 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 855 856 /* root entry dump */ 857 rt_entry = &iommu->root_entry[bus]; 858 if (!rt_entry) { 859 pr_info("root table entry is not present\n"); 860 return; 861 } 862 863 if (sm_supported(iommu)) 864 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 865 rt_entry->hi, rt_entry->lo); 866 else 867 pr_info("root entry: 
0x%016llx", rt_entry->lo); 868 869 /* context entry dump */ 870 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 871 if (!ctx_entry) { 872 pr_info("context table entry is not present\n"); 873 return; 874 } 875 876 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 877 ctx_entry->hi, ctx_entry->lo); 878 879 /* legacy mode does not require PASID entries */ 880 if (!sm_supported(iommu)) 881 goto pgtable_walk; 882 883 /* get the pointer to pasid directory entry */ 884 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 885 if (!dir) { 886 pr_info("pasid directory entry is not present\n"); 887 return; 888 } 889 /* For request-without-pasid, get the pasid from context entry */ 890 if (intel_iommu_sm && pasid == INVALID_IOASID) 891 pasid = PASID_RID2PASID; 892 893 dir_index = pasid >> PASID_PDE_SHIFT; 894 pde = &dir[dir_index]; 895 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 896 897 /* get the pointer to the pasid table entry */ 898 entries = get_pasid_table_from_pde(pde); 899 if (!entries) { 900 pr_info("pasid table entry is not present\n"); 901 return; 902 } 903 index = pasid & PASID_PTE_MASK; 904 pte = &entries[index]; 905 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 906 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 907 908 pgtable_walk: 909 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 910 } 911 #endif 912 913 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 914 unsigned long pfn, int *target_level) 915 { 916 struct dma_pte *parent, *pte; 917 int level = agaw_to_level(domain->agaw); 918 int offset; 919 920 BUG_ON(!domain->pgd); 921 922 if (!domain_pfn_supported(domain, pfn)) 923 /* Address beyond IOMMU's addressing capabilities. */ 924 return NULL; 925 926 parent = domain->pgd; 927 928 while (1) { 929 void *tmp_page; 930 931 offset = pfn_level_offset(pfn, level); 932 pte = &parent[offset]; 933 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 934 break; 935 if (level == *target_level) 936 break; 937 938 if (!dma_pte_present(pte)) { 939 uint64_t pteval; 940 941 tmp_page = alloc_pgtable_page(domain->nid); 942 943 if (!tmp_page) 944 return NULL; 945 946 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 947 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 948 if (domain_use_first_level(domain)) { 949 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 950 if (iommu_is_dma_domain(&domain->domain)) 951 pteval |= DMA_FL_PTE_ACCESS; 952 } 953 if (cmpxchg64(&pte->val, 0ULL, pteval)) 954 /* Someone else set it while we were thinking; use theirs. 
*/ 955 free_pgtable_page(tmp_page); 956 else 957 domain_flush_cache(domain, pte, sizeof(*pte)); 958 } 959 if (level == 1) 960 break; 961 962 parent = phys_to_virt(dma_pte_addr(pte)); 963 level--; 964 } 965 966 if (!*target_level) 967 *target_level = level; 968 969 return pte; 970 } 971 972 /* return address's pte at specific level */ 973 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 974 unsigned long pfn, 975 int level, int *large_page) 976 { 977 struct dma_pte *parent, *pte; 978 int total = agaw_to_level(domain->agaw); 979 int offset; 980 981 parent = domain->pgd; 982 while (level <= total) { 983 offset = pfn_level_offset(pfn, total); 984 pte = &parent[offset]; 985 if (level == total) 986 return pte; 987 988 if (!dma_pte_present(pte)) { 989 *large_page = total; 990 break; 991 } 992 993 if (dma_pte_superpage(pte)) { 994 *large_page = total; 995 return pte; 996 } 997 998 parent = phys_to_virt(dma_pte_addr(pte)); 999 total--; 1000 } 1001 return NULL; 1002 } 1003 1004 /* clear last level pte, a tlb flush should be followed */ 1005 static void dma_pte_clear_range(struct dmar_domain *domain, 1006 unsigned long start_pfn, 1007 unsigned long last_pfn) 1008 { 1009 unsigned int large_page; 1010 struct dma_pte *first_pte, *pte; 1011 1012 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1013 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1014 BUG_ON(start_pfn > last_pfn); 1015 1016 /* we don't need lock here; nobody else touches the iova range */ 1017 do { 1018 large_page = 1; 1019 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1020 if (!pte) { 1021 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1022 continue; 1023 } 1024 do { 1025 dma_clear_pte(pte); 1026 start_pfn += lvl_to_nr_pages(large_page); 1027 pte++; 1028 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1029 1030 domain_flush_cache(domain, first_pte, 1031 (void *)pte - (void *)first_pte); 1032 1033 } while (start_pfn && start_pfn <= last_pfn); 1034 } 1035 1036 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1037 int retain_level, struct dma_pte *pte, 1038 unsigned long pfn, unsigned long start_pfn, 1039 unsigned long last_pfn) 1040 { 1041 pfn = max(start_pfn, pfn); 1042 pte = &pte[pfn_level_offset(pfn, level)]; 1043 1044 do { 1045 unsigned long level_pfn; 1046 struct dma_pte *level_pte; 1047 1048 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1049 goto next; 1050 1051 level_pfn = pfn & level_mask(level); 1052 level_pte = phys_to_virt(dma_pte_addr(pte)); 1053 1054 if (level > 2) { 1055 dma_pte_free_level(domain, level - 1, retain_level, 1056 level_pte, level_pfn, start_pfn, 1057 last_pfn); 1058 } 1059 1060 /* 1061 * Free the page table if we're below the level we want to 1062 * retain and the range covers the entire table. 1063 */ 1064 if (level < retain_level && !(start_pfn > level_pfn || 1065 last_pfn < level_pfn + level_size(level) - 1)) { 1066 dma_clear_pte(pte); 1067 domain_flush_cache(domain, pte, sizeof(*pte)); 1068 free_pgtable_page(level_pte); 1069 } 1070 next: 1071 pfn += level_size(level); 1072 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1073 } 1074 1075 /* 1076 * clear last level (leaf) ptes and free page table pages below the 1077 * level we wish to keep intact. 
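 * For example, switch_to_super_page() calls this with retain_level set to
 * one level above the new superpage, so the small-page tables being
 * replaced are freed while the table that will hold the large PTE is kept.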
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}

static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done.
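   Callers therefore collect the pages on the passed-in freelist and only
   release them with put_pages_list() once it is safe to do so (after the
   IOTLB flush, or at domain teardown as in domain_exit()).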
*/ 1170 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1171 unsigned long last_pfn, struct list_head *freelist) 1172 { 1173 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1174 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1175 BUG_ON(start_pfn > last_pfn); 1176 1177 /* we don't need lock here; nobody else touches the iova range */ 1178 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1179 domain->pgd, 0, start_pfn, last_pfn, freelist); 1180 1181 /* free pgd */ 1182 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1183 struct page *pgd_page = virt_to_page(domain->pgd); 1184 list_add_tail(&pgd_page->lru, freelist); 1185 domain->pgd = NULL; 1186 } 1187 } 1188 1189 /* iommu handling */ 1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1191 { 1192 struct root_entry *root; 1193 1194 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1195 if (!root) { 1196 pr_err("Allocating root entry for %s failed\n", 1197 iommu->name); 1198 return -ENOMEM; 1199 } 1200 1201 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1202 iommu->root_entry = root; 1203 1204 return 0; 1205 } 1206 1207 static void iommu_set_root_entry(struct intel_iommu *iommu) 1208 { 1209 u64 addr; 1210 u32 sts; 1211 unsigned long flag; 1212 1213 addr = virt_to_phys(iommu->root_entry); 1214 if (sm_supported(iommu)) 1215 addr |= DMA_RTADDR_SMT; 1216 1217 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1218 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1219 1220 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1221 1222 /* Make sure hardware complete it */ 1223 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1224 readl, (sts & DMA_GSTS_RTPS), sts); 1225 1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1227 1228 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1229 if (sm_supported(iommu)) 1230 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1231 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1232 } 1233 1234 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1235 { 1236 u32 val; 1237 unsigned long flag; 1238 1239 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1240 return; 1241 1242 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1243 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1244 1245 /* Make sure hardware complete it */ 1246 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1247 readl, (!(val & DMA_GSTS_WBFS)), val); 1248 1249 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1250 } 1251 1252 /* return value determine if we need a write buffer flush */ 1253 static void __iommu_flush_context(struct intel_iommu *iommu, 1254 u16 did, u16 source_id, u8 function_mask, 1255 u64 type) 1256 { 1257 u64 val = 0; 1258 unsigned long flag; 1259 1260 switch (type) { 1261 case DMA_CCMD_GLOBAL_INVL: 1262 val = DMA_CCMD_GLOBAL_INVL; 1263 break; 1264 case DMA_CCMD_DOMAIN_INVL: 1265 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1266 break; 1267 case DMA_CCMD_DEVICE_INVL: 1268 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1269 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1270 break; 1271 default: 1272 BUG(); 1273 } 1274 val |= DMA_CCMD_ICC; 1275 1276 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1277 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1278 1279 /* Make sure hardware complete it */ 1280 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1281 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1282 1283 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1284 } 1285 1286 /* return value determine 
if we need a write buffer flush */ 1287 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1288 u64 addr, unsigned int size_order, u64 type) 1289 { 1290 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1291 u64 val = 0, val_iva = 0; 1292 unsigned long flag; 1293 1294 switch (type) { 1295 case DMA_TLB_GLOBAL_FLUSH: 1296 /* global flush doesn't need set IVA_REG */ 1297 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1298 break; 1299 case DMA_TLB_DSI_FLUSH: 1300 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1301 break; 1302 case DMA_TLB_PSI_FLUSH: 1303 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1304 /* IH bit is passed in as part of address */ 1305 val_iva = size_order | addr; 1306 break; 1307 default: 1308 BUG(); 1309 } 1310 /* Note: set drain read/write */ 1311 #if 0 1312 /* 1313 * This is probably to be super secure.. Looks like we can 1314 * ignore it without any impact. 1315 */ 1316 if (cap_read_drain(iommu->cap)) 1317 val |= DMA_TLB_READ_DRAIN; 1318 #endif 1319 if (cap_write_drain(iommu->cap)) 1320 val |= DMA_TLB_WRITE_DRAIN; 1321 1322 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1323 /* Note: Only uses first TLB reg currently */ 1324 if (val_iva) 1325 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1326 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1327 1328 /* Make sure hardware complete it */ 1329 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1330 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1331 1332 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1333 1334 /* check IOTLB invalidation granularity */ 1335 if (DMA_TLB_IAIG(val) == 0) 1336 pr_err("Flush IOTLB failed\n"); 1337 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1338 pr_debug("TLB flush request %Lx, actual %Lx\n", 1339 (unsigned long long)DMA_TLB_IIRG(type), 1340 (unsigned long long)DMA_TLB_IAIG(val)); 1341 } 1342 1343 static struct device_domain_info * 1344 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu, 1345 u8 bus, u8 devfn) 1346 { 1347 struct device_domain_info *info; 1348 1349 if (!iommu->qi) 1350 return NULL; 1351 1352 spin_lock(&domain->lock); 1353 list_for_each_entry(info, &domain->devices, link) { 1354 if (info->iommu == iommu && info->bus == bus && 1355 info->devfn == devfn) { 1356 spin_unlock(&domain->lock); 1357 return info->ats_supported ? info : NULL; 1358 } 1359 } 1360 spin_unlock(&domain->lock); 1361 1362 return NULL; 1363 } 1364 1365 static void domain_update_iotlb(struct dmar_domain *domain) 1366 { 1367 struct device_domain_info *info; 1368 bool has_iotlb_device = false; 1369 1370 spin_lock(&domain->lock); 1371 list_for_each_entry(info, &domain->devices, link) { 1372 if (info->ats_enabled) { 1373 has_iotlb_device = true; 1374 break; 1375 } 1376 } 1377 domain->has_iotlb_device = has_iotlb_device; 1378 spin_unlock(&domain->lock); 1379 } 1380 1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1382 { 1383 struct pci_dev *pdev; 1384 1385 if (!info || !dev_is_pci(info->dev)) 1386 return; 1387 1388 pdev = to_pci_dev(info->dev); 1389 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1390 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1391 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1392 * reserved, which should be set to 0. 
1393 */ 1394 if (!ecap_dit(info->iommu->ecap)) 1395 info->pfsid = 0; 1396 else { 1397 struct pci_dev *pf_pdev; 1398 1399 /* pdev will be returned if device is not a vf */ 1400 pf_pdev = pci_physfn(pdev); 1401 info->pfsid = pci_dev_id(pf_pdev); 1402 } 1403 1404 #ifdef CONFIG_INTEL_IOMMU_SVM 1405 /* The PCIe spec, in its wisdom, declares that the behaviour of 1406 the device if you enable PASID support after ATS support is 1407 undefined. So always enable PASID support on devices which 1408 have it, even if we can't yet know if we're ever going to 1409 use it. */ 1410 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1411 info->pasid_enabled = 1; 1412 1413 if (info->pri_supported && 1414 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1415 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1416 info->pri_enabled = 1; 1417 #endif 1418 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1419 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1420 info->ats_enabled = 1; 1421 domain_update_iotlb(info->domain); 1422 info->ats_qdep = pci_ats_queue_depth(pdev); 1423 } 1424 } 1425 1426 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1427 { 1428 struct pci_dev *pdev; 1429 1430 if (!dev_is_pci(info->dev)) 1431 return; 1432 1433 pdev = to_pci_dev(info->dev); 1434 1435 if (info->ats_enabled) { 1436 pci_disable_ats(pdev); 1437 info->ats_enabled = 0; 1438 domain_update_iotlb(info->domain); 1439 } 1440 #ifdef CONFIG_INTEL_IOMMU_SVM 1441 if (info->pri_enabled) { 1442 pci_disable_pri(pdev); 1443 info->pri_enabled = 0; 1444 } 1445 if (info->pasid_enabled) { 1446 pci_disable_pasid(pdev); 1447 info->pasid_enabled = 0; 1448 } 1449 #endif 1450 } 1451 1452 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1453 u64 addr, unsigned int mask) 1454 { 1455 u16 sid, qdep; 1456 1457 if (!info || !info->ats_enabled) 1458 return; 1459 1460 sid = info->bus << 8 | info->devfn; 1461 qdep = info->ats_qdep; 1462 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1463 qdep, addr, mask); 1464 } 1465 1466 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1467 u64 addr, unsigned mask) 1468 { 1469 struct device_domain_info *info; 1470 1471 if (!domain->has_iotlb_device) 1472 return; 1473 1474 spin_lock(&domain->lock); 1475 list_for_each_entry(info, &domain->devices, link) 1476 __iommu_flush_dev_iotlb(info, addr, mask); 1477 spin_unlock(&domain->lock); 1478 } 1479 1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1481 struct dmar_domain *domain, 1482 unsigned long pfn, unsigned int pages, 1483 int ih, int map) 1484 { 1485 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1486 unsigned int mask = ilog2(aligned_pages); 1487 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1488 u16 did = domain_id_iommu(domain, iommu); 1489 1490 BUG_ON(pages == 0); 1491 1492 if (ih) 1493 ih = 1 << 6; 1494 1495 if (domain_use_first_level(domain)) { 1496 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1497 } else { 1498 unsigned long bitmask = aligned_pages - 1; 1499 1500 /* 1501 * PSI masks the low order bits of the base address. If the 1502 * address isn't aligned to the mask, then compute a mask value 1503 * needed to ensure the target range is flushed. 1504 */ 1505 if (unlikely(bitmask & pfn)) { 1506 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1507 1508 /* 1509 * Since end_pfn <= pfn + bitmask, the only way bits 1510 * higher than bitmask can differ in pfn and end_pfn is 1511 * by carrying. 
This means after masking out bitmask, 1512 * high bits starting with the first set bit in 1513 * shared_bits are all equal in both pfn and end_pfn. 1514 */ 1515 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1516 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1517 } 1518 1519 /* 1520 * Fallback to domain selective flush if no PSI support or 1521 * the size is too big. 1522 */ 1523 if (!cap_pgsel_inv(iommu->cap) || 1524 mask > cap_max_amask_val(iommu->cap)) 1525 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1526 DMA_TLB_DSI_FLUSH); 1527 else 1528 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1529 DMA_TLB_PSI_FLUSH); 1530 } 1531 1532 /* 1533 * In caching mode, changes of pages from non-present to present require 1534 * flush. However, device IOTLB doesn't need to be flushed in this case. 1535 */ 1536 if (!cap_caching_mode(iommu->cap) || !map) 1537 iommu_flush_dev_iotlb(domain, addr, mask); 1538 } 1539 1540 /* Notification for newly created mappings */ 1541 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1542 struct dmar_domain *domain, 1543 unsigned long pfn, unsigned int pages) 1544 { 1545 /* 1546 * It's a non-present to present mapping. Only flush if caching mode 1547 * and second level. 1548 */ 1549 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1550 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1551 else 1552 iommu_flush_write_buffer(iommu); 1553 } 1554 1555 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1556 { 1557 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1558 struct iommu_domain_info *info; 1559 unsigned long idx; 1560 1561 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1562 struct intel_iommu *iommu = info->iommu; 1563 u16 did = domain_id_iommu(dmar_domain, iommu); 1564 1565 if (domain_use_first_level(dmar_domain)) 1566 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1567 else 1568 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1569 DMA_TLB_DSI_FLUSH); 1570 1571 if (!cap_caching_mode(iommu->cap)) 1572 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1573 } 1574 } 1575 1576 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1577 { 1578 u32 pmen; 1579 unsigned long flags; 1580 1581 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1582 return; 1583 1584 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1585 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1586 pmen &= ~DMA_PMEN_EPM; 1587 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1588 1589 /* wait for the protected region status bit to clear */ 1590 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1591 readl, !(pmen & DMA_PMEN_PRS), pmen); 1592 1593 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1594 } 1595 1596 static void iommu_enable_translation(struct intel_iommu *iommu) 1597 { 1598 u32 sts; 1599 unsigned long flags; 1600 1601 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1602 iommu->gcmd |= DMA_GCMD_TE; 1603 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1604 1605 /* Make sure hardware complete it */ 1606 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1607 readl, (sts & DMA_GSTS_TES), sts); 1608 1609 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1610 } 1611 1612 static void iommu_disable_translation(struct intel_iommu *iommu) 1613 { 1614 u32 sts; 1615 unsigned long flag; 1616 1617 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1618 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1619 return; 1620 1621 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1622 iommu->gcmd 
&= ~DMA_GCMD_TE; 1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1624 1625 /* Make sure hardware complete it */ 1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1627 readl, (!(sts & DMA_GSTS_TES)), sts); 1628 1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1630 } 1631 1632 static int iommu_init_domains(struct intel_iommu *iommu) 1633 { 1634 u32 ndomains; 1635 1636 ndomains = cap_ndoms(iommu->cap); 1637 pr_debug("%s: Number of Domains supported <%d>\n", 1638 iommu->name, ndomains); 1639 1640 spin_lock_init(&iommu->lock); 1641 1642 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1643 if (!iommu->domain_ids) 1644 return -ENOMEM; 1645 1646 /* 1647 * If Caching mode is set, then invalid translations are tagged 1648 * with domain-id 0, hence we need to pre-allocate it. We also 1649 * use domain-id 0 as a marker for non-allocated domain-id, so 1650 * make sure it is not used for a real domain. 1651 */ 1652 set_bit(0, iommu->domain_ids); 1653 1654 /* 1655 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1656 * entry for first-level or pass-through translation modes should 1657 * be programmed with a domain id different from those used for 1658 * second-level or nested translation. We reserve a domain id for 1659 * this purpose. 1660 */ 1661 if (sm_supported(iommu)) 1662 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1663 1664 return 0; 1665 } 1666 1667 static void disable_dmar_iommu(struct intel_iommu *iommu) 1668 { 1669 if (!iommu->domain_ids) 1670 return; 1671 1672 /* 1673 * All iommu domains must have been detached from the devices, 1674 * hence there should be no domain IDs in use. 1675 */ 1676 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1677 > NUM_RESERVED_DID)) 1678 return; 1679 1680 if (iommu->gcmd & DMA_GCMD_TE) 1681 iommu_disable_translation(iommu); 1682 } 1683 1684 static void free_dmar_iommu(struct intel_iommu *iommu) 1685 { 1686 if (iommu->domain_ids) { 1687 bitmap_free(iommu->domain_ids); 1688 iommu->domain_ids = NULL; 1689 } 1690 1691 /* free context mapping */ 1692 free_context_table(iommu); 1693 1694 #ifdef CONFIG_INTEL_IOMMU_SVM 1695 if (pasid_supported(iommu)) { 1696 if (ecap_prs(iommu->ecap)) 1697 intel_svm_finish_prq(iommu); 1698 } 1699 if (vccap_pasid(iommu->vccap)) 1700 ioasid_unregister_allocator(&iommu->pasid_allocator); 1701 1702 #endif 1703 } 1704 1705 /* 1706 * Check and return whether first level is used by default for 1707 * DMA translation. 
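 * In short: legacy (non-scalable) mode can only use second level; if the
 * IOMMUs support just one of the two levels, that one is used; otherwise
 * first level is preferred for every domain type except unmanaged ones.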
1708 */ 1709 static bool first_level_by_default(unsigned int type) 1710 { 1711 /* Only SL is available in legacy mode */ 1712 if (!scalable_mode_support()) 1713 return false; 1714 1715 /* Only level (either FL or SL) is available, just use it */ 1716 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1717 return intel_cap_flts_sanity(); 1718 1719 /* Both levels are available, decide it based on domain type */ 1720 return type != IOMMU_DOMAIN_UNMANAGED; 1721 } 1722 1723 static struct dmar_domain *alloc_domain(unsigned int type) 1724 { 1725 struct dmar_domain *domain; 1726 1727 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1728 if (!domain) 1729 return NULL; 1730 1731 domain->nid = NUMA_NO_NODE; 1732 if (first_level_by_default(type)) 1733 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1734 domain->has_iotlb_device = false; 1735 INIT_LIST_HEAD(&domain->devices); 1736 spin_lock_init(&domain->lock); 1737 xa_init(&domain->iommu_array); 1738 1739 return domain; 1740 } 1741 1742 static int domain_attach_iommu(struct dmar_domain *domain, 1743 struct intel_iommu *iommu) 1744 { 1745 struct iommu_domain_info *info, *curr; 1746 unsigned long ndomains; 1747 int num, ret = -ENOSPC; 1748 1749 info = kzalloc(sizeof(*info), GFP_KERNEL); 1750 if (!info) 1751 return -ENOMEM; 1752 1753 spin_lock(&iommu->lock); 1754 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1755 if (curr) { 1756 curr->refcnt++; 1757 spin_unlock(&iommu->lock); 1758 kfree(info); 1759 return 0; 1760 } 1761 1762 ndomains = cap_ndoms(iommu->cap); 1763 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1764 if (num >= ndomains) { 1765 pr_err("%s: No free domain ids\n", iommu->name); 1766 goto err_unlock; 1767 } 1768 1769 set_bit(num, iommu->domain_ids); 1770 info->refcnt = 1; 1771 info->did = num; 1772 info->iommu = iommu; 1773 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1774 NULL, info, GFP_ATOMIC); 1775 if (curr) { 1776 ret = xa_err(curr) ? : -EBUSY; 1777 goto err_clear; 1778 } 1779 domain_update_iommu_cap(domain); 1780 1781 spin_unlock(&iommu->lock); 1782 return 0; 1783 1784 err_clear: 1785 clear_bit(info->did, iommu->domain_ids); 1786 err_unlock: 1787 spin_unlock(&iommu->lock); 1788 kfree(info); 1789 return ret; 1790 } 1791 1792 static void domain_detach_iommu(struct dmar_domain *domain, 1793 struct intel_iommu *iommu) 1794 { 1795 struct iommu_domain_info *info; 1796 1797 spin_lock(&iommu->lock); 1798 info = xa_load(&domain->iommu_array, iommu->seq_id); 1799 if (--info->refcnt == 0) { 1800 clear_bit(info->did, iommu->domain_ids); 1801 xa_erase(&domain->iommu_array, iommu->seq_id); 1802 domain->nid = NUMA_NO_NODE; 1803 domain_update_iommu_cap(domain); 1804 kfree(info); 1805 } 1806 spin_unlock(&iommu->lock); 1807 } 1808 1809 static inline int guestwidth_to_adjustwidth(int gaw) 1810 { 1811 int agaw; 1812 int r = (gaw - 12) % 9; 1813 1814 if (r == 0) 1815 agaw = gaw; 1816 else 1817 agaw = gaw + 9 - r; 1818 if (agaw > 64) 1819 agaw = 64; 1820 return agaw; 1821 } 1822 1823 static void domain_exit(struct dmar_domain *domain) 1824 { 1825 if (domain->pgd) { 1826 LIST_HEAD(freelist); 1827 1828 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1829 put_pages_list(&freelist); 1830 } 1831 1832 if (WARN_ON(!list_empty(&domain->devices))) 1833 return; 1834 1835 kfree(domain); 1836 } 1837 1838 /* 1839 * Get the PASID directory size for scalable mode context entry. 1840 * Value of X in the PDTS field of a scalable mode context entry 1841 * indicates PASID directory with 2^(X + 7) entries. 
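 * Worked example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASID-table
 * entries per directory entry): a table with max_pasid of 2^20 needs a
 * 2^14-entry directory, so context_get_sm_pds() returns 14 - 7 = 7.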
1842 */ 1843 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1844 { 1845 unsigned long pds, max_pde; 1846 1847 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1848 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1849 if (pds < 7) 1850 return 0; 1851 1852 return pds - 7; 1853 } 1854 1855 /* 1856 * Set the RID_PASID field of a scalable mode context entry. The 1857 * IOMMU hardware will use the PASID value set in this field for 1858 * DMA translations of DMA requests without PASID. 1859 */ 1860 static inline void 1861 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1862 { 1863 context->hi |= pasid & ((1 << 20) - 1); 1864 } 1865 1866 /* 1867 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1868 * entry. 1869 */ 1870 static inline void context_set_sm_dte(struct context_entry *context) 1871 { 1872 context->lo |= (1 << 2); 1873 } 1874 1875 /* 1876 * Set the PRE(Page Request Enable) field of a scalable mode context 1877 * entry. 1878 */ 1879 static inline void context_set_sm_pre(struct context_entry *context) 1880 { 1881 context->lo |= (1 << 4); 1882 } 1883 1884 /* Convert value to context PASID directory size field coding. */ 1885 #define context_pdts(pds) (((pds) & 0x7) << 9) 1886 1887 static int domain_context_mapping_one(struct dmar_domain *domain, 1888 struct intel_iommu *iommu, 1889 struct pasid_table *table, 1890 u8 bus, u8 devfn) 1891 { 1892 struct device_domain_info *info = 1893 iommu_support_dev_iotlb(domain, iommu, bus, devfn); 1894 u16 did = domain_id_iommu(domain, iommu); 1895 int translation = CONTEXT_TT_MULTI_LEVEL; 1896 struct context_entry *context; 1897 int ret; 1898 1899 WARN_ON(did == 0); 1900 1901 if (hw_pass_through && domain_type_is_si(domain)) 1902 translation = CONTEXT_TT_PASS_THROUGH; 1903 1904 pr_debug("Set context mapping for %02x:%02x.%d\n", 1905 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1906 1907 BUG_ON(!domain->pgd); 1908 1909 spin_lock(&iommu->lock); 1910 ret = -ENOMEM; 1911 context = iommu_context_addr(iommu, bus, devfn, 1); 1912 if (!context) 1913 goto out_unlock; 1914 1915 ret = 0; 1916 if (context_present(context)) 1917 goto out_unlock; 1918 1919 /* 1920 * For kdump cases, old valid entries may be cached due to the 1921 * in-flight DMA and copied pgtable, but there is no unmapping 1922 * behaviour for them, thus we need an explicit cache flush for 1923 * the newly-mapped device. For kdump, at this point, the device 1924 * is supposed to finish reset at its driver probe stage, so no 1925 * in-flight DMA will exist, and we don't need to worry anymore 1926 * hereafter. 
1927 */ 1928 if (context_copied(context)) { 1929 u16 did_old = context_domain_id(context); 1930 1931 if (did_old < cap_ndoms(iommu->cap)) { 1932 iommu->flush.flush_context(iommu, did_old, 1933 (((u16)bus) << 8) | devfn, 1934 DMA_CCMD_MASK_NOBIT, 1935 DMA_CCMD_DEVICE_INVL); 1936 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1937 DMA_TLB_DSI_FLUSH); 1938 } 1939 } 1940 1941 context_clear_entry(context); 1942 1943 if (sm_supported(iommu)) { 1944 unsigned long pds; 1945 1946 WARN_ON(!table); 1947 1948 /* Setup the PASID DIR pointer: */ 1949 pds = context_get_sm_pds(table); 1950 context->lo = (u64)virt_to_phys(table->table) | 1951 context_pdts(pds); 1952 1953 /* Setup the RID_PASID field: */ 1954 context_set_sm_rid2pasid(context, PASID_RID2PASID); 1955 1956 /* 1957 * Setup the Device-TLB enable bit and Page request 1958 * Enable bit: 1959 */ 1960 if (info && info->ats_supported) 1961 context_set_sm_dte(context); 1962 if (info && info->pri_supported) 1963 context_set_sm_pre(context); 1964 } else { 1965 struct dma_pte *pgd = domain->pgd; 1966 int agaw; 1967 1968 context_set_domain_id(context, did); 1969 1970 if (translation != CONTEXT_TT_PASS_THROUGH) { 1971 /* 1972 * Skip top levels of page tables for iommu which has 1973 * less agaw than default. Unnecessary for PT mode. 1974 */ 1975 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1976 ret = -ENOMEM; 1977 pgd = phys_to_virt(dma_pte_addr(pgd)); 1978 if (!dma_pte_present(pgd)) 1979 goto out_unlock; 1980 } 1981 1982 if (info && info->ats_supported) 1983 translation = CONTEXT_TT_DEV_IOTLB; 1984 else 1985 translation = CONTEXT_TT_MULTI_LEVEL; 1986 1987 context_set_address_root(context, virt_to_phys(pgd)); 1988 context_set_address_width(context, agaw); 1989 } else { 1990 /* 1991 * In pass through mode, AW must be programmed to 1992 * indicate the largest AGAW value supported by 1993 * hardware. And ASR is ignored by hardware. 1994 */ 1995 context_set_address_width(context, iommu->msagaw); 1996 } 1997 1998 context_set_translation_type(context, translation); 1999 } 2000 2001 context_set_fault_enable(context); 2002 context_set_present(context); 2003 if (!ecap_coherent(iommu->ecap)) 2004 clflush_cache_range(context, sizeof(*context)); 2005 2006 /* 2007 * It's a non-present to present mapping. If hardware doesn't cache 2008 * non-present entry we only need to flush the write-buffer. 
If the 2009 * _does_ cache non-present entries, then it does so in the special 2010 * domain #0, which we have to flush: 2011 */ 2012 if (cap_caching_mode(iommu->cap)) { 2013 iommu->flush.flush_context(iommu, 0, 2014 (((u16)bus) << 8) | devfn, 2015 DMA_CCMD_MASK_NOBIT, 2016 DMA_CCMD_DEVICE_INVL); 2017 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2018 } else { 2019 iommu_flush_write_buffer(iommu); 2020 } 2021 iommu_enable_dev_iotlb(info); 2022 2023 ret = 0; 2024 2025 out_unlock: 2026 spin_unlock(&iommu->lock); 2027 2028 return ret; 2029 } 2030 2031 struct domain_context_mapping_data { 2032 struct dmar_domain *domain; 2033 struct intel_iommu *iommu; 2034 struct pasid_table *table; 2035 }; 2036 2037 static int domain_context_mapping_cb(struct pci_dev *pdev, 2038 u16 alias, void *opaque) 2039 { 2040 struct domain_context_mapping_data *data = opaque; 2041 2042 return domain_context_mapping_one(data->domain, data->iommu, 2043 data->table, PCI_BUS_NUM(alias), 2044 alias & 0xff); 2045 } 2046 2047 static int 2048 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2049 { 2050 struct domain_context_mapping_data data; 2051 struct pasid_table *table; 2052 struct intel_iommu *iommu; 2053 u8 bus, devfn; 2054 2055 iommu = device_to_iommu(dev, &bus, &devfn); 2056 if (!iommu) 2057 return -ENODEV; 2058 2059 table = intel_pasid_get_table(dev); 2060 2061 if (!dev_is_pci(dev)) 2062 return domain_context_mapping_one(domain, iommu, table, 2063 bus, devfn); 2064 2065 data.domain = domain; 2066 data.iommu = iommu; 2067 data.table = table; 2068 2069 return pci_for_each_dma_alias(to_pci_dev(dev), 2070 &domain_context_mapping_cb, &data); 2071 } 2072 2073 static int domain_context_mapped_cb(struct pci_dev *pdev, 2074 u16 alias, void *opaque) 2075 { 2076 struct intel_iommu *iommu = opaque; 2077 2078 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2079 } 2080 2081 static int domain_context_mapped(struct device *dev) 2082 { 2083 struct intel_iommu *iommu; 2084 u8 bus, devfn; 2085 2086 iommu = device_to_iommu(dev, &bus, &devfn); 2087 if (!iommu) 2088 return -ENODEV; 2089 2090 if (!dev_is_pci(dev)) 2091 return device_context_mapped(iommu, bus, devfn); 2092 2093 return !pci_for_each_dma_alias(to_pci_dev(dev), 2094 domain_context_mapped_cb, iommu); 2095 } 2096 2097 /* Returns a number of VTD pages, but aligned to MM page size */ 2098 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2099 size_t size) 2100 { 2101 host_addr &= ~PAGE_MASK; 2102 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2103 } 2104 2105 /* Return largest possible superpage level for a given mapping */ 2106 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2107 unsigned long iov_pfn, 2108 unsigned long phy_pfn, 2109 unsigned long pages) 2110 { 2111 int support, level = 1; 2112 unsigned long pfnmerge; 2113 2114 support = domain->iommu_superpage; 2115 2116 /* To use a large page, the virtual *and* physical addresses 2117 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2118 of them will mean we have to use smaller pages. So just 2119 merge them and check both at once. */ 2120 pfnmerge = iov_pfn | phy_pfn; 2121 2122 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2123 pages >>= VTD_STRIDE_SHIFT; 2124 if (!pages) 2125 break; 2126 pfnmerge >>= VTD_STRIDE_SHIFT; 2127 level++; 2128 support--; 2129 } 2130 return level; 2131 } 2132 2133 /* 2134 * Ensure that old small page tables are removed to make room for superpage(s). 
2135 * We're going to add new large pages, so make sure we don't remove their parent 2136 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2137 */ 2138 static void switch_to_super_page(struct dmar_domain *domain, 2139 unsigned long start_pfn, 2140 unsigned long end_pfn, int level) 2141 { 2142 unsigned long lvl_pages = lvl_to_nr_pages(level); 2143 struct iommu_domain_info *info; 2144 struct dma_pte *pte = NULL; 2145 unsigned long i; 2146 2147 while (start_pfn <= end_pfn) { 2148 if (!pte) 2149 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2150 2151 if (dma_pte_present(pte)) { 2152 dma_pte_free_pagetable(domain, start_pfn, 2153 start_pfn + lvl_pages - 1, 2154 level + 1); 2155 2156 xa_for_each(&domain->iommu_array, i, info) 2157 iommu_flush_iotlb_psi(info->iommu, domain, 2158 start_pfn, lvl_pages, 2159 0, 0); 2160 } 2161 2162 pte++; 2163 start_pfn += lvl_pages; 2164 if (first_pte_in_page(pte)) 2165 pte = NULL; 2166 } 2167 } 2168 2169 static int 2170 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2171 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2172 { 2173 struct dma_pte *first_pte = NULL, *pte = NULL; 2174 unsigned int largepage_lvl = 0; 2175 unsigned long lvl_pages = 0; 2176 phys_addr_t pteval; 2177 u64 attr; 2178 2179 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2180 2181 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2182 return -EINVAL; 2183 2184 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2185 attr |= DMA_FL_PTE_PRESENT; 2186 if (domain_use_first_level(domain)) { 2187 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2188 if (prot & DMA_PTE_WRITE) 2189 attr |= DMA_FL_PTE_DIRTY; 2190 } 2191 2192 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2193 2194 while (nr_pages > 0) { 2195 uint64_t tmp; 2196 2197 if (!pte) { 2198 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2199 phys_pfn, nr_pages); 2200 2201 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2202 if (!pte) 2203 return -ENOMEM; 2204 first_pte = pte; 2205 2206 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2207 2208 /* It is large page*/ 2209 if (largepage_lvl > 1) { 2210 unsigned long end_pfn; 2211 unsigned long pages_to_remove; 2212 2213 pteval |= DMA_PTE_LARGE_PAGE; 2214 pages_to_remove = min_t(unsigned long, nr_pages, 2215 nr_pte_to_next_page(pte) * lvl_pages); 2216 end_pfn = iov_pfn + pages_to_remove - 1; 2217 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2218 } else { 2219 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2220 } 2221 2222 } 2223 /* We don't need lock here, nobody else 2224 * touches the iova range 2225 */ 2226 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2227 if (tmp) { 2228 static int dumps = 5; 2229 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2230 iov_pfn, tmp, (unsigned long long)pteval); 2231 if (dumps) { 2232 dumps--; 2233 debug_dma_dump_mappings(NULL); 2234 } 2235 WARN_ON(1); 2236 } 2237 2238 nr_pages -= lvl_pages; 2239 iov_pfn += lvl_pages; 2240 phys_pfn += lvl_pages; 2241 pteval += lvl_pages * VTD_PAGE_SIZE; 2242 2243 /* If the next PTE would be the first in a new page, then we 2244 * need to flush the cache on the entries we've just written. 2245 * And then we'll need to recalculate 'pte', so clear it and 2246 * let it get set again in the if (!pte) block above. 2247 * 2248 * If we're done (!nr_pages) we need to flush the cache too. 
2249 * 2250 * Also if we've been setting superpages, we may need to 2251 * recalculate 'pte' and switch back to smaller pages for the 2252 * end of the mapping, if the trailing size is not enough to 2253 * use another superpage (i.e. nr_pages < lvl_pages). 2254 */ 2255 pte++; 2256 if (!nr_pages || first_pte_in_page(pte) || 2257 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2258 domain_flush_cache(domain, first_pte, 2259 (void *)pte - (void *)first_pte); 2260 pte = NULL; 2261 } 2262 } 2263 2264 return 0; 2265 } 2266 2267 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2268 { 2269 struct intel_iommu *iommu = info->iommu; 2270 struct context_entry *context; 2271 u16 did_old; 2272 2273 if (!iommu) 2274 return; 2275 2276 spin_lock(&iommu->lock); 2277 context = iommu_context_addr(iommu, bus, devfn, 0); 2278 if (!context) { 2279 spin_unlock(&iommu->lock); 2280 return; 2281 } 2282 2283 if (sm_supported(iommu)) { 2284 if (hw_pass_through && domain_type_is_si(info->domain)) 2285 did_old = FLPT_DEFAULT_DID; 2286 else 2287 did_old = domain_id_iommu(info->domain, iommu); 2288 } else { 2289 did_old = context_domain_id(context); 2290 } 2291 2292 context_clear_entry(context); 2293 __iommu_flush_cache(iommu, context, sizeof(*context)); 2294 spin_unlock(&iommu->lock); 2295 iommu->flush.flush_context(iommu, 2296 did_old, 2297 (((u16)bus) << 8) | devfn, 2298 DMA_CCMD_MASK_NOBIT, 2299 DMA_CCMD_DEVICE_INVL); 2300 2301 if (sm_supported(iommu)) 2302 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2303 2304 iommu->flush.flush_iotlb(iommu, 2305 did_old, 2306 0, 2307 0, 2308 DMA_TLB_DSI_FLUSH); 2309 2310 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2311 } 2312 2313 static int domain_setup_first_level(struct intel_iommu *iommu, 2314 struct dmar_domain *domain, 2315 struct device *dev, 2316 u32 pasid) 2317 { 2318 struct dma_pte *pgd = domain->pgd; 2319 int agaw, level; 2320 int flags = 0; 2321 2322 /* 2323 * Skip top levels of page tables for iommu which has 2324 * less agaw than default. Unnecessary for PT mode. 
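	 * For example, a domain whose page table was built five levels deep
	 * (domain->agaw == 3, 57-bit input width) but attached through an
	 * IOMMU that only supports a four-level walk (iommu->agaw == 2) is
	 * walked down one level here, so the PASID entry below is programmed
	 * with a four-level (48-bit) table.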
2325 */ 2326 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2327 pgd = phys_to_virt(dma_pte_addr(pgd)); 2328 if (!dma_pte_present(pgd)) 2329 return -ENOMEM; 2330 } 2331 2332 level = agaw_to_level(agaw); 2333 if (level != 4 && level != 5) 2334 return -EINVAL; 2335 2336 if (pasid != PASID_RID2PASID) 2337 flags |= PASID_FLAG_SUPERVISOR_MODE; 2338 if (level == 5) 2339 flags |= PASID_FLAG_FL5LP; 2340 2341 if (domain->force_snooping) 2342 flags |= PASID_FLAG_PAGE_SNOOP; 2343 2344 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2345 domain_id_iommu(domain, iommu), 2346 flags); 2347 } 2348 2349 static bool dev_is_real_dma_subdevice(struct device *dev) 2350 { 2351 return dev && dev_is_pci(dev) && 2352 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2353 } 2354 2355 static int iommu_domain_identity_map(struct dmar_domain *domain, 2356 unsigned long first_vpfn, 2357 unsigned long last_vpfn) 2358 { 2359 /* 2360 * RMRR range might have overlap with physical memory range, 2361 * clear it first 2362 */ 2363 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2364 2365 return __domain_mapping(domain, first_vpfn, 2366 first_vpfn, last_vpfn - first_vpfn + 1, 2367 DMA_PTE_READ|DMA_PTE_WRITE); 2368 } 2369 2370 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2371 2372 static int __init si_domain_init(int hw) 2373 { 2374 struct dmar_rmrr_unit *rmrr; 2375 struct device *dev; 2376 int i, nid, ret; 2377 2378 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2379 if (!si_domain) 2380 return -EFAULT; 2381 2382 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2383 domain_exit(si_domain); 2384 return -EFAULT; 2385 } 2386 2387 if (hw) 2388 return 0; 2389 2390 for_each_online_node(nid) { 2391 unsigned long start_pfn, end_pfn; 2392 int i; 2393 2394 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2395 ret = iommu_domain_identity_map(si_domain, 2396 mm_to_dma_pfn(start_pfn), 2397 mm_to_dma_pfn(end_pfn)); 2398 if (ret) 2399 return ret; 2400 } 2401 } 2402 2403 /* 2404 * Identity map the RMRRs so that devices with RMRRs could also use 2405 * the si_domain. 2406 */ 2407 for_each_rmrr_units(rmrr) { 2408 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2409 i, dev) { 2410 unsigned long long start = rmrr->base_address; 2411 unsigned long long end = rmrr->end_address; 2412 2413 if (WARN_ON(end < start || 2414 end >> agaw_to_width(si_domain->agaw))) 2415 continue; 2416 2417 ret = iommu_domain_identity_map(si_domain, 2418 mm_to_dma_pfn(start >> PAGE_SHIFT), 2419 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2420 if (ret) 2421 return ret; 2422 } 2423 } 2424 2425 return 0; 2426 } 2427 2428 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2429 { 2430 struct device_domain_info *info = dev_iommu_priv_get(dev); 2431 struct intel_iommu *iommu; 2432 u8 bus, devfn; 2433 int ret; 2434 2435 iommu = device_to_iommu(dev, &bus, &devfn); 2436 if (!iommu) 2437 return -ENODEV; 2438 2439 ret = domain_attach_iommu(domain, iommu); 2440 if (ret) 2441 return ret; 2442 info->domain = domain; 2443 spin_lock(&domain->lock); 2444 list_add(&info->link, &domain->devices); 2445 spin_unlock(&domain->lock); 2446 2447 /* PASID table is mandatory for a PCI device in scalable mode. 
*/ 2448 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2449 ret = intel_pasid_alloc_table(dev); 2450 if (ret) { 2451 dev_err(dev, "PASID table allocation failed\n"); 2452 dmar_remove_one_dev_info(dev); 2453 return ret; 2454 } 2455 2456 /* Setup the PASID entry for requests without PASID: */ 2457 if (hw_pass_through && domain_type_is_si(domain)) 2458 ret = intel_pasid_setup_pass_through(iommu, domain, 2459 dev, PASID_RID2PASID); 2460 else if (domain_use_first_level(domain)) 2461 ret = domain_setup_first_level(iommu, domain, dev, 2462 PASID_RID2PASID); 2463 else 2464 ret = intel_pasid_setup_second_level(iommu, domain, 2465 dev, PASID_RID2PASID); 2466 if (ret) { 2467 dev_err(dev, "Setup RID2PASID failed\n"); 2468 dmar_remove_one_dev_info(dev); 2469 return ret; 2470 } 2471 } 2472 2473 ret = domain_context_mapping(domain, dev); 2474 if (ret) { 2475 dev_err(dev, "Domain context map failed\n"); 2476 dmar_remove_one_dev_info(dev); 2477 return ret; 2478 } 2479 2480 return 0; 2481 } 2482 2483 static bool device_has_rmrr(struct device *dev) 2484 { 2485 struct dmar_rmrr_unit *rmrr; 2486 struct device *tmp; 2487 int i; 2488 2489 rcu_read_lock(); 2490 for_each_rmrr_units(rmrr) { 2491 /* 2492 * Return TRUE if this RMRR contains the device that 2493 * is passed in. 2494 */ 2495 for_each_active_dev_scope(rmrr->devices, 2496 rmrr->devices_cnt, i, tmp) 2497 if (tmp == dev || 2498 is_downstream_to_pci_bridge(dev, tmp)) { 2499 rcu_read_unlock(); 2500 return true; 2501 } 2502 } 2503 rcu_read_unlock(); 2504 return false; 2505 } 2506 2507 /** 2508 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2509 * is relaxable (ie. is allowed to be not enforced under some conditions) 2510 * @dev: device handle 2511 * 2512 * We assume that PCI USB devices with RMRRs have them largely 2513 * for historical reasons and that the RMRR space is not actively used post 2514 * boot. This exclusion may change if vendors begin to abuse it. 2515 * 2516 * The same exception is made for graphics devices, with the requirement that 2517 * any use of the RMRR regions will be torn down before assigning the device 2518 * to a guest. 2519 * 2520 * Return: true if the RMRR is relaxable, false otherwise 2521 */ 2522 static bool device_rmrr_is_relaxable(struct device *dev) 2523 { 2524 struct pci_dev *pdev; 2525 2526 if (!dev_is_pci(dev)) 2527 return false; 2528 2529 pdev = to_pci_dev(dev); 2530 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2531 return true; 2532 else 2533 return false; 2534 } 2535 2536 /* 2537 * There are a couple cases where we need to restrict the functionality of 2538 * devices associated with RMRRs. The first is when evaluating a device for 2539 * identity mapping because problems exist when devices are moved in and out 2540 * of domains and their respective RMRR information is lost. This means that 2541 * a device with associated RMRRs will never be in a "passthrough" domain. 2542 * The second is use of the device through the IOMMU API. This interface 2543 * expects to have full control of the IOVA space for the device. We cannot 2544 * satisfy both the requirement that RMRR access is maintained and have an 2545 * unencumbered IOVA space. We also have no ability to quiesce the device's 2546 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2547 * We therefore prevent devices associated with an RMRR from participating in 2548 * the IOMMU API, which eliminates them from device assignment. 
2549 * 2550 * In both cases, devices which have relaxable RMRRs are not concerned by this 2551 * restriction. See device_rmrr_is_relaxable comment. 2552 */ 2553 static bool device_is_rmrr_locked(struct device *dev) 2554 { 2555 if (!device_has_rmrr(dev)) 2556 return false; 2557 2558 if (device_rmrr_is_relaxable(dev)) 2559 return false; 2560 2561 return true; 2562 } 2563 2564 /* 2565 * Return the required default domain type for a specific device. 2566 * 2567 * @dev: the device in query 2568 * @startup: true if this is during early boot 2569 * 2570 * Returns: 2571 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2572 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2573 * - 0: both identity and dynamic domains work for this device 2574 */ 2575 static int device_def_domain_type(struct device *dev) 2576 { 2577 if (dev_is_pci(dev)) { 2578 struct pci_dev *pdev = to_pci_dev(dev); 2579 2580 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2581 return IOMMU_DOMAIN_IDENTITY; 2582 2583 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2584 return IOMMU_DOMAIN_IDENTITY; 2585 } 2586 2587 return 0; 2588 } 2589 2590 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2591 { 2592 /* 2593 * Start from the sane iommu hardware state. 2594 * If the queued invalidation is already initialized by us 2595 * (for example, while enabling interrupt-remapping) then 2596 * we got the things already rolling from a sane state. 2597 */ 2598 if (!iommu->qi) { 2599 /* 2600 * Clear any previous faults. 2601 */ 2602 dmar_fault(-1, iommu); 2603 /* 2604 * Disable queued invalidation if supported and already enabled 2605 * before OS handover. 2606 */ 2607 dmar_disable_qi(iommu); 2608 } 2609 2610 if (dmar_enable_qi(iommu)) { 2611 /* 2612 * Queued Invalidate not enabled, use Register Based Invalidate 2613 */ 2614 iommu->flush.flush_context = __iommu_flush_context; 2615 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2616 pr_info("%s: Using Register based invalidation\n", 2617 iommu->name); 2618 } else { 2619 iommu->flush.flush_context = qi_flush_context; 2620 iommu->flush.flush_iotlb = qi_flush_iotlb; 2621 pr_info("%s: Using Queued invalidation\n", iommu->name); 2622 } 2623 } 2624 2625 static int copy_context_table(struct intel_iommu *iommu, 2626 struct root_entry *old_re, 2627 struct context_entry **tbl, 2628 int bus, bool ext) 2629 { 2630 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2631 struct context_entry *new_ce = NULL, ce; 2632 struct context_entry *old_ce = NULL; 2633 struct root_entry re; 2634 phys_addr_t old_ce_phys; 2635 2636 tbl_idx = ext ? bus * 2 : bus; 2637 memcpy(&re, old_re, sizeof(re)); 2638 2639 for (devfn = 0; devfn < 256; devfn++) { 2640 /* First calculate the correct index */ 2641 idx = (ext ? 
devfn * 2 : devfn) % 256; 2642 2643 if (idx == 0) { 2644 /* First save what we may have and clean up */ 2645 if (new_ce) { 2646 tbl[tbl_idx] = new_ce; 2647 __iommu_flush_cache(iommu, new_ce, 2648 VTD_PAGE_SIZE); 2649 pos = 1; 2650 } 2651 2652 if (old_ce) 2653 memunmap(old_ce); 2654 2655 ret = 0; 2656 if (devfn < 0x80) 2657 old_ce_phys = root_entry_lctp(&re); 2658 else 2659 old_ce_phys = root_entry_uctp(&re); 2660 2661 if (!old_ce_phys) { 2662 if (ext && devfn == 0) { 2663 /* No LCTP, try UCTP */ 2664 devfn = 0x7f; 2665 continue; 2666 } else { 2667 goto out; 2668 } 2669 } 2670 2671 ret = -ENOMEM; 2672 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2673 MEMREMAP_WB); 2674 if (!old_ce) 2675 goto out; 2676 2677 new_ce = alloc_pgtable_page(iommu->node); 2678 if (!new_ce) 2679 goto out_unmap; 2680 2681 ret = 0; 2682 } 2683 2684 /* Now copy the context entry */ 2685 memcpy(&ce, old_ce + idx, sizeof(ce)); 2686 2687 if (!__context_present(&ce)) 2688 continue; 2689 2690 did = context_domain_id(&ce); 2691 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2692 set_bit(did, iommu->domain_ids); 2693 2694 /* 2695 * We need a marker for copied context entries. This 2696 * marker needs to work for the old format as well as 2697 * for extended context entries. 2698 * 2699 * Bit 67 of the context entry is used. In the old 2700 * format this bit is available to software, in the 2701 * extended format it is the PGE bit, but PGE is ignored 2702 * by HW if PASIDs are disabled (and thus still 2703 * available). 2704 * 2705 * So disable PASIDs first and then mark the entry 2706 * copied. This means that we don't copy PASID 2707 * translations from the old kernel, but this is fine as 2708 * faults there are not fatal. 2709 */ 2710 context_clear_pasid_enable(&ce); 2711 context_set_copied(&ce); 2712 2713 new_ce[idx] = ce; 2714 } 2715 2716 tbl[tbl_idx + pos] = new_ce; 2717 2718 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2719 2720 out_unmap: 2721 memunmap(old_ce); 2722 2723 out: 2724 return ret; 2725 } 2726 2727 static int copy_translation_tables(struct intel_iommu *iommu) 2728 { 2729 struct context_entry **ctxt_tbls; 2730 struct root_entry *old_rt; 2731 phys_addr_t old_rt_phys; 2732 int ctxt_table_entries; 2733 u64 rtaddr_reg; 2734 int bus, ret; 2735 bool new_ext, ext; 2736 2737 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2738 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2739 new_ext = !!ecap_ecs(iommu->ecap); 2740 2741 /* 2742 * The RTT bit can only be changed when translation is disabled, 2743 * but disabling translation means to open a window for data 2744 * corruption. So bail out and don't copy anything if we would 2745 * have to change the bit. 2746 */ 2747 if (new_ext != ext) 2748 return -EINVAL; 2749 2750 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2751 if (!old_rt_phys) 2752 return -EINVAL; 2753 2754 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2755 if (!old_rt) 2756 return -ENOMEM; 2757 2758 /* This is too big for the stack - allocate it from slab */ 2759 ctxt_table_entries = ext ? 
512 : 256; 2760 ret = -ENOMEM; 2761 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2762 if (!ctxt_tbls) 2763 goto out_unmap; 2764 2765 for (bus = 0; bus < 256; bus++) { 2766 ret = copy_context_table(iommu, &old_rt[bus], 2767 ctxt_tbls, bus, ext); 2768 if (ret) { 2769 pr_err("%s: Failed to copy context table for bus %d\n", 2770 iommu->name, bus); 2771 continue; 2772 } 2773 } 2774 2775 spin_lock(&iommu->lock); 2776 2777 /* Context tables are copied, now write them to the root_entry table */ 2778 for (bus = 0; bus < 256; bus++) { 2779 int idx = ext ? bus * 2 : bus; 2780 u64 val; 2781 2782 if (ctxt_tbls[idx]) { 2783 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2784 iommu->root_entry[bus].lo = val; 2785 } 2786 2787 if (!ext || !ctxt_tbls[idx + 1]) 2788 continue; 2789 2790 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2791 iommu->root_entry[bus].hi = val; 2792 } 2793 2794 spin_unlock(&iommu->lock); 2795 2796 kfree(ctxt_tbls); 2797 2798 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2799 2800 ret = 0; 2801 2802 out_unmap: 2803 memunmap(old_rt); 2804 2805 return ret; 2806 } 2807 2808 #ifdef CONFIG_INTEL_IOMMU_SVM 2809 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 2810 { 2811 struct intel_iommu *iommu = data; 2812 ioasid_t ioasid; 2813 2814 if (!iommu) 2815 return INVALID_IOASID; 2816 /* 2817 * VT-d virtual command interface always uses the full 20 bit 2818 * PASID range. Host can partition guest PASID range based on 2819 * policies but it is out of guest's control. 2820 */ 2821 if (min < PASID_MIN || max > intel_pasid_max_id) 2822 return INVALID_IOASID; 2823 2824 if (vcmd_alloc_pasid(iommu, &ioasid)) 2825 return INVALID_IOASID; 2826 2827 return ioasid; 2828 } 2829 2830 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 2831 { 2832 struct intel_iommu *iommu = data; 2833 2834 if (!iommu) 2835 return; 2836 /* 2837 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 2838 * We can only free the PASID when all the devices are unbound. 2839 */ 2840 if (ioasid_find(NULL, ioasid, NULL)) { 2841 pr_alert("Cannot free active IOASID %d\n", ioasid); 2842 return; 2843 } 2844 vcmd_free_pasid(iommu, ioasid); 2845 } 2846 2847 static void register_pasid_allocator(struct intel_iommu *iommu) 2848 { 2849 /* 2850 * If we are running in the host, no need for custom allocator 2851 * in that PASIDs are allocated from the host system-wide. 2852 */ 2853 if (!cap_caching_mode(iommu->cap)) 2854 return; 2855 2856 if (!sm_supported(iommu)) { 2857 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 2858 return; 2859 } 2860 2861 /* 2862 * Register a custom PASID allocator if we are running in a guest, 2863 * guest PASID must be obtained via virtual command interface. 2864 * There can be multiple vIOMMUs in each guest but only one allocator 2865 * is active. All vIOMMU allocators will eventually be calling the same 2866 * host allocator. 2867 */ 2868 if (!vccap_pasid(iommu->vccap)) 2869 return; 2870 2871 pr_info("Register custom PASID allocator\n"); 2872 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 2873 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 2874 iommu->pasid_allocator.pdata = (void *)iommu; 2875 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 2876 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 2877 /* 2878 * Disable scalable mode on this IOMMU if there 2879 * is no custom allocator. Mixing SM capable vIOMMU 2880 * and non-SM vIOMMU are not supported. 
2881 */ 2882 intel_iommu_sm = 0; 2883 } 2884 } 2885 #endif 2886 2887 static int __init init_dmars(void) 2888 { 2889 struct dmar_drhd_unit *drhd; 2890 struct intel_iommu *iommu; 2891 int ret; 2892 2893 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2894 if (ret) 2895 goto free_iommu; 2896 2897 for_each_iommu(iommu, drhd) { 2898 if (drhd->ignored) { 2899 iommu_disable_translation(iommu); 2900 continue; 2901 } 2902 2903 /* 2904 * Find the max pasid size of all IOMMU's in the system. 2905 * We need to ensure the system pasid table is no bigger 2906 * than the smallest supported. 2907 */ 2908 if (pasid_supported(iommu)) { 2909 u32 temp = 2 << ecap_pss(iommu->ecap); 2910 2911 intel_pasid_max_id = min_t(u32, temp, 2912 intel_pasid_max_id); 2913 } 2914 2915 intel_iommu_init_qi(iommu); 2916 2917 ret = iommu_init_domains(iommu); 2918 if (ret) 2919 goto free_iommu; 2920 2921 init_translation_status(iommu); 2922 2923 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2924 iommu_disable_translation(iommu); 2925 clear_translation_pre_enabled(iommu); 2926 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2927 iommu->name); 2928 } 2929 2930 /* 2931 * TBD: 2932 * we could share the same root & context tables 2933 * among all IOMMU's. Need to Split it later. 2934 */ 2935 ret = iommu_alloc_root_entry(iommu); 2936 if (ret) 2937 goto free_iommu; 2938 2939 if (translation_pre_enabled(iommu)) { 2940 pr_info("Translation already enabled - trying to copy translation structures\n"); 2941 2942 ret = copy_translation_tables(iommu); 2943 if (ret) { 2944 /* 2945 * We found the IOMMU with translation 2946 * enabled - but failed to copy over the 2947 * old root-entry table. Try to proceed 2948 * by disabling translation now and 2949 * allocating a clean root-entry table. 2950 * This might cause DMAR faults, but 2951 * probably the dump will still succeed. 2952 */ 2953 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2954 iommu->name); 2955 iommu_disable_translation(iommu); 2956 clear_translation_pre_enabled(iommu); 2957 } else { 2958 pr_info("Copied translation tables from previous kernel for %s\n", 2959 iommu->name); 2960 } 2961 } 2962 2963 if (!ecap_pass_through(iommu->ecap)) 2964 hw_pass_through = 0; 2965 intel_svm_check(iommu); 2966 } 2967 2968 /* 2969 * Now that qi is enabled on all iommus, set the root entry and flush 2970 * caches. This is required on some Intel X58 chipsets, otherwise the 2971 * flush_context function will loop forever and the boot hangs. 
2972 */ 2973 for_each_active_iommu(iommu, drhd) { 2974 iommu_flush_write_buffer(iommu); 2975 #ifdef CONFIG_INTEL_IOMMU_SVM 2976 register_pasid_allocator(iommu); 2977 #endif 2978 iommu_set_root_entry(iommu); 2979 } 2980 2981 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2982 dmar_map_gfx = 0; 2983 #endif 2984 2985 if (!dmar_map_gfx) 2986 iommu_identity_mapping |= IDENTMAP_GFX; 2987 2988 check_tylersburg_isoch(); 2989 2990 ret = si_domain_init(hw_pass_through); 2991 if (ret) 2992 goto free_iommu; 2993 2994 /* 2995 * for each drhd 2996 * enable fault log 2997 * global invalidate context cache 2998 * global invalidate iotlb 2999 * enable translation 3000 */ 3001 for_each_iommu(iommu, drhd) { 3002 if (drhd->ignored) { 3003 /* 3004 * we always have to disable PMRs or DMA may fail on 3005 * this device 3006 */ 3007 if (force_on) 3008 iommu_disable_protect_mem_regions(iommu); 3009 continue; 3010 } 3011 3012 iommu_flush_write_buffer(iommu); 3013 3014 #ifdef CONFIG_INTEL_IOMMU_SVM 3015 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3016 /* 3017 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3018 * could cause possible lock race condition. 3019 */ 3020 up_write(&dmar_global_lock); 3021 ret = intel_svm_enable_prq(iommu); 3022 down_write(&dmar_global_lock); 3023 if (ret) 3024 goto free_iommu; 3025 } 3026 #endif 3027 ret = dmar_set_interrupt(iommu); 3028 if (ret) 3029 goto free_iommu; 3030 } 3031 3032 return 0; 3033 3034 free_iommu: 3035 for_each_active_iommu(iommu, drhd) { 3036 disable_dmar_iommu(iommu); 3037 free_dmar_iommu(iommu); 3038 } 3039 3040 return ret; 3041 } 3042 3043 static void __init init_no_remapping_devices(void) 3044 { 3045 struct dmar_drhd_unit *drhd; 3046 struct device *dev; 3047 int i; 3048 3049 for_each_drhd_unit(drhd) { 3050 if (!drhd->include_all) { 3051 for_each_active_dev_scope(drhd->devices, 3052 drhd->devices_cnt, i, dev) 3053 break; 3054 /* ignore DMAR unit if no devices exist */ 3055 if (i == drhd->devices_cnt) 3056 drhd->ignored = 1; 3057 } 3058 } 3059 3060 for_each_active_drhd_unit(drhd) { 3061 if (drhd->include_all) 3062 continue; 3063 3064 for_each_active_dev_scope(drhd->devices, 3065 drhd->devices_cnt, i, dev) 3066 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3067 break; 3068 if (i < drhd->devices_cnt) 3069 continue; 3070 3071 /* This IOMMU has *only* gfx devices. 
Either bypass it or 3072 set the gfx_mapped flag, as appropriate */ 3073 drhd->gfx_dedicated = 1; 3074 if (!dmar_map_gfx) 3075 drhd->ignored = 1; 3076 } 3077 } 3078 3079 #ifdef CONFIG_SUSPEND 3080 static int init_iommu_hw(void) 3081 { 3082 struct dmar_drhd_unit *drhd; 3083 struct intel_iommu *iommu = NULL; 3084 3085 for_each_active_iommu(iommu, drhd) 3086 if (iommu->qi) 3087 dmar_reenable_qi(iommu); 3088 3089 for_each_iommu(iommu, drhd) { 3090 if (drhd->ignored) { 3091 /* 3092 * we always have to disable PMRs or DMA may fail on 3093 * this device 3094 */ 3095 if (force_on) 3096 iommu_disable_protect_mem_regions(iommu); 3097 continue; 3098 } 3099 3100 iommu_flush_write_buffer(iommu); 3101 iommu_set_root_entry(iommu); 3102 iommu_enable_translation(iommu); 3103 iommu_disable_protect_mem_regions(iommu); 3104 } 3105 3106 return 0; 3107 } 3108 3109 static void iommu_flush_all(void) 3110 { 3111 struct dmar_drhd_unit *drhd; 3112 struct intel_iommu *iommu; 3113 3114 for_each_active_iommu(iommu, drhd) { 3115 iommu->flush.flush_context(iommu, 0, 0, 0, 3116 DMA_CCMD_GLOBAL_INVL); 3117 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3118 DMA_TLB_GLOBAL_FLUSH); 3119 } 3120 } 3121 3122 static int iommu_suspend(void) 3123 { 3124 struct dmar_drhd_unit *drhd; 3125 struct intel_iommu *iommu = NULL; 3126 unsigned long flag; 3127 3128 for_each_active_iommu(iommu, drhd) { 3129 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3130 GFP_KERNEL); 3131 if (!iommu->iommu_state) 3132 goto nomem; 3133 } 3134 3135 iommu_flush_all(); 3136 3137 for_each_active_iommu(iommu, drhd) { 3138 iommu_disable_translation(iommu); 3139 3140 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3141 3142 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3143 readl(iommu->reg + DMAR_FECTL_REG); 3144 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3145 readl(iommu->reg + DMAR_FEDATA_REG); 3146 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3147 readl(iommu->reg + DMAR_FEADDR_REG); 3148 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3149 readl(iommu->reg + DMAR_FEUADDR_REG); 3150 3151 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3152 } 3153 return 0; 3154 3155 nomem: 3156 for_each_active_iommu(iommu, drhd) 3157 kfree(iommu->iommu_state); 3158 3159 return -ENOMEM; 3160 } 3161 3162 static void iommu_resume(void) 3163 { 3164 struct dmar_drhd_unit *drhd; 3165 struct intel_iommu *iommu = NULL; 3166 unsigned long flag; 3167 3168 if (init_iommu_hw()) { 3169 if (force_on) 3170 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3171 else 3172 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3173 return; 3174 } 3175 3176 for_each_active_iommu(iommu, drhd) { 3177 3178 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3179 3180 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3181 iommu->reg + DMAR_FECTL_REG); 3182 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3183 iommu->reg + DMAR_FEDATA_REG); 3184 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3185 iommu->reg + DMAR_FEADDR_REG); 3186 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3187 iommu->reg + DMAR_FEUADDR_REG); 3188 3189 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3190 } 3191 3192 for_each_active_iommu(iommu, drhd) 3193 kfree(iommu->iommu_state); 3194 } 3195 3196 static struct syscore_ops iommu_syscore_ops = { 3197 .resume = iommu_resume, 3198 .suspend = iommu_suspend, 3199 }; 3200 3201 static void __init init_iommu_pm_ops(void) 3202 { 3203 register_syscore_ops(&iommu_syscore_ops); 3204 } 3205 3206 #else 3207 static inline void init_iommu_pm_ops(void) {} 3208 #endif /* 
CONFIG_PM */ 3209 3210 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3211 { 3212 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3213 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3214 rmrr->end_address <= rmrr->base_address || 3215 arch_rmrr_sanity_check(rmrr)) 3216 return -EINVAL; 3217 3218 return 0; 3219 } 3220 3221 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3222 { 3223 struct acpi_dmar_reserved_memory *rmrr; 3224 struct dmar_rmrr_unit *rmrru; 3225 3226 rmrr = (struct acpi_dmar_reserved_memory *)header; 3227 if (rmrr_sanity_check(rmrr)) { 3228 pr_warn(FW_BUG 3229 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3230 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3231 rmrr->base_address, rmrr->end_address, 3232 dmi_get_system_info(DMI_BIOS_VENDOR), 3233 dmi_get_system_info(DMI_BIOS_VERSION), 3234 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3235 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3236 } 3237 3238 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3239 if (!rmrru) 3240 goto out; 3241 3242 rmrru->hdr = header; 3243 3244 rmrru->base_address = rmrr->base_address; 3245 rmrru->end_address = rmrr->end_address; 3246 3247 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3248 ((void *)rmrr) + rmrr->header.length, 3249 &rmrru->devices_cnt); 3250 if (rmrru->devices_cnt && rmrru->devices == NULL) 3251 goto free_rmrru; 3252 3253 list_add(&rmrru->list, &dmar_rmrr_units); 3254 3255 return 0; 3256 free_rmrru: 3257 kfree(rmrru); 3258 out: 3259 return -ENOMEM; 3260 } 3261 3262 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3263 { 3264 struct dmar_atsr_unit *atsru; 3265 struct acpi_dmar_atsr *tmp; 3266 3267 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3268 dmar_rcu_check()) { 3269 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3270 if (atsr->segment != tmp->segment) 3271 continue; 3272 if (atsr->header.length != tmp->header.length) 3273 continue; 3274 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3275 return atsru; 3276 } 3277 3278 return NULL; 3279 } 3280 3281 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3282 { 3283 struct acpi_dmar_atsr *atsr; 3284 struct dmar_atsr_unit *atsru; 3285 3286 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3287 return 0; 3288 3289 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3290 atsru = dmar_find_atsr(atsr); 3291 if (atsru) 3292 return 0; 3293 3294 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3295 if (!atsru) 3296 return -ENOMEM; 3297 3298 /* 3299 * If memory is allocated from slab by ACPI _DSM method, we need to 3300 * copy the memory content because the memory buffer will be freed 3301 * on return. 
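	 * That is why atsru was allocated with hdr->length extra bytes and
	 * the header is duplicated into that trailing space just below.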
3302 */ 3303 atsru->hdr = (void *)(atsru + 1); 3304 memcpy(atsru->hdr, hdr, hdr->length); 3305 atsru->include_all = atsr->flags & 0x1; 3306 if (!atsru->include_all) { 3307 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3308 (void *)atsr + atsr->header.length, 3309 &atsru->devices_cnt); 3310 if (atsru->devices_cnt && atsru->devices == NULL) { 3311 kfree(atsru); 3312 return -ENOMEM; 3313 } 3314 } 3315 3316 list_add_rcu(&atsru->list, &dmar_atsr_units); 3317 3318 return 0; 3319 } 3320 3321 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3322 { 3323 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3324 kfree(atsru); 3325 } 3326 3327 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3328 { 3329 struct acpi_dmar_atsr *atsr; 3330 struct dmar_atsr_unit *atsru; 3331 3332 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3333 atsru = dmar_find_atsr(atsr); 3334 if (atsru) { 3335 list_del_rcu(&atsru->list); 3336 synchronize_rcu(); 3337 intel_iommu_free_atsr(atsru); 3338 } 3339 3340 return 0; 3341 } 3342 3343 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3344 { 3345 int i; 3346 struct device *dev; 3347 struct acpi_dmar_atsr *atsr; 3348 struct dmar_atsr_unit *atsru; 3349 3350 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3351 atsru = dmar_find_atsr(atsr); 3352 if (!atsru) 3353 return 0; 3354 3355 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3356 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3357 i, dev) 3358 return -EBUSY; 3359 } 3360 3361 return 0; 3362 } 3363 3364 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3365 { 3366 struct dmar_satc_unit *satcu; 3367 struct acpi_dmar_satc *tmp; 3368 3369 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3370 dmar_rcu_check()) { 3371 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3372 if (satc->segment != tmp->segment) 3373 continue; 3374 if (satc->header.length != tmp->header.length) 3375 continue; 3376 if (memcmp(satc, tmp, satc->header.length) == 0) 3377 return satcu; 3378 } 3379 3380 return NULL; 3381 } 3382 3383 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3384 { 3385 struct acpi_dmar_satc *satc; 3386 struct dmar_satc_unit *satcu; 3387 3388 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3389 return 0; 3390 3391 satc = container_of(hdr, struct acpi_dmar_satc, header); 3392 satcu = dmar_find_satc(satc); 3393 if (satcu) 3394 return 0; 3395 3396 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3397 if (!satcu) 3398 return -ENOMEM; 3399 3400 satcu->hdr = (void *)(satcu + 1); 3401 memcpy(satcu->hdr, hdr, hdr->length); 3402 satcu->atc_required = satc->flags & 0x1; 3403 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3404 (void *)satc + satc->header.length, 3405 &satcu->devices_cnt); 3406 if (satcu->devices_cnt && !satcu->devices) { 3407 kfree(satcu); 3408 return -ENOMEM; 3409 } 3410 list_add_rcu(&satcu->list, &dmar_satc_units); 3411 3412 return 0; 3413 } 3414 3415 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3416 { 3417 int sp, ret; 3418 struct intel_iommu *iommu = dmaru->iommu; 3419 3420 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3421 if (ret) 3422 goto out; 3423 3424 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3425 pr_warn("%s: Doesn't support hardware pass through.\n", 3426 iommu->name); 3427 return -ENXIO; 3428 } 3429 3430 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3431 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3432 pr_warn("%s: Doesn't support large page.\n", 3433 iommu->name); 3434 return -ENXIO; 3435 } 3436 3437 /* 3438 * Disable translation if already enabled prior to OS handover. 3439 */ 3440 if (iommu->gcmd & DMA_GCMD_TE) 3441 iommu_disable_translation(iommu); 3442 3443 ret = iommu_init_domains(iommu); 3444 if (ret == 0) 3445 ret = iommu_alloc_root_entry(iommu); 3446 if (ret) 3447 goto out; 3448 3449 intel_svm_check(iommu); 3450 3451 if (dmaru->ignored) { 3452 /* 3453 * we always have to disable PMRs or DMA may fail on this device 3454 */ 3455 if (force_on) 3456 iommu_disable_protect_mem_regions(iommu); 3457 return 0; 3458 } 3459 3460 intel_iommu_init_qi(iommu); 3461 iommu_flush_write_buffer(iommu); 3462 3463 #ifdef CONFIG_INTEL_IOMMU_SVM 3464 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3465 ret = intel_svm_enable_prq(iommu); 3466 if (ret) 3467 goto disable_iommu; 3468 } 3469 #endif 3470 ret = dmar_set_interrupt(iommu); 3471 if (ret) 3472 goto disable_iommu; 3473 3474 iommu_set_root_entry(iommu); 3475 iommu_enable_translation(iommu); 3476 3477 iommu_disable_protect_mem_regions(iommu); 3478 return 0; 3479 3480 disable_iommu: 3481 disable_dmar_iommu(iommu); 3482 out: 3483 free_dmar_iommu(iommu); 3484 return ret; 3485 } 3486 3487 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3488 { 3489 int ret = 0; 3490 struct intel_iommu *iommu = dmaru->iommu; 3491 3492 if (!intel_iommu_enabled) 3493 return 0; 3494 if (iommu == NULL) 3495 return -EINVAL; 3496 3497 if (insert) { 3498 ret = intel_iommu_add(dmaru); 3499 } else { 3500 disable_dmar_iommu(iommu); 3501 free_dmar_iommu(iommu); 3502 } 3503 3504 return ret; 3505 } 3506 3507 static void intel_iommu_free_dmars(void) 3508 { 3509 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3510 struct dmar_atsr_unit *atsru, *atsr_n; 3511 struct dmar_satc_unit *satcu, *satc_n; 3512 3513 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3514 list_del(&rmrru->list); 3515 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3516 kfree(rmrru); 3517 } 3518 3519 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3520 list_del(&atsru->list); 3521 intel_iommu_free_atsr(atsru); 3522 } 3523 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3524 list_del(&satcu->list); 3525 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3526 kfree(satcu); 3527 } 3528 } 3529 3530 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3531 { 3532 struct dmar_satc_unit *satcu; 3533 struct acpi_dmar_satc *satc; 3534 struct device *tmp; 3535 int i; 3536 3537 dev = pci_physfn(dev); 3538 rcu_read_lock(); 3539 3540 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3541 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3542 if (satc->segment != pci_domain_nr(dev->bus)) 3543 continue; 3544 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3545 if (to_pci_dev(tmp) == dev) 3546 goto out; 3547 } 3548 satcu = NULL; 3549 out: 3550 rcu_read_unlock(); 3551 return satcu; 3552 } 3553 3554 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3555 { 3556 int i, ret = 1; 3557 struct pci_bus *bus; 3558 struct pci_dev *bridge = NULL; 3559 struct device *tmp; 3560 struct acpi_dmar_atsr *atsr; 3561 struct dmar_atsr_unit *atsru; 3562 struct dmar_satc_unit *satcu; 3563 3564 dev = pci_physfn(dev); 3565 satcu = dmar_find_matched_satc_unit(dev); 3566 if (satcu) 3567 /* 3568 * This device supports ATS as it is in 
SATC table. 3569 * When IOMMU is in legacy mode, enabling ATS is done 3570 * automatically by HW for the device that requires 3571 * ATS, hence OS should not enable this device ATS 3572 * to avoid duplicated TLB invalidation. 3573 */ 3574 return !(satcu->atc_required && !sm_supported(iommu)); 3575 3576 for (bus = dev->bus; bus; bus = bus->parent) { 3577 bridge = bus->self; 3578 /* If it's an integrated device, allow ATS */ 3579 if (!bridge) 3580 return 1; 3581 /* Connected via non-PCIe: no ATS */ 3582 if (!pci_is_pcie(bridge) || 3583 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3584 return 0; 3585 /* If we found the root port, look it up in the ATSR */ 3586 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3587 break; 3588 } 3589 3590 rcu_read_lock(); 3591 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3592 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3593 if (atsr->segment != pci_domain_nr(dev->bus)) 3594 continue; 3595 3596 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3597 if (tmp == &bridge->dev) 3598 goto out; 3599 3600 if (atsru->include_all) 3601 goto out; 3602 } 3603 ret = 0; 3604 out: 3605 rcu_read_unlock(); 3606 3607 return ret; 3608 } 3609 3610 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3611 { 3612 int ret; 3613 struct dmar_rmrr_unit *rmrru; 3614 struct dmar_atsr_unit *atsru; 3615 struct dmar_satc_unit *satcu; 3616 struct acpi_dmar_atsr *atsr; 3617 struct acpi_dmar_reserved_memory *rmrr; 3618 struct acpi_dmar_satc *satc; 3619 3620 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3621 return 0; 3622 3623 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3624 rmrr = container_of(rmrru->hdr, 3625 struct acpi_dmar_reserved_memory, header); 3626 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3627 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3628 ((void *)rmrr) + rmrr->header.length, 3629 rmrr->segment, rmrru->devices, 3630 rmrru->devices_cnt); 3631 if (ret < 0) 3632 return ret; 3633 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3634 dmar_remove_dev_scope(info, rmrr->segment, 3635 rmrru->devices, rmrru->devices_cnt); 3636 } 3637 } 3638 3639 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3640 if (atsru->include_all) 3641 continue; 3642 3643 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3644 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3645 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3646 (void *)atsr + atsr->header.length, 3647 atsr->segment, atsru->devices, 3648 atsru->devices_cnt); 3649 if (ret > 0) 3650 break; 3651 else if (ret < 0) 3652 return ret; 3653 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3654 if (dmar_remove_dev_scope(info, atsr->segment, 3655 atsru->devices, atsru->devices_cnt)) 3656 break; 3657 } 3658 } 3659 list_for_each_entry(satcu, &dmar_satc_units, list) { 3660 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3661 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3662 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3663 (void *)satc + satc->header.length, 3664 satc->segment, satcu->devices, 3665 satcu->devices_cnt); 3666 if (ret > 0) 3667 break; 3668 else if (ret < 0) 3669 return ret; 3670 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3671 if (dmar_remove_dev_scope(info, satc->segment, 3672 satcu->devices, satcu->devices_cnt)) 3673 break; 3674 } 3675 } 3676 3677 return 0; 3678 } 3679 3680 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3681 unsigned long val, void *v) 3682 { 
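	/*
	 * Keep the static identity (si_domain) mappings in sync with memory
	 * hotplug: ranges going online get a 1:1 mapping, and ranges going
	 * offline are unmapped with a per-IOMMU IOTLB flush. The notifier is
	 * only registered when si_domain is in use without hardware
	 * pass-through.
	 */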
3683 struct memory_notify *mhp = v; 3684 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3685 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3686 mhp->nr_pages - 1); 3687 3688 switch (val) { 3689 case MEM_GOING_ONLINE: 3690 if (iommu_domain_identity_map(si_domain, 3691 start_vpfn, last_vpfn)) { 3692 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3693 start_vpfn, last_vpfn); 3694 return NOTIFY_BAD; 3695 } 3696 break; 3697 3698 case MEM_OFFLINE: 3699 case MEM_CANCEL_ONLINE: 3700 { 3701 struct dmar_drhd_unit *drhd; 3702 struct intel_iommu *iommu; 3703 LIST_HEAD(freelist); 3704 3705 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3706 3707 rcu_read_lock(); 3708 for_each_active_iommu(iommu, drhd) 3709 iommu_flush_iotlb_psi(iommu, si_domain, 3710 start_vpfn, mhp->nr_pages, 3711 list_empty(&freelist), 0); 3712 rcu_read_unlock(); 3713 put_pages_list(&freelist); 3714 } 3715 break; 3716 } 3717 3718 return NOTIFY_OK; 3719 } 3720 3721 static struct notifier_block intel_iommu_memory_nb = { 3722 .notifier_call = intel_iommu_memory_notifier, 3723 .priority = 0 3724 }; 3725 3726 static void intel_disable_iommus(void) 3727 { 3728 struct intel_iommu *iommu = NULL; 3729 struct dmar_drhd_unit *drhd; 3730 3731 for_each_iommu(iommu, drhd) 3732 iommu_disable_translation(iommu); 3733 } 3734 3735 void intel_iommu_shutdown(void) 3736 { 3737 struct dmar_drhd_unit *drhd; 3738 struct intel_iommu *iommu = NULL; 3739 3740 if (no_iommu || dmar_disabled) 3741 return; 3742 3743 down_write(&dmar_global_lock); 3744 3745 /* Disable PMRs explicitly here. */ 3746 for_each_iommu(iommu, drhd) 3747 iommu_disable_protect_mem_regions(iommu); 3748 3749 /* Make sure the IOMMUs are switched off */ 3750 intel_disable_iommus(); 3751 3752 up_write(&dmar_global_lock); 3753 } 3754 3755 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3756 { 3757 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3758 3759 return container_of(iommu_dev, struct intel_iommu, iommu); 3760 } 3761 3762 static ssize_t version_show(struct device *dev, 3763 struct device_attribute *attr, char *buf) 3764 { 3765 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3766 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3767 return sprintf(buf, "%d:%d\n", 3768 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3769 } 3770 static DEVICE_ATTR_RO(version); 3771 3772 static ssize_t address_show(struct device *dev, 3773 struct device_attribute *attr, char *buf) 3774 { 3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3776 return sprintf(buf, "%llx\n", iommu->reg_phys); 3777 } 3778 static DEVICE_ATTR_RO(address); 3779 3780 static ssize_t cap_show(struct device *dev, 3781 struct device_attribute *attr, char *buf) 3782 { 3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3784 return sprintf(buf, "%llx\n", iommu->cap); 3785 } 3786 static DEVICE_ATTR_RO(cap); 3787 3788 static ssize_t ecap_show(struct device *dev, 3789 struct device_attribute *attr, char *buf) 3790 { 3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3792 return sprintf(buf, "%llx\n", iommu->ecap); 3793 } 3794 static DEVICE_ATTR_RO(ecap); 3795 3796 static ssize_t domains_supported_show(struct device *dev, 3797 struct device_attribute *attr, char *buf) 3798 { 3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3800 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 3801 } 3802 static DEVICE_ATTR_RO(domains_supported); 3803 3804 static ssize_t domains_used_show(struct device *dev, 3805 struct device_attribute *attr, char *buf) 
3806 { 3807 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3808 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 3809 cap_ndoms(iommu->cap))); 3810 } 3811 static DEVICE_ATTR_RO(domains_used); 3812 3813 static struct attribute *intel_iommu_attrs[] = { 3814 &dev_attr_version.attr, 3815 &dev_attr_address.attr, 3816 &dev_attr_cap.attr, 3817 &dev_attr_ecap.attr, 3818 &dev_attr_domains_supported.attr, 3819 &dev_attr_domains_used.attr, 3820 NULL, 3821 }; 3822 3823 static struct attribute_group intel_iommu_group = { 3824 .name = "intel-iommu", 3825 .attrs = intel_iommu_attrs, 3826 }; 3827 3828 const struct attribute_group *intel_iommu_groups[] = { 3829 &intel_iommu_group, 3830 NULL, 3831 }; 3832 3833 static inline bool has_external_pci(void) 3834 { 3835 struct pci_dev *pdev = NULL; 3836 3837 for_each_pci_dev(pdev) 3838 if (pdev->external_facing) 3839 return true; 3840 3841 return false; 3842 } 3843 3844 static int __init platform_optin_force_iommu(void) 3845 { 3846 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3847 return 0; 3848 3849 if (no_iommu || dmar_disabled) 3850 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3851 3852 /* 3853 * If Intel-IOMMU is disabled by default, we will apply identity 3854 * map for all devices except those marked as being untrusted. 3855 */ 3856 if (dmar_disabled) 3857 iommu_set_default_passthrough(false); 3858 3859 dmar_disabled = 0; 3860 no_iommu = 0; 3861 3862 return 1; 3863 } 3864 3865 static int __init probe_acpi_namespace_devices(void) 3866 { 3867 struct dmar_drhd_unit *drhd; 3868 /* To avoid a -Wunused-but-set-variable warning. */ 3869 struct intel_iommu *iommu __maybe_unused; 3870 struct device *dev; 3871 int i, ret = 0; 3872 3873 for_each_active_iommu(iommu, drhd) { 3874 for_each_active_dev_scope(drhd->devices, 3875 drhd->devices_cnt, i, dev) { 3876 struct acpi_device_physical_node *pn; 3877 struct iommu_group *group; 3878 struct acpi_device *adev; 3879 3880 if (dev->bus != &acpi_bus_type) 3881 continue; 3882 3883 adev = to_acpi_device(dev); 3884 mutex_lock(&adev->physical_node_lock); 3885 list_for_each_entry(pn, 3886 &adev->physical_node_list, node) { 3887 group = iommu_group_get(pn->dev); 3888 if (group) { 3889 iommu_group_put(group); 3890 continue; 3891 } 3892 3893 pn->dev->bus->iommu_ops = &intel_iommu_ops; 3894 ret = iommu_probe_device(pn->dev); 3895 if (ret) 3896 break; 3897 } 3898 mutex_unlock(&adev->physical_node_lock); 3899 3900 if (ret) 3901 return ret; 3902 } 3903 } 3904 3905 return 0; 3906 } 3907 3908 static __init int tboot_force_iommu(void) 3909 { 3910 if (!tboot_enabled()) 3911 return 0; 3912 3913 if (no_iommu || dmar_disabled) 3914 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3915 3916 dmar_disabled = 0; 3917 no_iommu = 0; 3918 3919 return 1; 3920 } 3921 3922 int __init intel_iommu_init(void) 3923 { 3924 int ret = -ENODEV; 3925 struct dmar_drhd_unit *drhd; 3926 struct intel_iommu *iommu; 3927 3928 /* 3929 * Intel IOMMU is required for a TXT/tboot launch or platform 3930 * opt in, so enforce that. 
3931 */ 3932 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3933 platform_optin_force_iommu(); 3934 3935 down_write(&dmar_global_lock); 3936 if (dmar_table_init()) { 3937 if (force_on) 3938 panic("tboot: Failed to initialize DMAR table\n"); 3939 goto out_free_dmar; 3940 } 3941 3942 if (dmar_dev_scope_init() < 0) { 3943 if (force_on) 3944 panic("tboot: Failed to initialize DMAR device scope\n"); 3945 goto out_free_dmar; 3946 } 3947 3948 up_write(&dmar_global_lock); 3949 3950 /* 3951 * The bus notifier takes the dmar_global_lock, so lockdep will 3952 * complain later when we register it under the lock. 3953 */ 3954 dmar_register_bus_notifier(); 3955 3956 down_write(&dmar_global_lock); 3957 3958 if (!no_iommu) 3959 intel_iommu_debugfs_init(); 3960 3961 if (no_iommu || dmar_disabled) { 3962 /* 3963 * We exit the function here to ensure IOMMU's remapping and 3964 * mempool aren't setup, which means that the IOMMU's PMRs 3965 * won't be disabled via the call to init_dmars(). So disable 3966 * it explicitly here. The PMRs were setup by tboot prior to 3967 * calling SENTER, but the kernel is expected to reset/tear 3968 * down the PMRs. 3969 */ 3970 if (intel_iommu_tboot_noforce) { 3971 for_each_iommu(iommu, drhd) 3972 iommu_disable_protect_mem_regions(iommu); 3973 } 3974 3975 /* 3976 * Make sure the IOMMUs are switched off, even when we 3977 * boot into a kexec kernel and the previous kernel left 3978 * them enabled 3979 */ 3980 intel_disable_iommus(); 3981 goto out_free_dmar; 3982 } 3983 3984 if (list_empty(&dmar_rmrr_units)) 3985 pr_info("No RMRR found\n"); 3986 3987 if (list_empty(&dmar_atsr_units)) 3988 pr_info("No ATSR found\n"); 3989 3990 if (list_empty(&dmar_satc_units)) 3991 pr_info("No SATC found\n"); 3992 3993 init_no_remapping_devices(); 3994 3995 ret = init_dmars(); 3996 if (ret) { 3997 if (force_on) 3998 panic("tboot: Failed to initialize DMARs\n"); 3999 pr_err("Initialization failed\n"); 4000 goto out_free_dmar; 4001 } 4002 up_write(&dmar_global_lock); 4003 4004 init_iommu_pm_ops(); 4005 4006 down_read(&dmar_global_lock); 4007 for_each_active_iommu(iommu, drhd) { 4008 /* 4009 * The flush queue implementation does not perform 4010 * page-selective invalidations that are required for efficient 4011 * TLB flushes in virtual environments. The benefit of batching 4012 * is likely to be much lower than the overhead of synchronizing 4013 * the virtual and physical IOMMU page-tables. 4014 */ 4015 if (cap_caching_mode(iommu->cap)) { 4016 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4017 iommu_set_dma_strict(); 4018 } 4019 iommu_device_sysfs_add(&iommu->iommu, NULL, 4020 intel_iommu_groups, 4021 "%s", iommu->name); 4022 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4023 } 4024 up_read(&dmar_global_lock); 4025 4026 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4027 if (si_domain && !hw_pass_through) 4028 register_memory_notifier(&intel_iommu_memory_nb); 4029 4030 down_read(&dmar_global_lock); 4031 if (probe_acpi_namespace_devices()) 4032 pr_warn("ACPI name space devices didn't probe correctly\n"); 4033 4034 /* Finally, we enable the DMA remapping hardware. 
*/ 4035 for_each_iommu(iommu, drhd) { 4036 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4037 iommu_enable_translation(iommu); 4038 4039 iommu_disable_protect_mem_regions(iommu); 4040 } 4041 up_read(&dmar_global_lock); 4042 4043 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4044 4045 intel_iommu_enabled = 1; 4046 4047 return 0; 4048 4049 out_free_dmar: 4050 intel_iommu_free_dmars(); 4051 up_write(&dmar_global_lock); 4052 return ret; 4053 } 4054 4055 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4056 { 4057 struct device_domain_info *info = opaque; 4058 4059 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4060 return 0; 4061 } 4062 4063 /* 4064 * NB - intel-iommu lacks any sort of reference counting for the users of 4065 * dependent devices. If multiple endpoints have intersecting dependent 4066 * devices, unbinding the driver from any one of them will possibly leave 4067 * the others unable to operate. 4068 */ 4069 static void domain_context_clear(struct device_domain_info *info) 4070 { 4071 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4072 return; 4073 4074 pci_for_each_dma_alias(to_pci_dev(info->dev), 4075 &domain_context_clear_one_cb, info); 4076 } 4077 4078 static void dmar_remove_one_dev_info(struct device *dev) 4079 { 4080 struct device_domain_info *info = dev_iommu_priv_get(dev); 4081 struct dmar_domain *domain = info->domain; 4082 struct intel_iommu *iommu = info->iommu; 4083 4084 if (!dev_is_real_dma_subdevice(info->dev)) { 4085 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4086 intel_pasid_tear_down_entry(iommu, info->dev, 4087 PASID_RID2PASID, false); 4088 4089 iommu_disable_dev_iotlb(info); 4090 domain_context_clear(info); 4091 intel_pasid_free_table(info->dev); 4092 } 4093 4094 spin_lock(&domain->lock); 4095 list_del(&info->link); 4096 spin_unlock(&domain->lock); 4097 4098 domain_detach_iommu(domain, iommu); 4099 info->domain = NULL; 4100 } 4101 4102 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4103 { 4104 int adjust_width; 4105 4106 /* calculate AGAW */ 4107 domain->gaw = guest_width; 4108 adjust_width = guestwidth_to_adjustwidth(guest_width); 4109 domain->agaw = width_to_agaw(adjust_width); 4110 4111 domain->iommu_coherency = false; 4112 domain->iommu_superpage = 0; 4113 domain->max_addr = 0; 4114 4115 /* always allocate the top pgd */ 4116 domain->pgd = alloc_pgtable_page(domain->nid); 4117 if (!domain->pgd) 4118 return -ENOMEM; 4119 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4120 return 0; 4121 } 4122 4123 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4124 { 4125 struct dmar_domain *dmar_domain; 4126 struct iommu_domain *domain; 4127 4128 switch (type) { 4129 case IOMMU_DOMAIN_DMA: 4130 case IOMMU_DOMAIN_DMA_FQ: 4131 case IOMMU_DOMAIN_UNMANAGED: 4132 dmar_domain = alloc_domain(type); 4133 if (!dmar_domain) { 4134 pr_err("Can't allocate dmar_domain\n"); 4135 return NULL; 4136 } 4137 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4138 pr_err("Domain initialization failed\n"); 4139 domain_exit(dmar_domain); 4140 return NULL; 4141 } 4142 4143 domain = &dmar_domain->domain; 4144 domain->geometry.aperture_start = 0; 4145 domain->geometry.aperture_end = 4146 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4147 domain->geometry.force_aperture = true; 4148 4149 return domain; 4150 case IOMMU_DOMAIN_IDENTITY: 4151 return &si_domain->domain; 4152 default: 4153 return NULL; 4154 } 4155 4156 return NULL; 4157 } 4158 4159 
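/*
 * Minimal usage sketch: how a kernel consumer would exercise the domain
 * callbacks above through the generic IOMMU API. The device pointer and the
 * addresses are placeholders (paddr is assumed to be at least 4KiB aligned),
 * and error handling is trimmed to the essentials.
 */
static int __maybe_unused example_unmanaged_domain_usage(struct device *dev,
							  phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	/* Reaches intel_iommu_domain_alloc(IOMMU_DOMAIN_UNMANAGED). */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	/* Reaches intel_iommu_attach_device() for this device. */
	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Backed by intel_iommu_map_pages() and __domain_mapping(). */
	ret = iommu_map(domain, SZ_2M, paddr, SZ_2M,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* Walks the page table via intel_iommu_iova_to_phys(). */
	WARN_ON(iommu_iova_to_phys(domain, SZ_2M) != paddr);

	/* Backed by intel_iommu_unmap_pages() and the tlb_sync callback. */
	iommu_unmap(domain, SZ_2M, SZ_2M);

out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}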
static void intel_iommu_domain_free(struct iommu_domain *domain) 4160 { 4161 if (domain != &si_domain->domain) 4162 domain_exit(to_dmar_domain(domain)); 4163 } 4164 4165 static int prepare_domain_attach_device(struct iommu_domain *domain, 4166 struct device *dev) 4167 { 4168 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4169 struct intel_iommu *iommu; 4170 int addr_width; 4171 4172 iommu = device_to_iommu(dev, NULL, NULL); 4173 if (!iommu) 4174 return -ENODEV; 4175 4176 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4177 return -EOPNOTSUPP; 4178 4179 /* check if this iommu agaw is sufficient for max mapped address */ 4180 addr_width = agaw_to_width(iommu->agaw); 4181 if (addr_width > cap_mgaw(iommu->cap)) 4182 addr_width = cap_mgaw(iommu->cap); 4183 4184 if (dmar_domain->max_addr > (1LL << addr_width)) { 4185 dev_err(dev, "%s: iommu width (%d) is not " 4186 "sufficient for the mapped address (%llx)\n", 4187 __func__, addr_width, dmar_domain->max_addr); 4188 return -EFAULT; 4189 } 4190 dmar_domain->gaw = addr_width; 4191 4192 /* 4193 * Knock out extra levels of page tables if necessary 4194 */ 4195 while (iommu->agaw < dmar_domain->agaw) { 4196 struct dma_pte *pte; 4197 4198 pte = dmar_domain->pgd; 4199 if (dma_pte_present(pte)) { 4200 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4201 free_pgtable_page(pte); 4202 } 4203 dmar_domain->agaw--; 4204 } 4205 4206 return 0; 4207 } 4208 4209 static int intel_iommu_attach_device(struct iommu_domain *domain, 4210 struct device *dev) 4211 { 4212 int ret; 4213 4214 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4215 device_is_rmrr_locked(dev)) { 4216 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4217 return -EPERM; 4218 } 4219 4220 /* normally dev is not mapped */ 4221 if (unlikely(domain_context_mapped(dev))) { 4222 struct device_domain_info *info = dev_iommu_priv_get(dev); 4223 4224 if (info->domain) 4225 dmar_remove_one_dev_info(dev); 4226 } 4227 4228 ret = prepare_domain_attach_device(domain, dev); 4229 if (ret) 4230 return ret; 4231 4232 return domain_add_dev_info(to_dmar_domain(domain), dev); 4233 } 4234 4235 static void intel_iommu_detach_device(struct iommu_domain *domain, 4236 struct device *dev) 4237 { 4238 dmar_remove_one_dev_info(dev); 4239 } 4240 4241 static int intel_iommu_map(struct iommu_domain *domain, 4242 unsigned long iova, phys_addr_t hpa, 4243 size_t size, int iommu_prot, gfp_t gfp) 4244 { 4245 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4246 u64 max_addr; 4247 int prot = 0; 4248 4249 if (iommu_prot & IOMMU_READ) 4250 prot |= DMA_PTE_READ; 4251 if (iommu_prot & IOMMU_WRITE) 4252 prot |= DMA_PTE_WRITE; 4253 if (dmar_domain->set_pte_snp) 4254 prot |= DMA_PTE_SNP; 4255 4256 max_addr = iova + size; 4257 if (dmar_domain->max_addr < max_addr) { 4258 u64 end; 4259 4260 /* check if minimum agaw is sufficient for mapped address */ 4261 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4262 if (end < max_addr) { 4263 pr_err("%s: iommu width (%d) is not " 4264 "sufficient for the mapped address (%llx)\n", 4265 __func__, dmar_domain->gaw, max_addr); 4266 return -EFAULT; 4267 } 4268 dmar_domain->max_addr = max_addr; 4269 } 4270 /* Round up size to next multiple of PAGE_SIZE, if it and 4271 the low bits of hpa would take us onto the next page */ 4272 size = aligned_nrpages(hpa, size); 4273 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4274 hpa >> VTD_PAGE_SHIFT, size, prot); 4275 } 4276 4277 static 
int intel_iommu_map_pages(struct iommu_domain *domain, 4278 unsigned long iova, phys_addr_t paddr, 4279 size_t pgsize, size_t pgcount, 4280 int prot, gfp_t gfp, size_t *mapped) 4281 { 4282 unsigned long pgshift = __ffs(pgsize); 4283 size_t size = pgcount << pgshift; 4284 int ret; 4285 4286 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4287 return -EINVAL; 4288 4289 if (!IS_ALIGNED(iova | paddr, pgsize)) 4290 return -EINVAL; 4291 4292 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4293 if (!ret && mapped) 4294 *mapped = size; 4295 4296 return ret; 4297 } 4298 4299 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4300 unsigned long iova, size_t size, 4301 struct iommu_iotlb_gather *gather) 4302 { 4303 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4304 unsigned long start_pfn, last_pfn; 4305 int level = 0; 4306 4307 /* Cope with horrid API which requires us to unmap more than the 4308 size argument if it happens to be a large-page mapping. */ 4309 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4310 4311 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4312 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4313 4314 start_pfn = iova >> VTD_PAGE_SHIFT; 4315 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4316 4317 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4318 4319 if (dmar_domain->max_addr == iova + size) 4320 dmar_domain->max_addr = iova; 4321 4322 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4323 4324 return size; 4325 } 4326 4327 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4328 unsigned long iova, 4329 size_t pgsize, size_t pgcount, 4330 struct iommu_iotlb_gather *gather) 4331 { 4332 unsigned long pgshift = __ffs(pgsize); 4333 size_t size = pgcount << pgshift; 4334 4335 return intel_iommu_unmap(domain, iova, size, gather); 4336 } 4337 4338 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4339 struct iommu_iotlb_gather *gather) 4340 { 4341 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4342 unsigned long iova_pfn = IOVA_PFN(gather->start); 4343 size_t size = gather->end - gather->start; 4344 struct iommu_domain_info *info; 4345 unsigned long start_pfn; 4346 unsigned long nrpages; 4347 unsigned long i; 4348 4349 nrpages = aligned_nrpages(gather->start, size); 4350 start_pfn = mm_to_dma_pfn(iova_pfn); 4351 4352 xa_for_each(&dmar_domain->iommu_array, i, info) 4353 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4354 start_pfn, nrpages, 4355 list_empty(&gather->freelist), 0); 4356 4357 put_pages_list(&gather->freelist); 4358 } 4359 4360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4361 dma_addr_t iova) 4362 { 4363 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4364 struct dma_pte *pte; 4365 int level = 0; 4366 u64 phys = 0; 4367 4368 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 4369 if (pte && dma_pte_present(pte)) 4370 phys = dma_pte_addr(pte) + 4371 (iova & (BIT_MASK(level_to_offset_bits(level) + 4372 VTD_PAGE_SHIFT) - 1)); 4373 4374 return phys; 4375 } 4376 4377 static bool domain_support_force_snooping(struct dmar_domain *domain) 4378 { 4379 struct device_domain_info *info; 4380 bool support = true; 4381 4382 assert_spin_locked(&domain->lock); 4383 list_for_each_entry(info, &domain->devices, link) { 4384 if (!ecap_sc_support(info->iommu->ecap)) { 4385 support = false; 4386 break; 4387 } 4388 } 4389 4390 return support; 4391 } 4392 4393 static void 
domain_set_force_snooping(struct dmar_domain *domain) 4394 { 4395 struct device_domain_info *info; 4396 4397 assert_spin_locked(&domain->lock); 4398 /* 4399 * Second level page table supports per-PTE snoop control. The 4400 * iommu_map() interface will handle this by setting SNP bit. 4401 */ 4402 if (!domain_use_first_level(domain)) { 4403 domain->set_pte_snp = true; 4404 return; 4405 } 4406 4407 list_for_each_entry(info, &domain->devices, link) 4408 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4409 PASID_RID2PASID); 4410 } 4411 4412 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4413 { 4414 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4415 4416 if (dmar_domain->force_snooping) 4417 return true; 4418 4419 spin_lock(&dmar_domain->lock); 4420 if (!domain_support_force_snooping(dmar_domain)) { 4421 spin_unlock(&dmar_domain->lock); 4422 return false; 4423 } 4424 4425 domain_set_force_snooping(dmar_domain); 4426 dmar_domain->force_snooping = true; 4427 spin_unlock(&dmar_domain->lock); 4428 4429 return true; 4430 } 4431 4432 static bool intel_iommu_capable(enum iommu_cap cap) 4433 { 4434 if (cap == IOMMU_CAP_CACHE_COHERENCY) 4435 return true; 4436 if (cap == IOMMU_CAP_INTR_REMAP) 4437 return irq_remapping_enabled == 1; 4438 if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) 4439 return dmar_platform_optin(); 4440 4441 return false; 4442 } 4443 4444 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4445 { 4446 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4447 struct device_domain_info *info; 4448 struct intel_iommu *iommu; 4449 u8 bus, devfn; 4450 4451 iommu = device_to_iommu(dev, &bus, &devfn); 4452 if (!iommu) 4453 return ERR_PTR(-ENODEV); 4454 4455 info = kzalloc(sizeof(*info), GFP_KERNEL); 4456 if (!info) 4457 return ERR_PTR(-ENOMEM); 4458 4459 if (dev_is_real_dma_subdevice(dev)) { 4460 info->bus = pdev->bus->number; 4461 info->devfn = pdev->devfn; 4462 info->segment = pci_domain_nr(pdev->bus); 4463 } else { 4464 info->bus = bus; 4465 info->devfn = devfn; 4466 info->segment = iommu->segment; 4467 } 4468 4469 info->dev = dev; 4470 info->iommu = iommu; 4471 if (dev_is_pci(dev)) { 4472 if (ecap_dev_iotlb_support(iommu->ecap) && 4473 pci_ats_supported(pdev) && 4474 dmar_ats_supported(pdev, iommu)) 4475 info->ats_supported = 1; 4476 4477 if (sm_supported(iommu)) { 4478 if (pasid_supported(iommu)) { 4479 int features = pci_pasid_features(pdev); 4480 4481 if (features >= 0) 4482 info->pasid_supported = features | 1; 4483 } 4484 4485 if (info->ats_supported && ecap_prs(iommu->ecap) && 4486 pci_pri_supported(pdev)) 4487 info->pri_supported = 1; 4488 } 4489 } 4490 4491 dev_iommu_priv_set(dev, info); 4492 4493 return &iommu->iommu; 4494 } 4495 4496 static void intel_iommu_release_device(struct device *dev) 4497 { 4498 struct device_domain_info *info = dev_iommu_priv_get(dev); 4499 4500 dmar_remove_one_dev_info(dev); 4501 dev_iommu_priv_set(dev, NULL); 4502 kfree(info); 4503 set_dma_ops(dev, NULL); 4504 } 4505 4506 static void intel_iommu_probe_finalize(struct device *dev) 4507 { 4508 set_dma_ops(dev, NULL); 4509 iommu_setup_dma_ops(dev, 0, U64_MAX); 4510 } 4511 4512 static void intel_iommu_get_resv_regions(struct device *device, 4513 struct list_head *head) 4514 { 4515 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4516 struct iommu_resv_region *reg; 4517 struct dmar_rmrr_unit *rmrr; 4518 struct device *i_dev; 4519 int i; 4520 4521 down_read(&dmar_global_lock); 4522 for_each_rmrr_units(rmrr) { 4523 
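/*
 * Each RMRR that names this device, directly or via a PCI bridge
 * upstream of it, is exported as a direct-mapped reserved region so
 * that the IOMMU core keeps the BIOS-mandated identity mapping in
 * place.
 */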
for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4524 i, i_dev) { 4525 struct iommu_resv_region *resv; 4526 enum iommu_resv_type type; 4527 size_t length; 4528 4529 if (i_dev != device && 4530 !is_downstream_to_pci_bridge(device, i_dev)) 4531 continue; 4532 4533 length = rmrr->end_address - rmrr->base_address + 1; 4534 4535 type = device_rmrr_is_relaxable(device) ? 4536 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4537 4538 resv = iommu_alloc_resv_region(rmrr->base_address, 4539 length, prot, type); 4540 if (!resv) 4541 break; 4542 4543 list_add_tail(&resv->list, head); 4544 } 4545 } 4546 up_read(&dmar_global_lock); 4547 4548 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4549 if (dev_is_pci(device)) { 4550 struct pci_dev *pdev = to_pci_dev(device); 4551 4552 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4553 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4554 IOMMU_RESV_DIRECT_RELAXABLE); 4555 if (reg) 4556 list_add_tail(&reg->list, head); 4557 } 4558 } 4559 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4560 4561 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4562 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4563 0, IOMMU_RESV_MSI); 4564 if (!reg) 4565 return; 4566 list_add_tail(&reg->list, head); 4567 } 4568 4569 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 4570 { 4571 struct device_domain_info *info = dev_iommu_priv_get(dev); 4572 struct context_entry *context; 4573 struct dmar_domain *domain; 4574 u64 ctx_lo; 4575 int ret; 4576 4577 domain = info->domain; 4578 if (!domain) 4579 return -EINVAL; 4580 4581 spin_lock(&iommu->lock); 4582 ret = -EINVAL; 4583 if (!info->pasid_supported) 4584 goto out; 4585 4586 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 4587 if (WARN_ON(!context)) 4588 goto out; 4589 4590 ctx_lo = context[0].lo; 4591 4592 if (!(ctx_lo & CONTEXT_PASIDE)) { 4593 ctx_lo |= CONTEXT_PASIDE; 4594 context[0].lo = ctx_lo; 4595 wmb(); 4596 iommu->flush.flush_context(iommu, 4597 domain_id_iommu(domain, iommu), 4598 PCI_DEVID(info->bus, info->devfn), 4599 DMA_CCMD_MASK_NOBIT, 4600 DMA_CCMD_DEVICE_INVL); 4601 } 4602 4603 /* Enable PASID support in the device, if it wasn't already */ 4604 if (!info->pasid_enabled) 4605 iommu_enable_dev_iotlb(info); 4606 4607 ret = 0; 4608 4609 out: 4610 spin_unlock(&iommu->lock); 4611 4612 return ret; 4613 } 4614 4615 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4616 { 4617 if (dev_is_pci(dev)) 4618 return pci_device_group(dev); 4619 return generic_device_group(dev); 4620 } 4621 4622 static int intel_iommu_enable_sva(struct device *dev) 4623 { 4624 struct device_domain_info *info = dev_iommu_priv_get(dev); 4625 struct intel_iommu *iommu; 4626 int ret; 4627 4628 if (!info || dmar_disabled) 4629 return -EINVAL; 4630 4631 iommu = info->iommu; 4632 if (!iommu) 4633 return -EINVAL; 4634 4635 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4636 return -ENODEV; 4637 4638 if (intel_iommu_enable_pasid(iommu, dev)) 4639 return -ENODEV; 4640 4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 4642 return -EINVAL; 4643 4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4645 if (!ret) 4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4647 4648 return ret; 4649 } 4650 4651 static int intel_iommu_disable_sva(struct device *dev) 4652 { 4653 struct device_domain_info *info = dev_iommu_priv_get(dev); 4654 struct intel_iommu *iommu = info->iommu; 4655 int ret; 4656 4657 ret = iommu_unregister_device_fault_handler(dev); 4658 if (!ret) 4659 ret
= iopf_queue_remove_device(iommu->iopf_queue, dev); 4660 4661 return ret; 4662 } 4663 4664 static int intel_iommu_enable_iopf(struct device *dev) 4665 { 4666 struct device_domain_info *info = dev_iommu_priv_get(dev); 4667 4668 if (info && info->pri_supported) 4669 return 0; 4670 4671 return -ENODEV; 4672 } 4673 4674 static int 4675 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4676 { 4677 switch (feat) { 4678 case IOMMU_DEV_FEAT_IOPF: 4679 return intel_iommu_enable_iopf(dev); 4680 4681 case IOMMU_DEV_FEAT_SVA: 4682 return intel_iommu_enable_sva(dev); 4683 4684 default: 4685 return -ENODEV; 4686 } 4687 } 4688 4689 static int 4690 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4691 { 4692 switch (feat) { 4693 case IOMMU_DEV_FEAT_IOPF: 4694 return 0; 4695 4696 case IOMMU_DEV_FEAT_SVA: 4697 return intel_iommu_disable_sva(dev); 4698 4699 default: 4700 return -ENODEV; 4701 } 4702 } 4703 4704 static bool intel_iommu_is_attach_deferred(struct device *dev) 4705 { 4706 struct device_domain_info *info = dev_iommu_priv_get(dev); 4707 4708 return translation_pre_enabled(info->iommu) && !info->domain; 4709 } 4710 4711 /* 4712 * Check that the device does not live on an external facing PCI port that is 4713 * marked as untrusted. Such devices should not be able to apply quirks and 4714 * thus not be able to bypass the IOMMU restrictions. 4715 */ 4716 static bool risky_device(struct pci_dev *pdev) 4717 { 4718 if (pdev->untrusted) { 4719 pci_info(pdev, 4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4721 pdev->vendor, pdev->device); 4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4723 return true; 4724 } 4725 return false; 4726 } 4727 4728 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4729 unsigned long iova, size_t size) 4730 { 4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4732 unsigned long pages = aligned_nrpages(iova, size); 4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4734 struct iommu_domain_info *info; 4735 unsigned long i; 4736 4737 xa_for_each(&dmar_domain->iommu_array, i, info) 4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4739 } 4740 4741 const struct iommu_ops intel_iommu_ops = { 4742 .capable = intel_iommu_capable, 4743 .domain_alloc = intel_iommu_domain_alloc, 4744 .probe_device = intel_iommu_probe_device, 4745 .probe_finalize = intel_iommu_probe_finalize, 4746 .release_device = intel_iommu_release_device, 4747 .get_resv_regions = intel_iommu_get_resv_regions, 4748 .device_group = intel_iommu_device_group, 4749 .dev_enable_feat = intel_iommu_dev_enable_feat, 4750 .dev_disable_feat = intel_iommu_dev_disable_feat, 4751 .is_attach_deferred = intel_iommu_is_attach_deferred, 4752 .def_domain_type = device_def_domain_type, 4753 .pgsize_bitmap = SZ_4K, 4754 #ifdef CONFIG_INTEL_IOMMU_SVM 4755 .sva_bind = intel_svm_bind, 4756 .sva_unbind = intel_svm_unbind, 4757 .sva_get_pasid = intel_svm_get_pasid, 4758 .page_response = intel_svm_page_response, 4759 #endif 4760 .default_domain_ops = &(const struct iommu_domain_ops) { 4761 .attach_dev = intel_iommu_attach_device, 4762 .detach_dev = intel_iommu_detach_device, 4763 .map_pages = intel_iommu_map_pages, 4764 .unmap_pages = intel_iommu_unmap_pages, 4765 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4766 .flush_iotlb_all = intel_flush_iotlb_all, 4767 .iotlb_sync = intel_iommu_tlb_sync, 4768 .iova_to_phys = intel_iommu_iova_to_phys, 4769 .free = intel_iommu_domain_free, 4770 
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4771 } 4772 }; 4773 4774 static void quirk_iommu_igfx(struct pci_dev *dev) 4775 { 4776 if (risky_device(dev)) 4777 return; 4778 4779 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4780 dmar_map_gfx = 0; 4781 } 4782 4783 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4791 4792 /* Broadwell igfx malfunctions with dmar */ 4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4817 4818 static void quirk_iommu_rwbf(struct pci_dev *dev) 4819 { 4820 if (risky_device(dev)) 4821 return; 4822 4823 /* 4824 * Mobile 4 Series Chipset neglects to set RWBF capability, 4825 * but needs it. Same seems to hold for the desktop versions. 
4826 */ 4827 pci_info(dev, "Forcing write-buffer flush capability\n"); 4828 rwbf_quirk = 1; 4829 } 4830 4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4838 4839 #define GGC 0x52 4840 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4841 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4842 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4843 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4844 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4845 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4846 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4847 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4848 4849 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4850 { 4851 unsigned short ggc; 4852 4853 if (risky_device(dev)) 4854 return; 4855 4856 if (pci_read_config_word(dev, GGC, &ggc)) 4857 return; 4858 4859 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4860 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4861 dmar_map_gfx = 0; 4862 } else if (dmar_map_gfx) { 4863 /* we have to ensure the gfx device is idle before we flush */ 4864 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4865 iommu_set_dma_strict(); 4866 } 4867 } 4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4872 4873 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4874 { 4875 unsigned short ver; 4876 4877 if (!IS_GFX_DEVICE(dev)) 4878 return; 4879 4880 ver = (dev->device >> 8) & 0xff; 4881 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4882 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4883 ver != 0x9a && ver != 0xa7) 4884 return; 4885 4886 if (risky_device(dev)) 4887 return; 4888 4889 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4890 iommu_skip_te_disable = 1; 4891 } 4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4893 4894 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4895 ISOCH DMAR unit for the Azalia sound device, but not give it any 4896 TLB entries, which causes it to deadlock. Check for that. We do 4897 this in a function called from init_dmars(), instead of in a PCI 4898 quirk, because we don't want to print the obnoxious "BIOS broken" 4899 message if VT-d is actually disabled. 4900 */ 4901 static void __init check_tylersburg_isoch(void) 4902 { 4903 struct pci_dev *pdev; 4904 uint32_t vtisochctrl; 4905 4906 /* If there's no Azalia in the system anyway, forget it. */ 4907 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4908 if (!pdev) 4909 return; 4910 4911 if (risky_device(pdev)) { 4912 pci_dev_put(pdev); 4913 return; 4914 } 4915 4916 pci_dev_put(pdev); 4917 4918 /* System Management Registers. Might be hidden, in which case 4919 we can't do the sanity check. But that's OK, because the 4920 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 4921 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4922 if (!pdev) 4923 return; 4924 4925 if (risky_device(pdev)) { 4926 pci_dev_put(pdev); 4927 return; 4928 } 4929 4930 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4931 pci_dev_put(pdev); 4932 return; 4933 } 4934 4935 pci_dev_put(pdev); 4936 4937 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4938 if (vtisochctrl & 1) 4939 return; 4940 4941 /* Drop all bits other than the number of TLB entries */ 4942 vtisochctrl &= 0x1c; 4943 4944 /* If we have the recommended number of TLB entries (16), fine. */ 4945 if (vtisochctrl == 0x10) 4946 return; 4947 4948 /* Zero TLB entries? You get to ride the short bus to school. */ 4949 if (!vtisochctrl) { 4950 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4951 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4952 dmi_get_system_info(DMI_BIOS_VENDOR), 4953 dmi_get_system_info(DMI_BIOS_VERSION), 4954 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4955 iommu_identity_mapping |= IDENTMAP_AZALIA; 4956 return; 4957 } 4958 4959 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4960 vtisochctrl); 4961 } 4962
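/*
 * Illustrative note, not part of the driver: the fixups registered above via
 * DECLARE_PCI_FIXUP_HEADER() are never called from this file.  The PCI core
 * runs them while it enumerates each device, roughly:
 *
 *	pci_fixup_device(pci_fixup_header, pdev);
 *
 * so dmar_map_gfx, rwbf_quirk and iommu_skip_te_disable are normally already
 * set before the DMAR initialization code that consults them runs.
 */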