1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dmi.h> 19 #include <linux/memory.h> 20 #include <linux/pci.h> 21 #include <linux/pci-ats.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscore_ops.h> 24 #include <linux/tboot.h> 25 26 #include "iommu.h" 27 #include "../dma-iommu.h" 28 #include "../irq_remapping.h" 29 #include "../iommu-sva.h" 30 #include "pasid.h" 31 #include "cap_audit.h" 32 #include "perfmon.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
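   For example, with the usual 4KiB VTD_PAGE_SIZE and 4KiB MM pages the
   shift below is zero and mm_to_dma_pfn() is an identity; a (hypothetical)
   64KiB MM page size would expand each MM PFN into 16 VT-d PFNs.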
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 static void __init check_tylersburg_isoch(void); 130 static int rwbf_quirk; 131 132 /* 133 * set to 1 to panic kernel if can't successfully enable VT-d 134 * (used when kernel is launched w/ TXT) 135 */ 136 static int force_on = 0; 137 static int intel_iommu_tboot_noforce; 138 static int no_platform_optin; 139 140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 141 142 /* 143 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 144 * if marked present. 145 */ 146 static phys_addr_t root_entry_lctp(struct root_entry *re) 147 { 148 if (!(re->lo & 1)) 149 return 0; 150 151 return re->lo & VTD_PAGE_MASK; 152 } 153 154 /* 155 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 156 * if marked present. 157 */ 158 static phys_addr_t root_entry_uctp(struct root_entry *re) 159 { 160 if (!(re->hi & 1)) 161 return 0; 162 163 return re->hi & VTD_PAGE_MASK; 164 } 165 166 static inline void context_set_present(struct context_entry *context) 167 { 168 context->lo |= 1; 169 } 170 171 static inline void context_set_fault_enable(struct context_entry *context) 172 { 173 context->lo &= (((u64)-1) << 2) | 1; 174 } 175 176 static inline void context_set_translation_type(struct context_entry *context, 177 unsigned long value) 178 { 179 context->lo &= (((u64)-1) << 4) | 3; 180 context->lo |= (value & 3) << 2; 181 } 182 183 static inline void context_set_address_root(struct context_entry *context, 184 unsigned long value) 185 { 186 context->lo &= ~VTD_PAGE_MASK; 187 context->lo |= value & VTD_PAGE_MASK; 188 } 189 190 static inline void context_set_address_width(struct context_entry *context, 191 unsigned long value) 192 { 193 context->hi |= value & 7; 194 } 195 196 static inline void context_set_domain_id(struct context_entry *context, 197 unsigned long value) 198 { 199 context->hi |= (value & ((1 << 16) - 1)) << 8; 200 } 201 202 static inline void context_set_pasid(struct context_entry *context) 203 { 204 context->lo |= CONTEXT_PASIDE; 205 } 206 207 static inline int context_domain_id(struct context_entry *c) 208 { 209 return((c->hi >> 8) & 0xffff); 210 } 211 212 static inline void context_clear_entry(struct context_entry *context) 213 { 214 context->lo = 0; 215 context->hi = 0; 216 } 217 218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 219 { 220 if (!iommu->copied_tables) 221 return false; 222 223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); 224 } 225 226 static inline void 227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 228 { 229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables); 230 } 231 232 static inline void 233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 234 { 235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); 236 } 237 238 /* 239 * This domain is a statically identity mapping domain. 240 * 1. This domain creats a static 1:1 mapping to all usable memory. 241 * 2. It maps to each iommu if successful. 242 * 3. Each iommu mapps to this domain if successful. 
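 *	In this file it backs IOMMU_DOMAIN_IDENTITY requests; when
 *	hw_pass_through is set, context entries for it are programmed as
 *	pass-through rather than walking the static 1:1 page table.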
243 */ 244 static struct dmar_domain *si_domain; 245 static int hw_pass_through = 1; 246 247 struct dmar_rmrr_unit { 248 struct list_head list; /* list of rmrr units */ 249 struct acpi_dmar_header *hdr; /* ACPI header */ 250 u64 base_address; /* reserved base address*/ 251 u64 end_address; /* reserved end address */ 252 struct dmar_dev_scope *devices; /* target devices */ 253 int devices_cnt; /* target device count */ 254 }; 255 256 struct dmar_atsr_unit { 257 struct list_head list; /* list of ATSR units */ 258 struct acpi_dmar_header *hdr; /* ACPI header */ 259 struct dmar_dev_scope *devices; /* target devices */ 260 int devices_cnt; /* target device count */ 261 u8 include_all:1; /* include all ports */ 262 }; 263 264 struct dmar_satc_unit { 265 struct list_head list; /* list of SATC units */ 266 struct acpi_dmar_header *hdr; /* ACPI header */ 267 struct dmar_dev_scope *devices; /* target devices */ 268 struct intel_iommu *iommu; /* the corresponding iommu */ 269 int devices_cnt; /* target device count */ 270 u8 atc_required:1; /* ATS is required */ 271 }; 272 273 static LIST_HEAD(dmar_atsr_units); 274 static LIST_HEAD(dmar_rmrr_units); 275 static LIST_HEAD(dmar_satc_units); 276 277 #define for_each_rmrr_units(rmrr) \ 278 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 279 280 static void device_block_translation(struct device *dev); 281 static void intel_iommu_domain_free(struct iommu_domain *domain); 282 283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 285 286 int intel_iommu_enabled = 0; 287 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 288 289 static int dmar_map_gfx = 1; 290 static int intel_iommu_superpage = 1; 291 static int iommu_identity_mapping; 292 static int iommu_skip_te_disable; 293 294 #define IDENTMAP_GFX 2 295 #define IDENTMAP_AZALIA 4 296 297 const struct iommu_ops intel_iommu_ops; 298 299 static bool translation_pre_enabled(struct intel_iommu *iommu) 300 { 301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 302 } 303 304 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 305 { 306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 307 } 308 309 static void init_translation_status(struct intel_iommu *iommu) 310 { 311 u32 gsts; 312 313 gsts = readl(iommu->reg + DMAR_GSTS_REG); 314 if (gsts & DMA_GSTS_TES) 315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 316 } 317 318 static int __init intel_iommu_setup(char *str) 319 { 320 if (!str) 321 return -EINVAL; 322 323 while (*str) { 324 if (!strncmp(str, "on", 2)) { 325 dmar_disabled = 0; 326 pr_info("IOMMU enabled\n"); 327 } else if (!strncmp(str, "off", 3)) { 328 dmar_disabled = 1; 329 no_platform_optin = 1; 330 pr_info("IOMMU disabled\n"); 331 } else if (!strncmp(str, "igfx_off", 8)) { 332 dmar_map_gfx = 0; 333 pr_info("Disable GFX device mapping\n"); 334 } else if (!strncmp(str, "forcedac", 8)) { 335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 336 iommu_dma_forcedac = true; 337 } else if (!strncmp(str, "strict", 6)) { 338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 339 iommu_set_dma_strict(); 340 } else if (!strncmp(str, "sp_off", 6)) { 341 pr_info("Disable supported super page\n"); 342 intel_iommu_superpage = 0; 343 } else if (!strncmp(str, "sm_on", 5)) { 344 pr_info("Enable scalable mode if hardware supports\n"); 345 intel_iommu_sm = 1; 346 } else if (!strncmp(str, "sm_off", 6)) { 347 pr_info("Scalable mode is disallowed\n"); 348 intel_iommu_sm = 0; 349 } 
else if (!strncmp(str, "tboot_noforce", 13)) { 350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 351 intel_iommu_tboot_noforce = 1; 352 } else { 353 pr_notice("Unknown option - '%s'\n", str); 354 } 355 356 str += strcspn(str, ","); 357 while (*str == ',') 358 str++; 359 } 360 361 return 1; 362 } 363 __setup("intel_iommu=", intel_iommu_setup); 364 365 void *alloc_pgtable_page(int node, gfp_t gfp) 366 { 367 struct page *page; 368 void *vaddr = NULL; 369 370 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 371 if (page) 372 vaddr = page_address(page); 373 return vaddr; 374 } 375 376 void free_pgtable_page(void *vaddr) 377 { 378 free_page((unsigned long)vaddr); 379 } 380 381 static inline int domain_type_is_si(struct dmar_domain *domain) 382 { 383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 384 } 385 386 static inline int domain_pfn_supported(struct dmar_domain *domain, 387 unsigned long pfn) 388 { 389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 390 391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 392 } 393 394 /* 395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 397 * the returned SAGAW. 398 */ 399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 400 { 401 unsigned long fl_sagaw, sl_sagaw; 402 403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 404 sl_sagaw = cap_sagaw(iommu->cap); 405 406 /* Second level only. */ 407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 408 return sl_sagaw; 409 410 /* First level only. */ 411 if (!ecap_slts(iommu->ecap)) 412 return fl_sagaw; 413 414 return fl_sagaw & sl_sagaw; 415 } 416 417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 418 { 419 unsigned long sagaw; 420 int agaw; 421 422 sagaw = __iommu_calculate_sagaw(iommu); 423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 424 if (test_bit(agaw, &sagaw)) 425 break; 426 } 427 428 return agaw; 429 } 430 431 /* 432 * Calculate max SAGAW for each iommu. 433 */ 434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 435 { 436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 437 } 438 439 /* 440 * calculate agaw for each iommu. 441 * "SAGAW" may be different across iommus, use a default agaw, and 442 * get a supported less agaw for iommus that don't support the default agaw. 443 */ 444 int iommu_calculate_agaw(struct intel_iommu *iommu) 445 { 446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 447 } 448 449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 450 { 451 return sm_supported(iommu) ? 
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 453 } 454 455 static void domain_update_iommu_coherency(struct dmar_domain *domain) 456 { 457 struct iommu_domain_info *info; 458 struct dmar_drhd_unit *drhd; 459 struct intel_iommu *iommu; 460 bool found = false; 461 unsigned long i; 462 463 domain->iommu_coherency = true; 464 xa_for_each(&domain->iommu_array, i, info) { 465 found = true; 466 if (!iommu_paging_structure_coherency(info->iommu)) { 467 domain->iommu_coherency = false; 468 break; 469 } 470 } 471 if (found) 472 return; 473 474 /* No hardware attached; use lowest common denominator */ 475 rcu_read_lock(); 476 for_each_active_iommu(iommu, drhd) { 477 if (!iommu_paging_structure_coherency(iommu)) { 478 domain->iommu_coherency = false; 479 break; 480 } 481 } 482 rcu_read_unlock(); 483 } 484 485 static int domain_update_iommu_superpage(struct dmar_domain *domain, 486 struct intel_iommu *skip) 487 { 488 struct dmar_drhd_unit *drhd; 489 struct intel_iommu *iommu; 490 int mask = 0x3; 491 492 if (!intel_iommu_superpage) 493 return 0; 494 495 /* set iommu_superpage to the smallest common denominator */ 496 rcu_read_lock(); 497 for_each_active_iommu(iommu, drhd) { 498 if (iommu != skip) { 499 if (domain && domain->use_first_level) { 500 if (!cap_fl1gp_support(iommu->cap)) 501 mask = 0x1; 502 } else { 503 mask &= cap_super_page_val(iommu->cap); 504 } 505 506 if (!mask) 507 break; 508 } 509 } 510 rcu_read_unlock(); 511 512 return fls(mask); 513 } 514 515 static int domain_update_device_node(struct dmar_domain *domain) 516 { 517 struct device_domain_info *info; 518 int nid = NUMA_NO_NODE; 519 unsigned long flags; 520 521 spin_lock_irqsave(&domain->lock, flags); 522 list_for_each_entry(info, &domain->devices, link) { 523 /* 524 * There could possibly be multiple device numa nodes as devices 525 * within the same domain may sit behind different IOMMUs. There 526 * isn't perfect answer in such situation, so we select first 527 * come first served policy. 528 */ 529 nid = dev_to_node(info->dev); 530 if (nid != NUMA_NO_NODE) 531 break; 532 } 533 spin_unlock_irqrestore(&domain->lock, flags); 534 535 return nid; 536 } 537 538 static void domain_update_iotlb(struct dmar_domain *domain); 539 540 /* Return the super pagesize bitmap if supported. */ 541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 542 { 543 unsigned long bitmap = 0; 544 545 /* 546 * 1-level super page supports page size of 2MiB, 2-level super page 547 * supports page size of both 2MiB and 1GiB. 548 */ 549 if (domain->iommu_superpage == 1) 550 bitmap |= SZ_2M; 551 else if (domain->iommu_superpage == 2) 552 bitmap |= SZ_2M | SZ_1G; 553 554 return bitmap; 555 } 556 557 /* Some capabilities may be different across iommus */ 558 static void domain_update_iommu_cap(struct dmar_domain *domain) 559 { 560 domain_update_iommu_coherency(domain); 561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 562 563 /* 564 * If RHSA is missing, we should default to the device numa domain 565 * as fall back. 566 */ 567 if (domain->nid == NUMA_NO_NODE) 568 domain->nid = domain_update_device_node(domain); 569 570 /* 571 * First-level translation restricts the input-address to a 572 * canonical address (i.e., address bits 63:N have the same 573 * value as address bit [N-1], where N is 48-bits with 4-level 574 * paging and 57-bits with 5-level paging). Hence, skip bit 575 * [N-1]. 
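	 * For example, with 4-level first-level paging (gaw == 48) the
	 * aperture below ends at __DOMAIN_MAX_ADDR(47), the last IOVA before
	 * the non-canonical hole.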
576 */ 577 if (domain->use_first_level) 578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 579 else 580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 581 582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 583 domain_update_iotlb(domain); 584 } 585 586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 587 u8 devfn, int alloc) 588 { 589 struct root_entry *root = &iommu->root_entry[bus]; 590 struct context_entry *context; 591 u64 *entry; 592 593 /* 594 * Except that the caller requested to allocate a new entry, 595 * returning a copied context entry makes no sense. 596 */ 597 if (!alloc && context_copied(iommu, bus, devfn)) 598 return NULL; 599 600 entry = &root->lo; 601 if (sm_supported(iommu)) { 602 if (devfn >= 0x80) { 603 devfn -= 0x80; 604 entry = &root->hi; 605 } 606 devfn *= 2; 607 } 608 if (*entry & 1) 609 context = phys_to_virt(*entry & VTD_PAGE_MASK); 610 else { 611 unsigned long phy_addr; 612 if (!alloc) 613 return NULL; 614 615 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 616 if (!context) 617 return NULL; 618 619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 620 phy_addr = virt_to_phys((void *)context); 621 *entry = phy_addr | 1; 622 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 623 } 624 return &context[devfn]; 625 } 626 627 /** 628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 629 * sub-hierarchy of a candidate PCI-PCI bridge 630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 631 * @bridge: the candidate PCI-PCI bridge 632 * 633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 634 */ 635 static bool 636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 637 { 638 struct pci_dev *pdev, *pbridge; 639 640 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 641 return false; 642 643 pdev = to_pci_dev(dev); 644 pbridge = to_pci_dev(bridge); 645 646 if (pbridge->subordinate && 647 pbridge->subordinate->number <= pdev->bus->number && 648 pbridge->subordinate->busn_res.end >= pdev->bus->number) 649 return true; 650 651 return false; 652 } 653 654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 655 { 656 struct dmar_drhd_unit *drhd; 657 u32 vtbar; 658 int rc; 659 660 /* We know that this device on this chipset has its own IOMMU. 661 * If we find it under a different IOMMU, then the BIOS is lying 662 * to us. Hope that the IOMMU for this device is actually 663 * disabled, and it needs no translation... 
664 */ 665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 666 if (rc) { 667 /* "can't" happen */ 668 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 669 return false; 670 } 671 vtbar &= 0xffff0000; 672 673 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 674 drhd = dmar_find_matched_drhd_unit(pdev); 675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 678 return true; 679 } 680 681 return false; 682 } 683 684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 685 { 686 if (!iommu || iommu->drhd->ignored) 687 return true; 688 689 if (dev_is_pci(dev)) { 690 struct pci_dev *pdev = to_pci_dev(dev); 691 692 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 694 quirk_ioat_snb_local_iommu(pdev)) 695 return true; 696 } 697 698 return false; 699 } 700 701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 702 { 703 struct dmar_drhd_unit *drhd = NULL; 704 struct pci_dev *pdev = NULL; 705 struct intel_iommu *iommu; 706 struct device *tmp; 707 u16 segment = 0; 708 int i; 709 710 if (!dev) 711 return NULL; 712 713 if (dev_is_pci(dev)) { 714 struct pci_dev *pf_pdev; 715 716 pdev = pci_real_dma_dev(to_pci_dev(dev)); 717 718 /* VFs aren't listed in scope tables; we need to look up 719 * the PF instead to find the IOMMU. */ 720 pf_pdev = pci_physfn(pdev); 721 dev = &pf_pdev->dev; 722 segment = pci_domain_nr(pdev->bus); 723 } else if (has_acpi_companion(dev)) 724 dev = &ACPI_COMPANION(dev)->dev; 725 726 rcu_read_lock(); 727 for_each_iommu(iommu, drhd) { 728 if (pdev && segment != drhd->segment) 729 continue; 730 731 for_each_active_dev_scope(drhd->devices, 732 drhd->devices_cnt, i, tmp) { 733 if (tmp == dev) { 734 /* For a VF use its original BDF# not that of the PF 735 * which we used for the IOMMU lookup. Strictly speaking 736 * we could do this for all PCI devices; we only need to 737 * get the BDF# from the scope table for ACPI matches. 
*/ 738 if (pdev && pdev->is_virtfn) 739 goto got_pdev; 740 741 if (bus && devfn) { 742 *bus = drhd->devices[i].bus; 743 *devfn = drhd->devices[i].devfn; 744 } 745 goto out; 746 } 747 748 if (is_downstream_to_pci_bridge(dev, tmp)) 749 goto got_pdev; 750 } 751 752 if (pdev && drhd->include_all) { 753 got_pdev: 754 if (bus && devfn) { 755 *bus = pdev->bus->number; 756 *devfn = pdev->devfn; 757 } 758 goto out; 759 } 760 } 761 iommu = NULL; 762 out: 763 if (iommu_is_dummy(iommu, dev)) 764 iommu = NULL; 765 766 rcu_read_unlock(); 767 768 return iommu; 769 } 770 771 static void domain_flush_cache(struct dmar_domain *domain, 772 void *addr, int size) 773 { 774 if (!domain->iommu_coherency) 775 clflush_cache_range(addr, size); 776 } 777 778 static void free_context_table(struct intel_iommu *iommu) 779 { 780 struct context_entry *context; 781 int i; 782 783 if (!iommu->root_entry) 784 return; 785 786 for (i = 0; i < ROOT_ENTRY_NR; i++) { 787 context = iommu_context_addr(iommu, i, 0, 0); 788 if (context) 789 free_pgtable_page(context); 790 791 if (!sm_supported(iommu)) 792 continue; 793 794 context = iommu_context_addr(iommu, i, 0x80, 0); 795 if (context) 796 free_pgtable_page(context); 797 } 798 799 free_pgtable_page(iommu->root_entry); 800 iommu->root_entry = NULL; 801 } 802 803 #ifdef CONFIG_DMAR_DEBUG 804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 805 u8 bus, u8 devfn, struct dma_pte *parent, int level) 806 { 807 struct dma_pte *pte; 808 int offset; 809 810 while (1) { 811 offset = pfn_level_offset(pfn, level); 812 pte = &parent[offset]; 813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 814 pr_info("PTE not present at level %d\n", level); 815 break; 816 } 817 818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 819 820 if (level == 1) 821 break; 822 823 parent = phys_to_virt(dma_pte_addr(pte)); 824 level--; 825 } 826 } 827 828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 829 unsigned long long addr, u32 pasid) 830 { 831 struct pasid_dir_entry *dir, *pde; 832 struct pasid_entry *entries, *pte; 833 struct context_entry *ctx_entry; 834 struct root_entry *rt_entry; 835 int i, dir_index, index, level; 836 u8 devfn = source_id & 0xff; 837 u8 bus = source_id >> 8; 838 struct dma_pte *pgtable; 839 840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 841 842 /* root entry dump */ 843 rt_entry = &iommu->root_entry[bus]; 844 if (!rt_entry) { 845 pr_info("root table entry is not present\n"); 846 return; 847 } 848 849 if (sm_supported(iommu)) 850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 851 rt_entry->hi, rt_entry->lo); 852 else 853 pr_info("root entry: 0x%016llx", rt_entry->lo); 854 855 /* context entry dump */ 856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 857 if (!ctx_entry) { 858 pr_info("context table entry is not present\n"); 859 return; 860 } 861 862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 863 ctx_entry->hi, ctx_entry->lo); 864 865 /* legacy mode does not require PASID entries */ 866 if (!sm_supported(iommu)) { 867 level = agaw_to_level(ctx_entry->hi & 7); 868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 869 goto pgtable_walk; 870 } 871 872 /* get the pointer to pasid directory entry */ 873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 874 if (!dir) { 875 pr_info("pasid directory entry is not present\n"); 876 return; 877 } 878 /* For request-without-pasid, get the pasid from context entry */ 879 if (intel_iommu_sm && pasid == 
IOMMU_PASID_INVALID) 880 pasid = PASID_RID2PASID; 881 882 dir_index = pasid >> PASID_PDE_SHIFT; 883 pde = &dir[dir_index]; 884 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 885 886 /* get the pointer to the pasid table entry */ 887 entries = get_pasid_table_from_pde(pde); 888 if (!entries) { 889 pr_info("pasid table entry is not present\n"); 890 return; 891 } 892 index = pasid & PASID_PTE_MASK; 893 pte = &entries[index]; 894 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 896 897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 900 } else { 901 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 903 } 904 905 pgtable_walk: 906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 907 } 908 #endif 909 910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 911 unsigned long pfn, int *target_level, 912 gfp_t gfp) 913 { 914 struct dma_pte *parent, *pte; 915 int level = agaw_to_level(domain->agaw); 916 int offset; 917 918 if (!domain_pfn_supported(domain, pfn)) 919 /* Address beyond IOMMU's addressing capabilities. */ 920 return NULL; 921 922 parent = domain->pgd; 923 924 while (1) { 925 void *tmp_page; 926 927 offset = pfn_level_offset(pfn, level); 928 pte = &parent[offset]; 929 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 930 break; 931 if (level == *target_level) 932 break; 933 934 if (!dma_pte_present(pte)) { 935 uint64_t pteval; 936 937 tmp_page = alloc_pgtable_page(domain->nid, gfp); 938 939 if (!tmp_page) 940 return NULL; 941 942 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 943 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 944 if (domain->use_first_level) 945 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 946 947 if (cmpxchg64(&pte->val, 0ULL, pteval)) 948 /* Someone else set it while we were thinking; use theirs. 
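				   The lock-free cmpxchg64() above is what makes
				   concurrent callers safe: one writer installs
				   the new table, the losers free theirs below.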
*/ 949 free_pgtable_page(tmp_page); 950 else 951 domain_flush_cache(domain, pte, sizeof(*pte)); 952 } 953 if (level == 1) 954 break; 955 956 parent = phys_to_virt(dma_pte_addr(pte)); 957 level--; 958 } 959 960 if (!*target_level) 961 *target_level = level; 962 963 return pte; 964 } 965 966 /* return address's pte at specific level */ 967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 968 unsigned long pfn, 969 int level, int *large_page) 970 { 971 struct dma_pte *parent, *pte; 972 int total = agaw_to_level(domain->agaw); 973 int offset; 974 975 parent = domain->pgd; 976 while (level <= total) { 977 offset = pfn_level_offset(pfn, total); 978 pte = &parent[offset]; 979 if (level == total) 980 return pte; 981 982 if (!dma_pte_present(pte)) { 983 *large_page = total; 984 break; 985 } 986 987 if (dma_pte_superpage(pte)) { 988 *large_page = total; 989 return pte; 990 } 991 992 parent = phys_to_virt(dma_pte_addr(pte)); 993 total--; 994 } 995 return NULL; 996 } 997 998 /* clear last level pte, a tlb flush should be followed */ 999 static void dma_pte_clear_range(struct dmar_domain *domain, 1000 unsigned long start_pfn, 1001 unsigned long last_pfn) 1002 { 1003 unsigned int large_page; 1004 struct dma_pte *first_pte, *pte; 1005 1006 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1007 WARN_ON(start_pfn > last_pfn)) 1008 return; 1009 1010 /* we don't need lock here; nobody else touches the iova range */ 1011 do { 1012 large_page = 1; 1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1014 if (!pte) { 1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1016 continue; 1017 } 1018 do { 1019 dma_clear_pte(pte); 1020 start_pfn += lvl_to_nr_pages(large_page); 1021 pte++; 1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1023 1024 domain_flush_cache(domain, first_pte, 1025 (void *)pte - (void *)first_pte); 1026 1027 } while (start_pfn && start_pfn <= last_pfn); 1028 } 1029 1030 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1031 int retain_level, struct dma_pte *pte, 1032 unsigned long pfn, unsigned long start_pfn, 1033 unsigned long last_pfn) 1034 { 1035 pfn = max(start_pfn, pfn); 1036 pte = &pte[pfn_level_offset(pfn, level)]; 1037 1038 do { 1039 unsigned long level_pfn; 1040 struct dma_pte *level_pte; 1041 1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1043 goto next; 1044 1045 level_pfn = pfn & level_mask(level); 1046 level_pte = phys_to_virt(dma_pte_addr(pte)); 1047 1048 if (level > 2) { 1049 dma_pte_free_level(domain, level - 1, retain_level, 1050 level_pte, level_pfn, start_pfn, 1051 last_pfn); 1052 } 1053 1054 /* 1055 * Free the page table if we're below the level we want to 1056 * retain and the range covers the entire table. 1057 */ 1058 if (level < retain_level && !(start_pfn > level_pfn || 1059 last_pfn < level_pfn + level_size(level) - 1)) { 1060 dma_clear_pte(pte); 1061 domain_flush_cache(domain, pte, sizeof(*pte)); 1062 free_pgtable_page(level_pte); 1063 } 1064 next: 1065 pfn += level_size(level); 1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1067 } 1068 1069 /* 1070 * clear last level (leaf) ptes and free page table pages below the 1071 * level we wish to keep intact. 
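 * switch_to_super_page() relies on this: it passes retain_level = level + 1,
 * so the page directory that will hold the new superpage entry survives
 * while the small leaf tables beneath it are freed.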
1072 */ 1073 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1074 unsigned long start_pfn, 1075 unsigned long last_pfn, 1076 int retain_level) 1077 { 1078 dma_pte_clear_range(domain, start_pfn, last_pfn); 1079 1080 /* We don't need lock here; nobody else touches the iova range */ 1081 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1082 domain->pgd, 0, start_pfn, last_pfn); 1083 1084 /* free pgd */ 1085 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1086 free_pgtable_page(domain->pgd); 1087 domain->pgd = NULL; 1088 } 1089 } 1090 1091 /* When a page at a given level is being unlinked from its parent, we don't 1092 need to *modify* it at all. All we need to do is make a list of all the 1093 pages which can be freed just as soon as we've flushed the IOTLB and we 1094 know the hardware page-walk will no longer touch them. 1095 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1096 be freed. */ 1097 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1098 int level, struct dma_pte *pte, 1099 struct list_head *freelist) 1100 { 1101 struct page *pg; 1102 1103 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1104 list_add_tail(&pg->lru, freelist); 1105 1106 if (level == 1) 1107 return; 1108 1109 pte = page_address(pg); 1110 do { 1111 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1112 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1113 pte++; 1114 } while (!first_pte_in_page(pte)); 1115 } 1116 1117 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1118 struct dma_pte *pte, unsigned long pfn, 1119 unsigned long start_pfn, unsigned long last_pfn, 1120 struct list_head *freelist) 1121 { 1122 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1123 1124 pfn = max(start_pfn, pfn); 1125 pte = &pte[pfn_level_offset(pfn, level)]; 1126 1127 do { 1128 unsigned long level_pfn = pfn & level_mask(level); 1129 1130 if (!dma_pte_present(pte)) 1131 goto next; 1132 1133 /* If range covers entire pagetable, free it */ 1134 if (start_pfn <= level_pfn && 1135 last_pfn >= level_pfn + level_size(level) - 1) { 1136 /* These suborbinate page tables are going away entirely. Don't 1137 bother to clear them; we're just going to *free* them. */ 1138 if (level > 1 && !dma_pte_superpage(pte)) 1139 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1140 1141 dma_clear_pte(pte); 1142 if (!first_pte) 1143 first_pte = pte; 1144 last_pte = pte; 1145 } else if (level > 1) { 1146 /* Recurse down into a level that isn't *entirely* obsolete */ 1147 dma_pte_clear_level(domain, level - 1, 1148 phys_to_virt(dma_pte_addr(pte)), 1149 level_pfn, start_pfn, last_pfn, 1150 freelist); 1151 } 1152 next: 1153 pfn = level_pfn + level_size(level); 1154 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1155 1156 if (first_pte) 1157 domain_flush_cache(domain, first_pte, 1158 (void *)++last_pte - (void *)first_pte); 1159 } 1160 1161 /* We can't just free the pages because the IOMMU may still be walking 1162 the page tables, and may have cached the intermediate levels. The 1163 pages can only be freed after the IOTLB flush has been done. 
*/ 1164 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1165 unsigned long last_pfn, struct list_head *freelist) 1166 { 1167 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1168 WARN_ON(start_pfn > last_pfn)) 1169 return; 1170 1171 /* we don't need lock here; nobody else touches the iova range */ 1172 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1173 domain->pgd, 0, start_pfn, last_pfn, freelist); 1174 1175 /* free pgd */ 1176 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1177 struct page *pgd_page = virt_to_page(domain->pgd); 1178 list_add_tail(&pgd_page->lru, freelist); 1179 domain->pgd = NULL; 1180 } 1181 } 1182 1183 /* iommu handling */ 1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1185 { 1186 struct root_entry *root; 1187 1188 root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC); 1189 if (!root) { 1190 pr_err("Allocating root entry for %s failed\n", 1191 iommu->name); 1192 return -ENOMEM; 1193 } 1194 1195 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1196 iommu->root_entry = root; 1197 1198 return 0; 1199 } 1200 1201 static void iommu_set_root_entry(struct intel_iommu *iommu) 1202 { 1203 u64 addr; 1204 u32 sts; 1205 unsigned long flag; 1206 1207 addr = virt_to_phys(iommu->root_entry); 1208 if (sm_supported(iommu)) 1209 addr |= DMA_RTADDR_SMT; 1210 1211 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1212 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1213 1214 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1215 1216 /* Make sure hardware complete it */ 1217 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1218 readl, (sts & DMA_GSTS_RTPS), sts); 1219 1220 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1221 1222 /* 1223 * Hardware invalidates all DMA remapping hardware translation 1224 * caches as part of SRTP flow. 
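	 * That behaviour is advertised by the ESRTPS capability checked just
	 * below; without it we fall back to explicit global context, PASID
	 * and IOTLB invalidations.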
1225 */ 1226 if (cap_esrtps(iommu->cap)) 1227 return; 1228 1229 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1230 if (sm_supported(iommu)) 1231 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1232 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1233 } 1234 1235 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1236 { 1237 u32 val; 1238 unsigned long flag; 1239 1240 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1241 return; 1242 1243 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1244 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1245 1246 /* Make sure hardware complete it */ 1247 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1248 readl, (!(val & DMA_GSTS_WBFS)), val); 1249 1250 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1251 } 1252 1253 /* return value determine if we need a write buffer flush */ 1254 static void __iommu_flush_context(struct intel_iommu *iommu, 1255 u16 did, u16 source_id, u8 function_mask, 1256 u64 type) 1257 { 1258 u64 val = 0; 1259 unsigned long flag; 1260 1261 switch (type) { 1262 case DMA_CCMD_GLOBAL_INVL: 1263 val = DMA_CCMD_GLOBAL_INVL; 1264 break; 1265 case DMA_CCMD_DOMAIN_INVL: 1266 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1267 break; 1268 case DMA_CCMD_DEVICE_INVL: 1269 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1270 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1271 break; 1272 default: 1273 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1274 iommu->name, type); 1275 return; 1276 } 1277 val |= DMA_CCMD_ICC; 1278 1279 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1280 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1281 1282 /* Make sure hardware complete it */ 1283 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1284 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1285 1286 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1287 } 1288 1289 /* return value determine if we need a write buffer flush */ 1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1291 u64 addr, unsigned int size_order, u64 type) 1292 { 1293 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1294 u64 val = 0, val_iva = 0; 1295 unsigned long flag; 1296 1297 switch (type) { 1298 case DMA_TLB_GLOBAL_FLUSH: 1299 /* global flush doesn't need set IVA_REG */ 1300 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1301 break; 1302 case DMA_TLB_DSI_FLUSH: 1303 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1304 break; 1305 case DMA_TLB_PSI_FLUSH: 1306 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1307 /* IH bit is passed in as part of address */ 1308 val_iva = size_order | addr; 1309 break; 1310 default: 1311 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1312 iommu->name, type); 1313 return; 1314 } 1315 /* Note: set drain read/write */ 1316 #if 0 1317 /* 1318 * This is probably to be super secure.. Looks like we can 1319 * ignore it without any impact. 
1320 */ 1321 if (cap_read_drain(iommu->cap)) 1322 val |= DMA_TLB_READ_DRAIN; 1323 #endif 1324 if (cap_write_drain(iommu->cap)) 1325 val |= DMA_TLB_WRITE_DRAIN; 1326 1327 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1328 /* Note: Only uses first TLB reg currently */ 1329 if (val_iva) 1330 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1331 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1332 1333 /* Make sure hardware complete it */ 1334 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1335 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1336 1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1338 1339 /* check IOTLB invalidation granularity */ 1340 if (DMA_TLB_IAIG(val) == 0) 1341 pr_err("Flush IOTLB failed\n"); 1342 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1343 pr_debug("TLB flush request %Lx, actual %Lx\n", 1344 (unsigned long long)DMA_TLB_IIRG(type), 1345 (unsigned long long)DMA_TLB_IAIG(val)); 1346 } 1347 1348 static struct device_domain_info * 1349 domain_lookup_dev_info(struct dmar_domain *domain, 1350 struct intel_iommu *iommu, u8 bus, u8 devfn) 1351 { 1352 struct device_domain_info *info; 1353 unsigned long flags; 1354 1355 spin_lock_irqsave(&domain->lock, flags); 1356 list_for_each_entry(info, &domain->devices, link) { 1357 if (info->iommu == iommu && info->bus == bus && 1358 info->devfn == devfn) { 1359 spin_unlock_irqrestore(&domain->lock, flags); 1360 return info; 1361 } 1362 } 1363 spin_unlock_irqrestore(&domain->lock, flags); 1364 1365 return NULL; 1366 } 1367 1368 static void domain_update_iotlb(struct dmar_domain *domain) 1369 { 1370 struct device_domain_info *info; 1371 bool has_iotlb_device = false; 1372 unsigned long flags; 1373 1374 spin_lock_irqsave(&domain->lock, flags); 1375 list_for_each_entry(info, &domain->devices, link) { 1376 if (info->ats_enabled) { 1377 has_iotlb_device = true; 1378 break; 1379 } 1380 } 1381 domain->has_iotlb_device = has_iotlb_device; 1382 spin_unlock_irqrestore(&domain->lock, flags); 1383 } 1384 1385 /* 1386 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1387 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1388 * check because it applies only to the built-in QAT devices and it doesn't 1389 * grant additional privileges. 1390 */ 1391 #define BUGGY_QAT_DEVID_MASK 0x4940 1392 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1393 { 1394 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1395 return false; 1396 1397 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1398 return false; 1399 1400 return true; 1401 } 1402 1403 static void iommu_enable_pci_caps(struct device_domain_info *info) 1404 { 1405 struct pci_dev *pdev; 1406 1407 if (!dev_is_pci(info->dev)) 1408 return; 1409 1410 pdev = to_pci_dev(info->dev); 1411 1412 /* The PCIe spec, in its wisdom, declares that the behaviour of 1413 the device if you enable PASID support after ATS support is 1414 undefined. So always enable PASID support on devices which 1415 have it, even if we can't yet know if we're ever going to 1416 use it. 
*/ 1417 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1418 info->pasid_enabled = 1; 1419 1420 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1421 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1422 info->ats_enabled = 1; 1423 domain_update_iotlb(info->domain); 1424 } 1425 } 1426 1427 static void iommu_disable_pci_caps(struct device_domain_info *info) 1428 { 1429 struct pci_dev *pdev; 1430 1431 if (!dev_is_pci(info->dev)) 1432 return; 1433 1434 pdev = to_pci_dev(info->dev); 1435 1436 if (info->ats_enabled) { 1437 pci_disable_ats(pdev); 1438 info->ats_enabled = 0; 1439 domain_update_iotlb(info->domain); 1440 } 1441 1442 if (info->pasid_enabled) { 1443 pci_disable_pasid(pdev); 1444 info->pasid_enabled = 0; 1445 } 1446 } 1447 1448 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1449 u64 addr, unsigned int mask) 1450 { 1451 u16 sid, qdep; 1452 1453 if (!info || !info->ats_enabled) 1454 return; 1455 1456 sid = info->bus << 8 | info->devfn; 1457 qdep = info->ats_qdep; 1458 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1459 qdep, addr, mask); 1460 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep); 1461 } 1462 1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1464 u64 addr, unsigned mask) 1465 { 1466 struct device_domain_info *info; 1467 unsigned long flags; 1468 1469 if (!domain->has_iotlb_device) 1470 return; 1471 1472 spin_lock_irqsave(&domain->lock, flags); 1473 list_for_each_entry(info, &domain->devices, link) 1474 __iommu_flush_dev_iotlb(info, addr, mask); 1475 spin_unlock_irqrestore(&domain->lock, flags); 1476 } 1477 1478 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1479 struct dmar_domain *domain, 1480 unsigned long pfn, unsigned int pages, 1481 int ih, int map) 1482 { 1483 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1484 unsigned int mask = ilog2(aligned_pages); 1485 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1486 u16 did = domain_id_iommu(domain, iommu); 1487 1488 if (WARN_ON(!pages)) 1489 return; 1490 1491 if (ih) 1492 ih = 1 << 6; 1493 1494 if (domain->use_first_level) { 1495 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1496 } else { 1497 unsigned long bitmask = aligned_pages - 1; 1498 1499 /* 1500 * PSI masks the low order bits of the base address. If the 1501 * address isn't aligned to the mask, then compute a mask value 1502 * needed to ensure the target range is flushed. 1503 */ 1504 if (unlikely(bitmask & pfn)) { 1505 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1506 1507 /* 1508 * Since end_pfn <= pfn + bitmask, the only way bits 1509 * higher than bitmask can differ in pfn and end_pfn is 1510 * by carrying. This means after masking out bitmask, 1511 * high bits starting with the first set bit in 1512 * shared_bits are all equal in both pfn and end_pfn. 1513 */ 1514 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1515 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1516 } 1517 1518 /* 1519 * Fallback to domain selective flush if no PSI support or 1520 * the size is too big. 1521 */ 1522 if (!cap_pgsel_inv(iommu->cap) || 1523 mask > cap_max_amask_val(iommu->cap)) 1524 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1525 DMA_TLB_DSI_FLUSH); 1526 else 1527 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1528 DMA_TLB_PSI_FLUSH); 1529 } 1530 1531 /* 1532 * In caching mode, changes of pages from non-present to present require 1533 * flush. However, device IOTLB doesn't need to be flushed in this case. 
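	 * The map argument marks exactly that non-present to present case,
	 * which is why, under caching mode, the dev-TLB flush below is
	 * skipped when it is set.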
1534 */ 1535 if (!cap_caching_mode(iommu->cap) || !map) 1536 iommu_flush_dev_iotlb(domain, addr, mask); 1537 } 1538 1539 /* Notification for newly created mappings */ 1540 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1541 struct dmar_domain *domain, 1542 unsigned long pfn, unsigned int pages) 1543 { 1544 /* 1545 * It's a non-present to present mapping. Only flush if caching mode 1546 * and second level. 1547 */ 1548 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1549 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1550 else 1551 iommu_flush_write_buffer(iommu); 1552 } 1553 1554 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1555 { 1556 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1557 struct iommu_domain_info *info; 1558 unsigned long idx; 1559 1560 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1561 struct intel_iommu *iommu = info->iommu; 1562 u16 did = domain_id_iommu(dmar_domain, iommu); 1563 1564 if (dmar_domain->use_first_level) 1565 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1566 else 1567 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1568 DMA_TLB_DSI_FLUSH); 1569 1570 if (!cap_caching_mode(iommu->cap)) 1571 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1572 } 1573 } 1574 1575 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1576 { 1577 u32 pmen; 1578 unsigned long flags; 1579 1580 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1581 return; 1582 1583 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1584 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1585 pmen &= ~DMA_PMEN_EPM; 1586 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1587 1588 /* wait for the protected region status bit to clear */ 1589 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1590 readl, !(pmen & DMA_PMEN_PRS), pmen); 1591 1592 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1593 } 1594 1595 static void iommu_enable_translation(struct intel_iommu *iommu) 1596 { 1597 u32 sts; 1598 unsigned long flags; 1599 1600 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1601 iommu->gcmd |= DMA_GCMD_TE; 1602 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1603 1604 /* Make sure hardware complete it */ 1605 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1606 readl, (sts & DMA_GSTS_TES), sts); 1607 1608 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1609 } 1610 1611 static void iommu_disable_translation(struct intel_iommu *iommu) 1612 { 1613 u32 sts; 1614 unsigned long flag; 1615 1616 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1617 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1618 return; 1619 1620 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1621 iommu->gcmd &= ~DMA_GCMD_TE; 1622 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1623 1624 /* Make sure hardware complete it */ 1625 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1626 readl, (!(sts & DMA_GSTS_TES)), sts); 1627 1628 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1629 } 1630 1631 static int iommu_init_domains(struct intel_iommu *iommu) 1632 { 1633 u32 ndomains; 1634 1635 ndomains = cap_ndoms(iommu->cap); 1636 pr_debug("%s: Number of Domains supported <%d>\n", 1637 iommu->name, ndomains); 1638 1639 spin_lock_init(&iommu->lock); 1640 1641 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1642 if (!iommu->domain_ids) 1643 return -ENOMEM; 1644 1645 /* 1646 * If Caching mode is set, then invalid translations are tagged 1647 * with domain-id 0, hence we need to pre-allocate it. 
We also 1648 * use domain-id 0 as a marker for non-allocated domain-id, so 1649 * make sure it is not used for a real domain. 1650 */ 1651 set_bit(0, iommu->domain_ids); 1652 1653 /* 1654 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1655 * entry for first-level or pass-through translation modes should 1656 * be programmed with a domain id different from those used for 1657 * second-level or nested translation. We reserve a domain id for 1658 * this purpose. 1659 */ 1660 if (sm_supported(iommu)) 1661 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1662 1663 return 0; 1664 } 1665 1666 static void disable_dmar_iommu(struct intel_iommu *iommu) 1667 { 1668 if (!iommu->domain_ids) 1669 return; 1670 1671 /* 1672 * All iommu domains must have been detached from the devices, 1673 * hence there should be no domain IDs in use. 1674 */ 1675 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1676 > NUM_RESERVED_DID)) 1677 return; 1678 1679 if (iommu->gcmd & DMA_GCMD_TE) 1680 iommu_disable_translation(iommu); 1681 } 1682 1683 static void free_dmar_iommu(struct intel_iommu *iommu) 1684 { 1685 if (iommu->domain_ids) { 1686 bitmap_free(iommu->domain_ids); 1687 iommu->domain_ids = NULL; 1688 } 1689 1690 if (iommu->copied_tables) { 1691 bitmap_free(iommu->copied_tables); 1692 iommu->copied_tables = NULL; 1693 } 1694 1695 /* free context mapping */ 1696 free_context_table(iommu); 1697 1698 #ifdef CONFIG_INTEL_IOMMU_SVM 1699 if (pasid_supported(iommu)) { 1700 if (ecap_prs(iommu->ecap)) 1701 intel_svm_finish_prq(iommu); 1702 } 1703 #endif 1704 } 1705 1706 /* 1707 * Check and return whether first level is used by default for 1708 * DMA translation. 1709 */ 1710 static bool first_level_by_default(unsigned int type) 1711 { 1712 /* Only SL is available in legacy mode */ 1713 if (!scalable_mode_support()) 1714 return false; 1715 1716 /* Only level (either FL or SL) is available, just use it */ 1717 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1718 return intel_cap_flts_sanity(); 1719 1720 /* Both levels are available, decide it based on domain type */ 1721 return type != IOMMU_DOMAIN_UNMANAGED; 1722 } 1723 1724 static struct dmar_domain *alloc_domain(unsigned int type) 1725 { 1726 struct dmar_domain *domain; 1727 1728 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1729 if (!domain) 1730 return NULL; 1731 1732 domain->nid = NUMA_NO_NODE; 1733 if (first_level_by_default(type)) 1734 domain->use_first_level = true; 1735 domain->has_iotlb_device = false; 1736 INIT_LIST_HEAD(&domain->devices); 1737 spin_lock_init(&domain->lock); 1738 xa_init(&domain->iommu_array); 1739 1740 return domain; 1741 } 1742 1743 static int domain_attach_iommu(struct dmar_domain *domain, 1744 struct intel_iommu *iommu) 1745 { 1746 struct iommu_domain_info *info, *curr; 1747 unsigned long ndomains; 1748 int num, ret = -ENOSPC; 1749 1750 info = kzalloc(sizeof(*info), GFP_KERNEL); 1751 if (!info) 1752 return -ENOMEM; 1753 1754 spin_lock(&iommu->lock); 1755 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1756 if (curr) { 1757 curr->refcnt++; 1758 spin_unlock(&iommu->lock); 1759 kfree(info); 1760 return 0; 1761 } 1762 1763 ndomains = cap_ndoms(iommu->cap); 1764 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1765 if (num >= ndomains) { 1766 pr_err("%s: No free domain ids\n", iommu->name); 1767 goto err_unlock; 1768 } 1769 1770 set_bit(num, iommu->domain_ids); 1771 info->refcnt = 1; 1772 info->did = num; 1773 info->iommu = iommu; 1774 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1775 
NULL, info, GFP_ATOMIC); 1776 if (curr) { 1777 ret = xa_err(curr) ? : -EBUSY; 1778 goto err_clear; 1779 } 1780 domain_update_iommu_cap(domain); 1781 1782 spin_unlock(&iommu->lock); 1783 return 0; 1784 1785 err_clear: 1786 clear_bit(info->did, iommu->domain_ids); 1787 err_unlock: 1788 spin_unlock(&iommu->lock); 1789 kfree(info); 1790 return ret; 1791 } 1792 1793 static void domain_detach_iommu(struct dmar_domain *domain, 1794 struct intel_iommu *iommu) 1795 { 1796 struct iommu_domain_info *info; 1797 1798 spin_lock(&iommu->lock); 1799 info = xa_load(&domain->iommu_array, iommu->seq_id); 1800 if (--info->refcnt == 0) { 1801 clear_bit(info->did, iommu->domain_ids); 1802 xa_erase(&domain->iommu_array, iommu->seq_id); 1803 domain->nid = NUMA_NO_NODE; 1804 domain_update_iommu_cap(domain); 1805 kfree(info); 1806 } 1807 spin_unlock(&iommu->lock); 1808 } 1809 1810 static inline int guestwidth_to_adjustwidth(int gaw) 1811 { 1812 int agaw; 1813 int r = (gaw - 12) % 9; 1814 1815 if (r == 0) 1816 agaw = gaw; 1817 else 1818 agaw = gaw + 9 - r; 1819 if (agaw > 64) 1820 agaw = 64; 1821 return agaw; 1822 } 1823 1824 static void domain_exit(struct dmar_domain *domain) 1825 { 1826 if (domain->pgd) { 1827 LIST_HEAD(freelist); 1828 1829 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1830 put_pages_list(&freelist); 1831 } 1832 1833 if (WARN_ON(!list_empty(&domain->devices))) 1834 return; 1835 1836 kfree(domain); 1837 } 1838 1839 /* 1840 * Get the PASID directory size for scalable mode context entry. 1841 * Value of X in the PDTS field of a scalable mode context entry 1842 * indicates PASID directory with 2^(X + 7) entries. 1843 */ 1844 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1845 { 1846 unsigned long pds, max_pde; 1847 1848 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1849 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1850 if (pds < 7) 1851 return 0; 1852 1853 return pds - 7; 1854 } 1855 1856 /* 1857 * Set the RID_PASID field of a scalable mode context entry. The 1858 * IOMMU hardware will use the PASID value set in this field for 1859 * DMA translations of DMA requests without PASID. 1860 */ 1861 static inline void 1862 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1863 { 1864 context->hi |= pasid & ((1 << 20) - 1); 1865 } 1866 1867 /* 1868 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1869 * entry. 1870 */ 1871 static inline void context_set_sm_dte(struct context_entry *context) 1872 { 1873 context->lo |= BIT_ULL(2); 1874 } 1875 1876 /* 1877 * Set the PRE(Page Request Enable) field of a scalable mode context 1878 * entry. 1879 */ 1880 static inline void context_set_sm_pre(struct context_entry *context) 1881 { 1882 context->lo |= BIT_ULL(4); 1883 } 1884 1885 /* Convert value to context PASID directory size field coding. 
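   E.g. max_pde = 0x200 makes find_first_bit() return 9, so
   context_get_sm_pds() above returns 9 - 7 = 2, and PDTS = 2 encodes a
   2^(2 + 7) = 512-entry PASID directory.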
*/ 1886 #define context_pdts(pds) (((pds) & 0x7) << 9) 1887 1888 static int domain_context_mapping_one(struct dmar_domain *domain, 1889 struct intel_iommu *iommu, 1890 struct pasid_table *table, 1891 u8 bus, u8 devfn) 1892 { 1893 struct device_domain_info *info = 1894 domain_lookup_dev_info(domain, iommu, bus, devfn); 1895 u16 did = domain_id_iommu(domain, iommu); 1896 int translation = CONTEXT_TT_MULTI_LEVEL; 1897 struct context_entry *context; 1898 int ret; 1899 1900 WARN_ON(did == 0); 1901 1902 if (hw_pass_through && domain_type_is_si(domain)) 1903 translation = CONTEXT_TT_PASS_THROUGH; 1904 1905 pr_debug("Set context mapping for %02x:%02x.%d\n", 1906 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1907 1908 spin_lock(&iommu->lock); 1909 ret = -ENOMEM; 1910 context = iommu_context_addr(iommu, bus, devfn, 1); 1911 if (!context) 1912 goto out_unlock; 1913 1914 ret = 0; 1915 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1916 goto out_unlock; 1917 1918 /* 1919 * For kdump cases, old valid entries may be cached due to the 1920 * in-flight DMA and copied pgtable, but there is no unmapping 1921 * behaviour for them, thus we need an explicit cache flush for 1922 * the newly-mapped device. For kdump, at this point, the device 1923 * is supposed to finish reset at its driver probe stage, so no 1924 * in-flight DMA will exist, and we don't need to worry anymore 1925 * hereafter. 1926 */ 1927 if (context_copied(iommu, bus, devfn)) { 1928 u16 did_old = context_domain_id(context); 1929 1930 if (did_old < cap_ndoms(iommu->cap)) { 1931 iommu->flush.flush_context(iommu, did_old, 1932 (((u16)bus) << 8) | devfn, 1933 DMA_CCMD_MASK_NOBIT, 1934 DMA_CCMD_DEVICE_INVL); 1935 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1936 DMA_TLB_DSI_FLUSH); 1937 } 1938 1939 clear_context_copied(iommu, bus, devfn); 1940 } 1941 1942 context_clear_entry(context); 1943 1944 if (sm_supported(iommu)) { 1945 unsigned long pds; 1946 1947 WARN_ON(!table); 1948 1949 /* Setup the PASID DIR pointer: */ 1950 pds = context_get_sm_pds(table); 1951 context->lo = (u64)virt_to_phys(table->table) | 1952 context_pdts(pds); 1953 1954 /* Setup the RID_PASID field: */ 1955 context_set_sm_rid2pasid(context, PASID_RID2PASID); 1956 1957 /* 1958 * Setup the Device-TLB enable bit and Page request 1959 * Enable bit: 1960 */ 1961 if (info && info->ats_supported) 1962 context_set_sm_dte(context); 1963 if (info && info->pri_supported) 1964 context_set_sm_pre(context); 1965 if (info && info->pasid_supported) 1966 context_set_pasid(context); 1967 } else { 1968 struct dma_pte *pgd = domain->pgd; 1969 int agaw; 1970 1971 context_set_domain_id(context, did); 1972 1973 if (translation != CONTEXT_TT_PASS_THROUGH) { 1974 /* 1975 * Skip top levels of page tables for iommu which has 1976 * less agaw than default. Unnecessary for PT mode. 1977 */ 1978 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 1979 ret = -ENOMEM; 1980 pgd = phys_to_virt(dma_pte_addr(pgd)); 1981 if (!dma_pte_present(pgd)) 1982 goto out_unlock; 1983 } 1984 1985 if (info && info->ats_supported) 1986 translation = CONTEXT_TT_DEV_IOTLB; 1987 else 1988 translation = CONTEXT_TT_MULTI_LEVEL; 1989 1990 context_set_address_root(context, virt_to_phys(pgd)); 1991 context_set_address_width(context, agaw); 1992 } else { 1993 /* 1994 * In pass through mode, AW must be programmed to 1995 * indicate the largest AGAW value supported by 1996 * hardware. And ASR is ignored by hardware. 
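			 * msagaw is the AGAW for the largest SAGAW bit the
			 * unit reports (cf. iommu_calculate_max_sagaw() above).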
1997 */ 1998 context_set_address_width(context, iommu->msagaw); 1999 } 2000 2001 context_set_translation_type(context, translation); 2002 } 2003 2004 context_set_fault_enable(context); 2005 context_set_present(context); 2006 if (!ecap_coherent(iommu->ecap)) 2007 clflush_cache_range(context, sizeof(*context)); 2008 2009 /* 2010 * It's a non-present to present mapping. If hardware doesn't cache 2011 * non-present entry we only need to flush the write-buffer. If the 2012 * _does_ cache non-present entries, then it does so in the special 2013 * domain #0, which we have to flush: 2014 */ 2015 if (cap_caching_mode(iommu->cap)) { 2016 iommu->flush.flush_context(iommu, 0, 2017 (((u16)bus) << 8) | devfn, 2018 DMA_CCMD_MASK_NOBIT, 2019 DMA_CCMD_DEVICE_INVL); 2020 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2021 } else { 2022 iommu_flush_write_buffer(iommu); 2023 } 2024 2025 ret = 0; 2026 2027 out_unlock: 2028 spin_unlock(&iommu->lock); 2029 2030 return ret; 2031 } 2032 2033 struct domain_context_mapping_data { 2034 struct dmar_domain *domain; 2035 struct intel_iommu *iommu; 2036 struct pasid_table *table; 2037 }; 2038 2039 static int domain_context_mapping_cb(struct pci_dev *pdev, 2040 u16 alias, void *opaque) 2041 { 2042 struct domain_context_mapping_data *data = opaque; 2043 2044 return domain_context_mapping_one(data->domain, data->iommu, 2045 data->table, PCI_BUS_NUM(alias), 2046 alias & 0xff); 2047 } 2048 2049 static int 2050 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2051 { 2052 struct domain_context_mapping_data data; 2053 struct pasid_table *table; 2054 struct intel_iommu *iommu; 2055 u8 bus, devfn; 2056 2057 iommu = device_to_iommu(dev, &bus, &devfn); 2058 if (!iommu) 2059 return -ENODEV; 2060 2061 table = intel_pasid_get_table(dev); 2062 2063 if (!dev_is_pci(dev)) 2064 return domain_context_mapping_one(domain, iommu, table, 2065 bus, devfn); 2066 2067 data.domain = domain; 2068 data.iommu = iommu; 2069 data.table = table; 2070 2071 return pci_for_each_dma_alias(to_pci_dev(dev), 2072 &domain_context_mapping_cb, &data); 2073 } 2074 2075 /* Returns a number of VTD pages, but aligned to MM page size */ 2076 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2077 size_t size) 2078 { 2079 host_addr &= ~PAGE_MASK; 2080 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2081 } 2082 2083 /* Return largest possible superpage level for a given mapping */ 2084 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2085 unsigned long iov_pfn, 2086 unsigned long phy_pfn, 2087 unsigned long pages) 2088 { 2089 int support, level = 1; 2090 unsigned long pfnmerge; 2091 2092 support = domain->iommu_superpage; 2093 2094 /* To use a large page, the virtual *and* physical addresses 2095 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2096 of them will mean we have to use smaller pages. So just 2097 merge them and check both at once. */ 2098 pfnmerge = iov_pfn | phy_pfn; 2099 2100 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2101 pages >>= VTD_STRIDE_SHIFT; 2102 if (!pages) 2103 break; 2104 pfnmerge >>= VTD_STRIDE_SHIFT; 2105 level++; 2106 support--; 2107 } 2108 return level; 2109 } 2110 2111 /* 2112 * Ensure that old small page tables are removed to make room for superpage(s). 2113 * We're going to add new large pages, so make sure we don't remove their parent 2114 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
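 * E.g. before a 2MiB (level 2) superpage is installed, the 512 4KiB
 * PTEs below the existing PDE are freed and a PSI flush covering those
 * 512 pages is issued on every IOMMU the domain is attached to.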
2115 */ 2116 static void switch_to_super_page(struct dmar_domain *domain, 2117 unsigned long start_pfn, 2118 unsigned long end_pfn, int level) 2119 { 2120 unsigned long lvl_pages = lvl_to_nr_pages(level); 2121 struct iommu_domain_info *info; 2122 struct dma_pte *pte = NULL; 2123 unsigned long i; 2124 2125 while (start_pfn <= end_pfn) { 2126 if (!pte) 2127 pte = pfn_to_dma_pte(domain, start_pfn, &level, 2128 GFP_ATOMIC); 2129 2130 if (dma_pte_present(pte)) { 2131 dma_pte_free_pagetable(domain, start_pfn, 2132 start_pfn + lvl_pages - 1, 2133 level + 1); 2134 2135 xa_for_each(&domain->iommu_array, i, info) 2136 iommu_flush_iotlb_psi(info->iommu, domain, 2137 start_pfn, lvl_pages, 2138 0, 0); 2139 } 2140 2141 pte++; 2142 start_pfn += lvl_pages; 2143 if (first_pte_in_page(pte)) 2144 pte = NULL; 2145 } 2146 } 2147 2148 static int 2149 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2150 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2151 gfp_t gfp) 2152 { 2153 struct dma_pte *first_pte = NULL, *pte = NULL; 2154 unsigned int largepage_lvl = 0; 2155 unsigned long lvl_pages = 0; 2156 phys_addr_t pteval; 2157 u64 attr; 2158 2159 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2160 return -EINVAL; 2161 2162 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2163 return -EINVAL; 2164 2165 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2166 attr |= DMA_FL_PTE_PRESENT; 2167 if (domain->use_first_level) { 2168 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2169 if (prot & DMA_PTE_WRITE) 2170 attr |= DMA_FL_PTE_DIRTY; 2171 } 2172 2173 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2174 2175 while (nr_pages > 0) { 2176 uint64_t tmp; 2177 2178 if (!pte) { 2179 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2180 phys_pfn, nr_pages); 2181 2182 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2183 gfp); 2184 if (!pte) 2185 return -ENOMEM; 2186 first_pte = pte; 2187 2188 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2189 2190 /* It is large page*/ 2191 if (largepage_lvl > 1) { 2192 unsigned long end_pfn; 2193 unsigned long pages_to_remove; 2194 2195 pteval |= DMA_PTE_LARGE_PAGE; 2196 pages_to_remove = min_t(unsigned long, nr_pages, 2197 nr_pte_to_next_page(pte) * lvl_pages); 2198 end_pfn = iov_pfn + pages_to_remove - 1; 2199 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2200 } else { 2201 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2202 } 2203 2204 } 2205 /* We don't need lock here, nobody else 2206 * touches the iova range 2207 */ 2208 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2209 if (tmp) { 2210 static int dumps = 5; 2211 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2212 iov_pfn, tmp, (unsigned long long)pteval); 2213 if (dumps) { 2214 dumps--; 2215 debug_dma_dump_mappings(NULL); 2216 } 2217 WARN_ON(1); 2218 } 2219 2220 nr_pages -= lvl_pages; 2221 iov_pfn += lvl_pages; 2222 phys_pfn += lvl_pages; 2223 pteval += lvl_pages * VTD_PAGE_SIZE; 2224 2225 /* If the next PTE would be the first in a new page, then we 2226 * need to flush the cache on the entries we've just written. 2227 * And then we'll need to recalculate 'pte', so clear it and 2228 * let it get set again in the if (!pte) block above. 2229 * 2230 * If we're done (!nr_pages) we need to flush the cache too. 
2231 * 2232 * Also if we've been setting superpages, we may need to 2233 * recalculate 'pte' and switch back to smaller pages for the 2234 * end of the mapping, if the trailing size is not enough to 2235 * use another superpage (i.e. nr_pages < lvl_pages). 2236 */ 2237 pte++; 2238 if (!nr_pages || first_pte_in_page(pte) || 2239 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2240 domain_flush_cache(domain, first_pte, 2241 (void *)pte - (void *)first_pte); 2242 pte = NULL; 2243 } 2244 } 2245 2246 return 0; 2247 } 2248 2249 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2250 { 2251 struct intel_iommu *iommu = info->iommu; 2252 struct context_entry *context; 2253 u16 did_old; 2254 2255 if (!iommu) 2256 return; 2257 2258 spin_lock(&iommu->lock); 2259 context = iommu_context_addr(iommu, bus, devfn, 0); 2260 if (!context) { 2261 spin_unlock(&iommu->lock); 2262 return; 2263 } 2264 2265 if (sm_supported(iommu)) { 2266 if (hw_pass_through && domain_type_is_si(info->domain)) 2267 did_old = FLPT_DEFAULT_DID; 2268 else 2269 did_old = domain_id_iommu(info->domain, iommu); 2270 } else { 2271 did_old = context_domain_id(context); 2272 } 2273 2274 context_clear_entry(context); 2275 __iommu_flush_cache(iommu, context, sizeof(*context)); 2276 spin_unlock(&iommu->lock); 2277 iommu->flush.flush_context(iommu, 2278 did_old, 2279 (((u16)bus) << 8) | devfn, 2280 DMA_CCMD_MASK_NOBIT, 2281 DMA_CCMD_DEVICE_INVL); 2282 2283 if (sm_supported(iommu)) 2284 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2285 2286 iommu->flush.flush_iotlb(iommu, 2287 did_old, 2288 0, 2289 0, 2290 DMA_TLB_DSI_FLUSH); 2291 2292 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2293 } 2294 2295 static int domain_setup_first_level(struct intel_iommu *iommu, 2296 struct dmar_domain *domain, 2297 struct device *dev, 2298 u32 pasid) 2299 { 2300 struct dma_pte *pgd = domain->pgd; 2301 int agaw, level; 2302 int flags = 0; 2303 2304 /* 2305 * Skip top levels of page tables for iommu which has 2306 * less agaw than default. Unnecessary for PT mode. 
2307 */ 2308 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2309 pgd = phys_to_virt(dma_pte_addr(pgd)); 2310 if (!dma_pte_present(pgd)) 2311 return -ENOMEM; 2312 } 2313 2314 level = agaw_to_level(agaw); 2315 if (level != 4 && level != 5) 2316 return -EINVAL; 2317 2318 if (level == 5) 2319 flags |= PASID_FLAG_FL5LP; 2320 2321 if (domain->force_snooping) 2322 flags |= PASID_FLAG_PAGE_SNOOP; 2323 2324 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2325 domain_id_iommu(domain, iommu), 2326 flags); 2327 } 2328 2329 static bool dev_is_real_dma_subdevice(struct device *dev) 2330 { 2331 return dev && dev_is_pci(dev) && 2332 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2333 } 2334 2335 static int iommu_domain_identity_map(struct dmar_domain *domain, 2336 unsigned long first_vpfn, 2337 unsigned long last_vpfn) 2338 { 2339 /* 2340 * RMRR range might have overlap with physical memory range, 2341 * clear it first 2342 */ 2343 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2344 2345 return __domain_mapping(domain, first_vpfn, 2346 first_vpfn, last_vpfn - first_vpfn + 1, 2347 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2348 } 2349 2350 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2351 2352 static int __init si_domain_init(int hw) 2353 { 2354 struct dmar_rmrr_unit *rmrr; 2355 struct device *dev; 2356 int i, nid, ret; 2357 2358 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2359 if (!si_domain) 2360 return -EFAULT; 2361 2362 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2363 domain_exit(si_domain); 2364 si_domain = NULL; 2365 return -EFAULT; 2366 } 2367 2368 if (hw) 2369 return 0; 2370 2371 for_each_online_node(nid) { 2372 unsigned long start_pfn, end_pfn; 2373 int i; 2374 2375 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2376 ret = iommu_domain_identity_map(si_domain, 2377 mm_to_dma_pfn(start_pfn), 2378 mm_to_dma_pfn(end_pfn)); 2379 if (ret) 2380 return ret; 2381 } 2382 } 2383 2384 /* 2385 * Identity map the RMRRs so that devices with RMRRs could also use 2386 * the si_domain. 2387 */ 2388 for_each_rmrr_units(rmrr) { 2389 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2390 i, dev) { 2391 unsigned long long start = rmrr->base_address; 2392 unsigned long long end = rmrr->end_address; 2393 2394 if (WARN_ON(end < start || 2395 end >> agaw_to_width(si_domain->agaw))) 2396 continue; 2397 2398 ret = iommu_domain_identity_map(si_domain, 2399 mm_to_dma_pfn(start >> PAGE_SHIFT), 2400 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2401 if (ret) 2402 return ret; 2403 } 2404 } 2405 2406 return 0; 2407 } 2408 2409 static int dmar_domain_attach_device(struct dmar_domain *domain, 2410 struct device *dev) 2411 { 2412 struct device_domain_info *info = dev_iommu_priv_get(dev); 2413 struct intel_iommu *iommu; 2414 unsigned long flags; 2415 u8 bus, devfn; 2416 int ret; 2417 2418 iommu = device_to_iommu(dev, &bus, &devfn); 2419 if (!iommu) 2420 return -ENODEV; 2421 2422 ret = domain_attach_iommu(domain, iommu); 2423 if (ret) 2424 return ret; 2425 info->domain = domain; 2426 spin_lock_irqsave(&domain->lock, flags); 2427 list_add(&info->link, &domain->devices); 2428 spin_unlock_irqrestore(&domain->lock, flags); 2429 2430 /* PASID table is mandatory for a PCI device in scalable mode. 
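 * The RID2PASID entry set up below is what translates ordinary DMA
 * without a PASID; pass-through, first-level and second-level domains
 * each use their own PASID entry setup helper.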
*/ 2431 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2432 /* Setup the PASID entry for requests without PASID: */ 2433 if (hw_pass_through && domain_type_is_si(domain)) 2434 ret = intel_pasid_setup_pass_through(iommu, domain, 2435 dev, PASID_RID2PASID); 2436 else if (domain->use_first_level) 2437 ret = domain_setup_first_level(iommu, domain, dev, 2438 PASID_RID2PASID); 2439 else 2440 ret = intel_pasid_setup_second_level(iommu, domain, 2441 dev, PASID_RID2PASID); 2442 if (ret) { 2443 dev_err(dev, "Setup RID2PASID failed\n"); 2444 device_block_translation(dev); 2445 return ret; 2446 } 2447 } 2448 2449 ret = domain_context_mapping(domain, dev); 2450 if (ret) { 2451 dev_err(dev, "Domain context map failed\n"); 2452 device_block_translation(dev); 2453 return ret; 2454 } 2455 2456 iommu_enable_pci_caps(info); 2457 2458 return 0; 2459 } 2460 2461 static bool device_has_rmrr(struct device *dev) 2462 { 2463 struct dmar_rmrr_unit *rmrr; 2464 struct device *tmp; 2465 int i; 2466 2467 rcu_read_lock(); 2468 for_each_rmrr_units(rmrr) { 2469 /* 2470 * Return TRUE if this RMRR contains the device that 2471 * is passed in. 2472 */ 2473 for_each_active_dev_scope(rmrr->devices, 2474 rmrr->devices_cnt, i, tmp) 2475 if (tmp == dev || 2476 is_downstream_to_pci_bridge(dev, tmp)) { 2477 rcu_read_unlock(); 2478 return true; 2479 } 2480 } 2481 rcu_read_unlock(); 2482 return false; 2483 } 2484 2485 /** 2486 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2487 * is relaxable (ie. is allowed to be not enforced under some conditions) 2488 * @dev: device handle 2489 * 2490 * We assume that PCI USB devices with RMRRs have them largely 2491 * for historical reasons and that the RMRR space is not actively used post 2492 * boot. This exclusion may change if vendors begin to abuse it. 2493 * 2494 * The same exception is made for graphics devices, with the requirement that 2495 * any use of the RMRR regions will be torn down before assigning the device 2496 * to a guest. 2497 * 2498 * Return: true if the RMRR is relaxable, false otherwise 2499 */ 2500 static bool device_rmrr_is_relaxable(struct device *dev) 2501 { 2502 struct pci_dev *pdev; 2503 2504 if (!dev_is_pci(dev)) 2505 return false; 2506 2507 pdev = to_pci_dev(dev); 2508 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2509 return true; 2510 else 2511 return false; 2512 } 2513 2514 /* 2515 * There are a couple cases where we need to restrict the functionality of 2516 * devices associated with RMRRs. The first is when evaluating a device for 2517 * identity mapping because problems exist when devices are moved in and out 2518 * of domains and their respective RMRR information is lost. This means that 2519 * a device with associated RMRRs will never be in a "passthrough" domain. 2520 * The second is use of the device through the IOMMU API. This interface 2521 * expects to have full control of the IOVA space for the device. We cannot 2522 * satisfy both the requirement that RMRR access is maintained and have an 2523 * unencumbered IOVA space. We also have no ability to quiesce the device's 2524 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2525 * We therefore prevent devices associated with an RMRR from participating in 2526 * the IOMMU API, which eliminates them from device assignment. 2527 * 2528 * In both cases, devices which have relaxable RMRRs are not concerned by this 2529 * restriction. See device_rmrr_is_relaxable comment. 
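 * E.g. a USB controller or GPU with an RMRR is considered relaxable and
 * may still be assigned, while any other PCI device with an RMRR is
 * refused attachment to an unmanaged domain.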
2530 */ 2531 static bool device_is_rmrr_locked(struct device *dev) 2532 { 2533 if (!device_has_rmrr(dev)) 2534 return false; 2535 2536 if (device_rmrr_is_relaxable(dev)) 2537 return false; 2538 2539 return true; 2540 } 2541 2542 /* 2543 * Return the required default domain type for a specific device. 2544 * 2545 * @dev: the device in query 2546 * @startup: true if this is during early boot 2547 * 2548 * Returns: 2549 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2550 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2551 * - 0: both identity and dynamic domains work for this device 2552 */ 2553 static int device_def_domain_type(struct device *dev) 2554 { 2555 if (dev_is_pci(dev)) { 2556 struct pci_dev *pdev = to_pci_dev(dev); 2557 2558 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2559 return IOMMU_DOMAIN_IDENTITY; 2560 2561 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2562 return IOMMU_DOMAIN_IDENTITY; 2563 } 2564 2565 return 0; 2566 } 2567 2568 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2569 { 2570 /* 2571 * Start from the sane iommu hardware state. 2572 * If the queued invalidation is already initialized by us 2573 * (for example, while enabling interrupt-remapping) then 2574 * we got the things already rolling from a sane state. 2575 */ 2576 if (!iommu->qi) { 2577 /* 2578 * Clear any previous faults. 2579 */ 2580 dmar_fault(-1, iommu); 2581 /* 2582 * Disable queued invalidation if supported and already enabled 2583 * before OS handover. 2584 */ 2585 dmar_disable_qi(iommu); 2586 } 2587 2588 if (dmar_enable_qi(iommu)) { 2589 /* 2590 * Queued Invalidate not enabled, use Register Based Invalidate 2591 */ 2592 iommu->flush.flush_context = __iommu_flush_context; 2593 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2594 pr_info("%s: Using Register based invalidation\n", 2595 iommu->name); 2596 } else { 2597 iommu->flush.flush_context = qi_flush_context; 2598 iommu->flush.flush_iotlb = qi_flush_iotlb; 2599 pr_info("%s: Using Queued invalidation\n", iommu->name); 2600 } 2601 } 2602 2603 static int copy_context_table(struct intel_iommu *iommu, 2604 struct root_entry *old_re, 2605 struct context_entry **tbl, 2606 int bus, bool ext) 2607 { 2608 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2609 struct context_entry *new_ce = NULL, ce; 2610 struct context_entry *old_ce = NULL; 2611 struct root_entry re; 2612 phys_addr_t old_ce_phys; 2613 2614 tbl_idx = ext ? bus * 2 : bus; 2615 memcpy(&re, old_re, sizeof(re)); 2616 2617 for (devfn = 0; devfn < 256; devfn++) { 2618 /* First calculate the correct index */ 2619 idx = (ext ? 
devfn * 2 : devfn) % 256; 2620 2621 if (idx == 0) { 2622 /* First save what we may have and clean up */ 2623 if (new_ce) { 2624 tbl[tbl_idx] = new_ce; 2625 __iommu_flush_cache(iommu, new_ce, 2626 VTD_PAGE_SIZE); 2627 pos = 1; 2628 } 2629 2630 if (old_ce) 2631 memunmap(old_ce); 2632 2633 ret = 0; 2634 if (devfn < 0x80) 2635 old_ce_phys = root_entry_lctp(&re); 2636 else 2637 old_ce_phys = root_entry_uctp(&re); 2638 2639 if (!old_ce_phys) { 2640 if (ext && devfn == 0) { 2641 /* No LCTP, try UCTP */ 2642 devfn = 0x7f; 2643 continue; 2644 } else { 2645 goto out; 2646 } 2647 } 2648 2649 ret = -ENOMEM; 2650 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2651 MEMREMAP_WB); 2652 if (!old_ce) 2653 goto out; 2654 2655 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2656 if (!new_ce) 2657 goto out_unmap; 2658 2659 ret = 0; 2660 } 2661 2662 /* Now copy the context entry */ 2663 memcpy(&ce, old_ce + idx, sizeof(ce)); 2664 2665 if (!context_present(&ce)) 2666 continue; 2667 2668 did = context_domain_id(&ce); 2669 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2670 set_bit(did, iommu->domain_ids); 2671 2672 set_context_copied(iommu, bus, devfn); 2673 new_ce[idx] = ce; 2674 } 2675 2676 tbl[tbl_idx + pos] = new_ce; 2677 2678 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2679 2680 out_unmap: 2681 memunmap(old_ce); 2682 2683 out: 2684 return ret; 2685 } 2686 2687 static int copy_translation_tables(struct intel_iommu *iommu) 2688 { 2689 struct context_entry **ctxt_tbls; 2690 struct root_entry *old_rt; 2691 phys_addr_t old_rt_phys; 2692 int ctxt_table_entries; 2693 u64 rtaddr_reg; 2694 int bus, ret; 2695 bool new_ext, ext; 2696 2697 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2698 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2699 new_ext = !!sm_supported(iommu); 2700 2701 /* 2702 * The RTT bit can only be changed when translation is disabled, 2703 * but disabling translation means to open a window for data 2704 * corruption. So bail out and don't copy anything if we would 2705 * have to change the bit. 2706 */ 2707 if (new_ext != ext) 2708 return -EINVAL; 2709 2710 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2711 if (!iommu->copied_tables) 2712 return -ENOMEM; 2713 2714 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2715 if (!old_rt_phys) 2716 return -EINVAL; 2717 2718 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2719 if (!old_rt) 2720 return -ENOMEM; 2721 2722 /* This is too big for the stack - allocate it from slab */ 2723 ctxt_table_entries = ext ? 512 : 256; 2724 ret = -ENOMEM; 2725 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2726 if (!ctxt_tbls) 2727 goto out_unmap; 2728 2729 for (bus = 0; bus < 256; bus++) { 2730 ret = copy_context_table(iommu, &old_rt[bus], 2731 ctxt_tbls, bus, ext); 2732 if (ret) { 2733 pr_err("%s: Failed to copy context table for bus %d\n", 2734 iommu->name, bus); 2735 continue; 2736 } 2737 } 2738 2739 spin_lock(&iommu->lock); 2740 2741 /* Context tables are copied, now write them to the root_entry table */ 2742 for (bus = 0; bus < 256; bus++) { 2743 int idx = ext ? 
bus * 2 : bus; 2744 u64 val; 2745 2746 if (ctxt_tbls[idx]) { 2747 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2748 iommu->root_entry[bus].lo = val; 2749 } 2750 2751 if (!ext || !ctxt_tbls[idx + 1]) 2752 continue; 2753 2754 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2755 iommu->root_entry[bus].hi = val; 2756 } 2757 2758 spin_unlock(&iommu->lock); 2759 2760 kfree(ctxt_tbls); 2761 2762 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2763 2764 ret = 0; 2765 2766 out_unmap: 2767 memunmap(old_rt); 2768 2769 return ret; 2770 } 2771 2772 static int __init init_dmars(void) 2773 { 2774 struct dmar_drhd_unit *drhd; 2775 struct intel_iommu *iommu; 2776 int ret; 2777 2778 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2779 if (ret) 2780 goto free_iommu; 2781 2782 for_each_iommu(iommu, drhd) { 2783 if (drhd->ignored) { 2784 iommu_disable_translation(iommu); 2785 continue; 2786 } 2787 2788 /* 2789 * Find the max pasid size of all IOMMU's in the system. 2790 * We need to ensure the system pasid table is no bigger 2791 * than the smallest supported. 2792 */ 2793 if (pasid_supported(iommu)) { 2794 u32 temp = 2 << ecap_pss(iommu->ecap); 2795 2796 intel_pasid_max_id = min_t(u32, temp, 2797 intel_pasid_max_id); 2798 } 2799 2800 intel_iommu_init_qi(iommu); 2801 2802 ret = iommu_init_domains(iommu); 2803 if (ret) 2804 goto free_iommu; 2805 2806 init_translation_status(iommu); 2807 2808 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2809 iommu_disable_translation(iommu); 2810 clear_translation_pre_enabled(iommu); 2811 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2812 iommu->name); 2813 } 2814 2815 /* 2816 * TBD: 2817 * we could share the same root & context tables 2818 * among all IOMMU's. Need to Split it later. 2819 */ 2820 ret = iommu_alloc_root_entry(iommu); 2821 if (ret) 2822 goto free_iommu; 2823 2824 if (translation_pre_enabled(iommu)) { 2825 pr_info("Translation already enabled - trying to copy translation structures\n"); 2826 2827 ret = copy_translation_tables(iommu); 2828 if (ret) { 2829 /* 2830 * We found the IOMMU with translation 2831 * enabled - but failed to copy over the 2832 * old root-entry table. Try to proceed 2833 * by disabling translation now and 2834 * allocating a clean root-entry table. 2835 * This might cause DMAR faults, but 2836 * probably the dump will still succeed. 2837 */ 2838 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2839 iommu->name); 2840 iommu_disable_translation(iommu); 2841 clear_translation_pre_enabled(iommu); 2842 } else { 2843 pr_info("Copied translation tables from previous kernel for %s\n", 2844 iommu->name); 2845 } 2846 } 2847 2848 if (!ecap_pass_through(iommu->ecap)) 2849 hw_pass_through = 0; 2850 intel_svm_check(iommu); 2851 } 2852 2853 /* 2854 * Now that qi is enabled on all iommus, set the root entry and flush 2855 * caches. This is required on some Intel X58 chipsets, otherwise the 2856 * flush_context function will loop forever and the boot hangs. 
2857 */ 2858 for_each_active_iommu(iommu, drhd) { 2859 iommu_flush_write_buffer(iommu); 2860 iommu_set_root_entry(iommu); 2861 } 2862 2863 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2864 dmar_map_gfx = 0; 2865 #endif 2866 2867 if (!dmar_map_gfx) 2868 iommu_identity_mapping |= IDENTMAP_GFX; 2869 2870 check_tylersburg_isoch(); 2871 2872 ret = si_domain_init(hw_pass_through); 2873 if (ret) 2874 goto free_iommu; 2875 2876 /* 2877 * for each drhd 2878 * enable fault log 2879 * global invalidate context cache 2880 * global invalidate iotlb 2881 * enable translation 2882 */ 2883 for_each_iommu(iommu, drhd) { 2884 if (drhd->ignored) { 2885 /* 2886 * we always have to disable PMRs or DMA may fail on 2887 * this device 2888 */ 2889 if (force_on) 2890 iommu_disable_protect_mem_regions(iommu); 2891 continue; 2892 } 2893 2894 iommu_flush_write_buffer(iommu); 2895 2896 #ifdef CONFIG_INTEL_IOMMU_SVM 2897 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2898 /* 2899 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2900 * could cause possible lock race condition. 2901 */ 2902 up_write(&dmar_global_lock); 2903 ret = intel_svm_enable_prq(iommu); 2904 down_write(&dmar_global_lock); 2905 if (ret) 2906 goto free_iommu; 2907 } 2908 #endif 2909 ret = dmar_set_interrupt(iommu); 2910 if (ret) 2911 goto free_iommu; 2912 } 2913 2914 return 0; 2915 2916 free_iommu: 2917 for_each_active_iommu(iommu, drhd) { 2918 disable_dmar_iommu(iommu); 2919 free_dmar_iommu(iommu); 2920 } 2921 if (si_domain) { 2922 domain_exit(si_domain); 2923 si_domain = NULL; 2924 } 2925 2926 return ret; 2927 } 2928 2929 static void __init init_no_remapping_devices(void) 2930 { 2931 struct dmar_drhd_unit *drhd; 2932 struct device *dev; 2933 int i; 2934 2935 for_each_drhd_unit(drhd) { 2936 if (!drhd->include_all) { 2937 for_each_active_dev_scope(drhd->devices, 2938 drhd->devices_cnt, i, dev) 2939 break; 2940 /* ignore DMAR unit if no devices exist */ 2941 if (i == drhd->devices_cnt) 2942 drhd->ignored = 1; 2943 } 2944 } 2945 2946 for_each_active_drhd_unit(drhd) { 2947 if (drhd->include_all) 2948 continue; 2949 2950 for_each_active_dev_scope(drhd->devices, 2951 drhd->devices_cnt, i, dev) 2952 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2953 break; 2954 if (i < drhd->devices_cnt) 2955 continue; 2956 2957 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2958 set the gfx_mapped flag, as appropriate */ 2959 drhd->gfx_dedicated = 1; 2960 if (!dmar_map_gfx) 2961 drhd->ignored = 1; 2962 } 2963 } 2964 2965 #ifdef CONFIG_SUSPEND 2966 static int init_iommu_hw(void) 2967 { 2968 struct dmar_drhd_unit *drhd; 2969 struct intel_iommu *iommu = NULL; 2970 2971 for_each_active_iommu(iommu, drhd) 2972 if (iommu->qi) 2973 dmar_reenable_qi(iommu); 2974 2975 for_each_iommu(iommu, drhd) { 2976 if (drhd->ignored) { 2977 /* 2978 * we always have to disable PMRs or DMA may fail on 2979 * this device 2980 */ 2981 if (force_on) 2982 iommu_disable_protect_mem_regions(iommu); 2983 continue; 2984 } 2985 2986 iommu_flush_write_buffer(iommu); 2987 iommu_set_root_entry(iommu); 2988 iommu_enable_translation(iommu); 2989 iommu_disable_protect_mem_regions(iommu); 2990 } 2991 2992 return 0; 2993 } 2994 2995 static void iommu_flush_all(void) 2996 { 2997 struct dmar_drhd_unit *drhd; 2998 struct intel_iommu *iommu; 2999 3000 for_each_active_iommu(iommu, drhd) { 3001 iommu->flush.flush_context(iommu, 0, 0, 0, 3002 DMA_CCMD_GLOBAL_INVL); 3003 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3004 DMA_TLB_GLOBAL_FLUSH); 3005 } 3006 } 3007 3008 static int iommu_suspend(void) 3009 { 3010 struct dmar_drhd_unit *drhd; 3011 struct intel_iommu *iommu = NULL; 3012 unsigned long flag; 3013 3014 for_each_active_iommu(iommu, drhd) { 3015 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3016 GFP_KERNEL); 3017 if (!iommu->iommu_state) 3018 goto nomem; 3019 } 3020 3021 iommu_flush_all(); 3022 3023 for_each_active_iommu(iommu, drhd) { 3024 iommu_disable_translation(iommu); 3025 3026 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3027 3028 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3029 readl(iommu->reg + DMAR_FECTL_REG); 3030 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3031 readl(iommu->reg + DMAR_FEDATA_REG); 3032 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3033 readl(iommu->reg + DMAR_FEADDR_REG); 3034 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3035 readl(iommu->reg + DMAR_FEUADDR_REG); 3036 3037 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3038 } 3039 return 0; 3040 3041 nomem: 3042 for_each_active_iommu(iommu, drhd) 3043 kfree(iommu->iommu_state); 3044 3045 return -ENOMEM; 3046 } 3047 3048 static void iommu_resume(void) 3049 { 3050 struct dmar_drhd_unit *drhd; 3051 struct intel_iommu *iommu = NULL; 3052 unsigned long flag; 3053 3054 if (init_iommu_hw()) { 3055 if (force_on) 3056 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3057 else 3058 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3059 return; 3060 } 3061 3062 for_each_active_iommu(iommu, drhd) { 3063 3064 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3065 3066 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3067 iommu->reg + DMAR_FECTL_REG); 3068 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3069 iommu->reg + DMAR_FEDATA_REG); 3070 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3071 iommu->reg + DMAR_FEADDR_REG); 3072 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3073 iommu->reg + DMAR_FEUADDR_REG); 3074 3075 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3076 } 3077 3078 for_each_active_iommu(iommu, drhd) 3079 kfree(iommu->iommu_state); 3080 } 3081 3082 static struct syscore_ops iommu_syscore_ops = { 3083 .resume = iommu_resume, 3084 .suspend = iommu_suspend, 3085 }; 3086 3087 static void __init init_iommu_pm_ops(void) 3088 { 3089 register_syscore_ops(&iommu_syscore_ops); 3090 } 3091 3092 #else 3093 static inline void init_iommu_pm_ops(void) {} 3094 #endif /* 
CONFIG_PM */ 3095 3096 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3097 { 3098 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3099 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3100 rmrr->end_address <= rmrr->base_address || 3101 arch_rmrr_sanity_check(rmrr)) 3102 return -EINVAL; 3103 3104 return 0; 3105 } 3106 3107 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3108 { 3109 struct acpi_dmar_reserved_memory *rmrr; 3110 struct dmar_rmrr_unit *rmrru; 3111 3112 rmrr = (struct acpi_dmar_reserved_memory *)header; 3113 if (rmrr_sanity_check(rmrr)) { 3114 pr_warn(FW_BUG 3115 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3116 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3117 rmrr->base_address, rmrr->end_address, 3118 dmi_get_system_info(DMI_BIOS_VENDOR), 3119 dmi_get_system_info(DMI_BIOS_VERSION), 3120 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3121 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3122 } 3123 3124 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3125 if (!rmrru) 3126 goto out; 3127 3128 rmrru->hdr = header; 3129 3130 rmrru->base_address = rmrr->base_address; 3131 rmrru->end_address = rmrr->end_address; 3132 3133 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3134 ((void *)rmrr) + rmrr->header.length, 3135 &rmrru->devices_cnt); 3136 if (rmrru->devices_cnt && rmrru->devices == NULL) 3137 goto free_rmrru; 3138 3139 list_add(&rmrru->list, &dmar_rmrr_units); 3140 3141 return 0; 3142 free_rmrru: 3143 kfree(rmrru); 3144 out: 3145 return -ENOMEM; 3146 } 3147 3148 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3149 { 3150 struct dmar_atsr_unit *atsru; 3151 struct acpi_dmar_atsr *tmp; 3152 3153 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3154 dmar_rcu_check()) { 3155 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3156 if (atsr->segment != tmp->segment) 3157 continue; 3158 if (atsr->header.length != tmp->header.length) 3159 continue; 3160 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3161 return atsru; 3162 } 3163 3164 return NULL; 3165 } 3166 3167 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3168 { 3169 struct acpi_dmar_atsr *atsr; 3170 struct dmar_atsr_unit *atsru; 3171 3172 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3173 return 0; 3174 3175 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3176 atsru = dmar_find_atsr(atsr); 3177 if (atsru) 3178 return 0; 3179 3180 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3181 if (!atsru) 3182 return -ENOMEM; 3183 3184 /* 3185 * If memory is allocated from slab by ACPI _DSM method, we need to 3186 * copy the memory content because the memory buffer will be freed 3187 * on return. 
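 * The copy is placed directly behind the dmar_atsr_unit allocation, so
 * atsru->hdr remains valid for the lifetime of the unit.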
3188 */ 3189 atsru->hdr = (void *)(atsru + 1); 3190 memcpy(atsru->hdr, hdr, hdr->length); 3191 atsru->include_all = atsr->flags & 0x1; 3192 if (!atsru->include_all) { 3193 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3194 (void *)atsr + atsr->header.length, 3195 &atsru->devices_cnt); 3196 if (atsru->devices_cnt && atsru->devices == NULL) { 3197 kfree(atsru); 3198 return -ENOMEM; 3199 } 3200 } 3201 3202 list_add_rcu(&atsru->list, &dmar_atsr_units); 3203 3204 return 0; 3205 } 3206 3207 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3208 { 3209 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3210 kfree(atsru); 3211 } 3212 3213 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3214 { 3215 struct acpi_dmar_atsr *atsr; 3216 struct dmar_atsr_unit *atsru; 3217 3218 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3219 atsru = dmar_find_atsr(atsr); 3220 if (atsru) { 3221 list_del_rcu(&atsru->list); 3222 synchronize_rcu(); 3223 intel_iommu_free_atsr(atsru); 3224 } 3225 3226 return 0; 3227 } 3228 3229 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3230 { 3231 int i; 3232 struct device *dev; 3233 struct acpi_dmar_atsr *atsr; 3234 struct dmar_atsr_unit *atsru; 3235 3236 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3237 atsru = dmar_find_atsr(atsr); 3238 if (!atsru) 3239 return 0; 3240 3241 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3242 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3243 i, dev) 3244 return -EBUSY; 3245 } 3246 3247 return 0; 3248 } 3249 3250 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3251 { 3252 struct dmar_satc_unit *satcu; 3253 struct acpi_dmar_satc *tmp; 3254 3255 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3256 dmar_rcu_check()) { 3257 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3258 if (satc->segment != tmp->segment) 3259 continue; 3260 if (satc->header.length != tmp->header.length) 3261 continue; 3262 if (memcmp(satc, tmp, satc->header.length) == 0) 3263 return satcu; 3264 } 3265 3266 return NULL; 3267 } 3268 3269 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3270 { 3271 struct acpi_dmar_satc *satc; 3272 struct dmar_satc_unit *satcu; 3273 3274 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3275 return 0; 3276 3277 satc = container_of(hdr, struct acpi_dmar_satc, header); 3278 satcu = dmar_find_satc(satc); 3279 if (satcu) 3280 return 0; 3281 3282 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3283 if (!satcu) 3284 return -ENOMEM; 3285 3286 satcu->hdr = (void *)(satcu + 1); 3287 memcpy(satcu->hdr, hdr, hdr->length); 3288 satcu->atc_required = satc->flags & 0x1; 3289 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3290 (void *)satc + satc->header.length, 3291 &satcu->devices_cnt); 3292 if (satcu->devices_cnt && !satcu->devices) { 3293 kfree(satcu); 3294 return -ENOMEM; 3295 } 3296 list_add_rcu(&satcu->list, &dmar_satc_units); 3297 3298 return 0; 3299 } 3300 3301 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3302 { 3303 int sp, ret; 3304 struct intel_iommu *iommu = dmaru->iommu; 3305 3306 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3307 if (ret) 3308 goto out; 3309 3310 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3311 pr_warn("%s: Doesn't support hardware pass through.\n", 3312 iommu->name); 3313 return -ENXIO; 3314 } 3315 3316 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3317 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3318 pr_warn("%s: Doesn't support large page.\n", 3319 iommu->name); 3320 return -ENXIO; 3321 } 3322 3323 /* 3324 * Disable translation if already enabled prior to OS handover. 3325 */ 3326 if (iommu->gcmd & DMA_GCMD_TE) 3327 iommu_disable_translation(iommu); 3328 3329 ret = iommu_init_domains(iommu); 3330 if (ret == 0) 3331 ret = iommu_alloc_root_entry(iommu); 3332 if (ret) 3333 goto out; 3334 3335 intel_svm_check(iommu); 3336 3337 if (dmaru->ignored) { 3338 /* 3339 * we always have to disable PMRs or DMA may fail on this device 3340 */ 3341 if (force_on) 3342 iommu_disable_protect_mem_regions(iommu); 3343 return 0; 3344 } 3345 3346 intel_iommu_init_qi(iommu); 3347 iommu_flush_write_buffer(iommu); 3348 3349 #ifdef CONFIG_INTEL_IOMMU_SVM 3350 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3351 ret = intel_svm_enable_prq(iommu); 3352 if (ret) 3353 goto disable_iommu; 3354 } 3355 #endif 3356 ret = dmar_set_interrupt(iommu); 3357 if (ret) 3358 goto disable_iommu; 3359 3360 iommu_set_root_entry(iommu); 3361 iommu_enable_translation(iommu); 3362 3363 iommu_disable_protect_mem_regions(iommu); 3364 return 0; 3365 3366 disable_iommu: 3367 disable_dmar_iommu(iommu); 3368 out: 3369 free_dmar_iommu(iommu); 3370 return ret; 3371 } 3372 3373 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3374 { 3375 int ret = 0; 3376 struct intel_iommu *iommu = dmaru->iommu; 3377 3378 if (!intel_iommu_enabled) 3379 return 0; 3380 if (iommu == NULL) 3381 return -EINVAL; 3382 3383 if (insert) { 3384 ret = intel_iommu_add(dmaru); 3385 } else { 3386 disable_dmar_iommu(iommu); 3387 free_dmar_iommu(iommu); 3388 } 3389 3390 return ret; 3391 } 3392 3393 static void intel_iommu_free_dmars(void) 3394 { 3395 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3396 struct dmar_atsr_unit *atsru, *atsr_n; 3397 struct dmar_satc_unit *satcu, *satc_n; 3398 3399 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3400 list_del(&rmrru->list); 3401 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3402 kfree(rmrru); 3403 } 3404 3405 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3406 list_del(&atsru->list); 3407 intel_iommu_free_atsr(atsru); 3408 } 3409 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3410 list_del(&satcu->list); 3411 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3412 kfree(satcu); 3413 } 3414 } 3415 3416 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3417 { 3418 struct dmar_satc_unit *satcu; 3419 struct acpi_dmar_satc *satc; 3420 struct device *tmp; 3421 int i; 3422 3423 dev = pci_physfn(dev); 3424 rcu_read_lock(); 3425 3426 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3427 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3428 if (satc->segment != pci_domain_nr(dev->bus)) 3429 continue; 3430 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3431 if (to_pci_dev(tmp) == dev) 3432 goto out; 3433 } 3434 satcu = NULL; 3435 out: 3436 rcu_read_unlock(); 3437 return satcu; 3438 } 3439 3440 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3441 { 3442 int i, ret = 1; 3443 struct pci_bus *bus; 3444 struct pci_dev *bridge = NULL; 3445 struct device *tmp; 3446 struct acpi_dmar_atsr *atsr; 3447 struct dmar_atsr_unit *atsru; 3448 struct dmar_satc_unit *satcu; 3449 3450 dev = pci_physfn(dev); 3451 satcu = dmar_find_matched_satc_unit(dev); 3452 if (satcu) 3453 /* 3454 * This device supports ATS as it is in 
SATC table. 3455 * When IOMMU is in legacy mode, enabling ATS is done 3456 * automatically by HW for the device that requires 3457 * ATS, hence OS should not enable this device ATS 3458 * to avoid duplicated TLB invalidation. 3459 */ 3460 return !(satcu->atc_required && !sm_supported(iommu)); 3461 3462 for (bus = dev->bus; bus; bus = bus->parent) { 3463 bridge = bus->self; 3464 /* If it's an integrated device, allow ATS */ 3465 if (!bridge) 3466 return 1; 3467 /* Connected via non-PCIe: no ATS */ 3468 if (!pci_is_pcie(bridge) || 3469 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3470 return 0; 3471 /* If we found the root port, look it up in the ATSR */ 3472 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3473 break; 3474 } 3475 3476 rcu_read_lock(); 3477 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3478 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3479 if (atsr->segment != pci_domain_nr(dev->bus)) 3480 continue; 3481 3482 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3483 if (tmp == &bridge->dev) 3484 goto out; 3485 3486 if (atsru->include_all) 3487 goto out; 3488 } 3489 ret = 0; 3490 out: 3491 rcu_read_unlock(); 3492 3493 return ret; 3494 } 3495 3496 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3497 { 3498 int ret; 3499 struct dmar_rmrr_unit *rmrru; 3500 struct dmar_atsr_unit *atsru; 3501 struct dmar_satc_unit *satcu; 3502 struct acpi_dmar_atsr *atsr; 3503 struct acpi_dmar_reserved_memory *rmrr; 3504 struct acpi_dmar_satc *satc; 3505 3506 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3507 return 0; 3508 3509 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3510 rmrr = container_of(rmrru->hdr, 3511 struct acpi_dmar_reserved_memory, header); 3512 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3513 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3514 ((void *)rmrr) + rmrr->header.length, 3515 rmrr->segment, rmrru->devices, 3516 rmrru->devices_cnt); 3517 if (ret < 0) 3518 return ret; 3519 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3520 dmar_remove_dev_scope(info, rmrr->segment, 3521 rmrru->devices, rmrru->devices_cnt); 3522 } 3523 } 3524 3525 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3526 if (atsru->include_all) 3527 continue; 3528 3529 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3530 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3531 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3532 (void *)atsr + atsr->header.length, 3533 atsr->segment, atsru->devices, 3534 atsru->devices_cnt); 3535 if (ret > 0) 3536 break; 3537 else if (ret < 0) 3538 return ret; 3539 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3540 if (dmar_remove_dev_scope(info, atsr->segment, 3541 atsru->devices, atsru->devices_cnt)) 3542 break; 3543 } 3544 } 3545 list_for_each_entry(satcu, &dmar_satc_units, list) { 3546 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3547 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3548 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3549 (void *)satc + satc->header.length, 3550 satc->segment, satcu->devices, 3551 satcu->devices_cnt); 3552 if (ret > 0) 3553 break; 3554 else if (ret < 0) 3555 return ret; 3556 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3557 if (dmar_remove_dev_scope(info, satc->segment, 3558 satcu->devices, satcu->devices_cnt)) 3559 break; 3560 } 3561 } 3562 3563 return 0; 3564 } 3565 3566 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3567 unsigned long val, void *v) 3568 { 
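	/*
	 * Hotplug notifications are given in MM page frames; convert the
	 * range to VT-d page frames before updating the identity domain.
	 */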
3569 struct memory_notify *mhp = v; 3570 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3571 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3572 mhp->nr_pages - 1); 3573 3574 switch (val) { 3575 case MEM_GOING_ONLINE: 3576 if (iommu_domain_identity_map(si_domain, 3577 start_vpfn, last_vpfn)) { 3578 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3579 start_vpfn, last_vpfn); 3580 return NOTIFY_BAD; 3581 } 3582 break; 3583 3584 case MEM_OFFLINE: 3585 case MEM_CANCEL_ONLINE: 3586 { 3587 struct dmar_drhd_unit *drhd; 3588 struct intel_iommu *iommu; 3589 LIST_HEAD(freelist); 3590 3591 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3592 3593 rcu_read_lock(); 3594 for_each_active_iommu(iommu, drhd) 3595 iommu_flush_iotlb_psi(iommu, si_domain, 3596 start_vpfn, mhp->nr_pages, 3597 list_empty(&freelist), 0); 3598 rcu_read_unlock(); 3599 put_pages_list(&freelist); 3600 } 3601 break; 3602 } 3603 3604 return NOTIFY_OK; 3605 } 3606 3607 static struct notifier_block intel_iommu_memory_nb = { 3608 .notifier_call = intel_iommu_memory_notifier, 3609 .priority = 0 3610 }; 3611 3612 static void intel_disable_iommus(void) 3613 { 3614 struct intel_iommu *iommu = NULL; 3615 struct dmar_drhd_unit *drhd; 3616 3617 for_each_iommu(iommu, drhd) 3618 iommu_disable_translation(iommu); 3619 } 3620 3621 void intel_iommu_shutdown(void) 3622 { 3623 struct dmar_drhd_unit *drhd; 3624 struct intel_iommu *iommu = NULL; 3625 3626 if (no_iommu || dmar_disabled) 3627 return; 3628 3629 down_write(&dmar_global_lock); 3630 3631 /* Disable PMRs explicitly here. */ 3632 for_each_iommu(iommu, drhd) 3633 iommu_disable_protect_mem_regions(iommu); 3634 3635 /* Make sure the IOMMUs are switched off */ 3636 intel_disable_iommus(); 3637 3638 up_write(&dmar_global_lock); 3639 } 3640 3641 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3642 { 3643 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3644 3645 return container_of(iommu_dev, struct intel_iommu, iommu); 3646 } 3647 3648 static ssize_t version_show(struct device *dev, 3649 struct device_attribute *attr, char *buf) 3650 { 3651 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3652 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3653 return sysfs_emit(buf, "%d:%d\n", 3654 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3655 } 3656 static DEVICE_ATTR_RO(version); 3657 3658 static ssize_t address_show(struct device *dev, 3659 struct device_attribute *attr, char *buf) 3660 { 3661 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3662 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3663 } 3664 static DEVICE_ATTR_RO(address); 3665 3666 static ssize_t cap_show(struct device *dev, 3667 struct device_attribute *attr, char *buf) 3668 { 3669 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3670 return sysfs_emit(buf, "%llx\n", iommu->cap); 3671 } 3672 static DEVICE_ATTR_RO(cap); 3673 3674 static ssize_t ecap_show(struct device *dev, 3675 struct device_attribute *attr, char *buf) 3676 { 3677 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3678 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3679 } 3680 static DEVICE_ATTR_RO(ecap); 3681 3682 static ssize_t domains_supported_show(struct device *dev, 3683 struct device_attribute *attr, char *buf) 3684 { 3685 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3686 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3687 } 3688 static DEVICE_ATTR_RO(domains_supported); 3689 3690 static ssize_t domains_used_show(struct device *dev, 3691 struct device_attribute 
*attr, char *buf) 3692 { 3693 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3694 return sysfs_emit(buf, "%d\n", 3695 bitmap_weight(iommu->domain_ids, 3696 cap_ndoms(iommu->cap))); 3697 } 3698 static DEVICE_ATTR_RO(domains_used); 3699 3700 static struct attribute *intel_iommu_attrs[] = { 3701 &dev_attr_version.attr, 3702 &dev_attr_address.attr, 3703 &dev_attr_cap.attr, 3704 &dev_attr_ecap.attr, 3705 &dev_attr_domains_supported.attr, 3706 &dev_attr_domains_used.attr, 3707 NULL, 3708 }; 3709 3710 static struct attribute_group intel_iommu_group = { 3711 .name = "intel-iommu", 3712 .attrs = intel_iommu_attrs, 3713 }; 3714 3715 const struct attribute_group *intel_iommu_groups[] = { 3716 &intel_iommu_group, 3717 NULL, 3718 }; 3719 3720 static inline bool has_external_pci(void) 3721 { 3722 struct pci_dev *pdev = NULL; 3723 3724 for_each_pci_dev(pdev) 3725 if (pdev->external_facing) { 3726 pci_dev_put(pdev); 3727 return true; 3728 } 3729 3730 return false; 3731 } 3732 3733 static int __init platform_optin_force_iommu(void) 3734 { 3735 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3736 return 0; 3737 3738 if (no_iommu || dmar_disabled) 3739 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3740 3741 /* 3742 * If Intel-IOMMU is disabled by default, we will apply identity 3743 * map for all devices except those marked as being untrusted. 3744 */ 3745 if (dmar_disabled) 3746 iommu_set_default_passthrough(false); 3747 3748 dmar_disabled = 0; 3749 no_iommu = 0; 3750 3751 return 1; 3752 } 3753 3754 static int __init probe_acpi_namespace_devices(void) 3755 { 3756 struct dmar_drhd_unit *drhd; 3757 /* To avoid a -Wunused-but-set-variable warning. */ 3758 struct intel_iommu *iommu __maybe_unused; 3759 struct device *dev; 3760 int i, ret = 0; 3761 3762 for_each_active_iommu(iommu, drhd) { 3763 for_each_active_dev_scope(drhd->devices, 3764 drhd->devices_cnt, i, dev) { 3765 struct acpi_device_physical_node *pn; 3766 struct iommu_group *group; 3767 struct acpi_device *adev; 3768 3769 if (dev->bus != &acpi_bus_type) 3770 continue; 3771 3772 adev = to_acpi_device(dev); 3773 mutex_lock(&adev->physical_node_lock); 3774 list_for_each_entry(pn, 3775 &adev->physical_node_list, node) { 3776 group = iommu_group_get(pn->dev); 3777 if (group) { 3778 iommu_group_put(group); 3779 continue; 3780 } 3781 3782 ret = iommu_probe_device(pn->dev); 3783 if (ret) 3784 break; 3785 } 3786 mutex_unlock(&adev->physical_node_lock); 3787 3788 if (ret) 3789 return ret; 3790 } 3791 } 3792 3793 return 0; 3794 } 3795 3796 static __init int tboot_force_iommu(void) 3797 { 3798 if (!tboot_enabled()) 3799 return 0; 3800 3801 if (no_iommu || dmar_disabled) 3802 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3803 3804 dmar_disabled = 0; 3805 no_iommu = 0; 3806 3807 return 1; 3808 } 3809 3810 int __init intel_iommu_init(void) 3811 { 3812 int ret = -ENODEV; 3813 struct dmar_drhd_unit *drhd; 3814 struct intel_iommu *iommu; 3815 3816 /* 3817 * Intel IOMMU is required for a TXT/tboot launch or platform 3818 * opt in, so enforce that. 
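 * When either condition holds, force_on is set and any failure to bring
 * up DMAR below becomes a panic instead of an ordinary error return.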
3819 */ 3820 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3821 platform_optin_force_iommu(); 3822 3823 down_write(&dmar_global_lock); 3824 if (dmar_table_init()) { 3825 if (force_on) 3826 panic("tboot: Failed to initialize DMAR table\n"); 3827 goto out_free_dmar; 3828 } 3829 3830 if (dmar_dev_scope_init() < 0) { 3831 if (force_on) 3832 panic("tboot: Failed to initialize DMAR device scope\n"); 3833 goto out_free_dmar; 3834 } 3835 3836 up_write(&dmar_global_lock); 3837 3838 /* 3839 * The bus notifier takes the dmar_global_lock, so lockdep will 3840 * complain later when we register it under the lock. 3841 */ 3842 dmar_register_bus_notifier(); 3843 3844 down_write(&dmar_global_lock); 3845 3846 if (!no_iommu) 3847 intel_iommu_debugfs_init(); 3848 3849 if (no_iommu || dmar_disabled) { 3850 /* 3851 * We exit the function here to ensure IOMMU's remapping and 3852 * mempool aren't setup, which means that the IOMMU's PMRs 3853 * won't be disabled via the call to init_dmars(). So disable 3854 * it explicitly here. The PMRs were setup by tboot prior to 3855 * calling SENTER, but the kernel is expected to reset/tear 3856 * down the PMRs. 3857 */ 3858 if (intel_iommu_tboot_noforce) { 3859 for_each_iommu(iommu, drhd) 3860 iommu_disable_protect_mem_regions(iommu); 3861 } 3862 3863 /* 3864 * Make sure the IOMMUs are switched off, even when we 3865 * boot into a kexec kernel and the previous kernel left 3866 * them enabled 3867 */ 3868 intel_disable_iommus(); 3869 goto out_free_dmar; 3870 } 3871 3872 if (list_empty(&dmar_rmrr_units)) 3873 pr_info("No RMRR found\n"); 3874 3875 if (list_empty(&dmar_atsr_units)) 3876 pr_info("No ATSR found\n"); 3877 3878 if (list_empty(&dmar_satc_units)) 3879 pr_info("No SATC found\n"); 3880 3881 init_no_remapping_devices(); 3882 3883 ret = init_dmars(); 3884 if (ret) { 3885 if (force_on) 3886 panic("tboot: Failed to initialize DMARs\n"); 3887 pr_err("Initialization failed\n"); 3888 goto out_free_dmar; 3889 } 3890 up_write(&dmar_global_lock); 3891 3892 init_iommu_pm_ops(); 3893 3894 down_read(&dmar_global_lock); 3895 for_each_active_iommu(iommu, drhd) { 3896 /* 3897 * The flush queue implementation does not perform 3898 * page-selective invalidations that are required for efficient 3899 * TLB flushes in virtual environments. The benefit of batching 3900 * is likely to be much lower than the overhead of synchronizing 3901 * the virtual and physical IOMMU page-tables. 3902 */ 3903 if (cap_caching_mode(iommu->cap) && 3904 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3905 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3906 iommu_set_dma_strict(); 3907 } 3908 iommu_device_sysfs_add(&iommu->iommu, NULL, 3909 intel_iommu_groups, 3910 "%s", iommu->name); 3911 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3912 3913 iommu_pmu_register(iommu); 3914 } 3915 up_read(&dmar_global_lock); 3916 3917 if (si_domain && !hw_pass_through) 3918 register_memory_notifier(&intel_iommu_memory_nb); 3919 3920 down_read(&dmar_global_lock); 3921 if (probe_acpi_namespace_devices()) 3922 pr_warn("ACPI name space devices didn't probe correctly\n"); 3923 3924 /* Finally, we enable the DMA remapping hardware. 
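 * Translation is switched on only for units that are not ignored and
 * were not already enabled by a previous kernel; protected memory
 * regions are disabled on every unit either way.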
*/ 3925 for_each_iommu(iommu, drhd) { 3926 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3927 iommu_enable_translation(iommu); 3928 3929 iommu_disable_protect_mem_regions(iommu); 3930 } 3931 up_read(&dmar_global_lock); 3932 3933 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3934 3935 intel_iommu_enabled = 1; 3936 3937 return 0; 3938 3939 out_free_dmar: 3940 intel_iommu_free_dmars(); 3941 up_write(&dmar_global_lock); 3942 return ret; 3943 } 3944 3945 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3946 { 3947 struct device_domain_info *info = opaque; 3948 3949 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3950 return 0; 3951 } 3952 3953 /* 3954 * NB - intel-iommu lacks any sort of reference counting for the users of 3955 * dependent devices. If multiple endpoints have intersecting dependent 3956 * devices, unbinding the driver from any one of them will possibly leave 3957 * the others unable to operate. 3958 */ 3959 static void domain_context_clear(struct device_domain_info *info) 3960 { 3961 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 3962 return; 3963 3964 pci_for_each_dma_alias(to_pci_dev(info->dev), 3965 &domain_context_clear_one_cb, info); 3966 } 3967 3968 static void dmar_remove_one_dev_info(struct device *dev) 3969 { 3970 struct device_domain_info *info = dev_iommu_priv_get(dev); 3971 struct dmar_domain *domain = info->domain; 3972 struct intel_iommu *iommu = info->iommu; 3973 unsigned long flags; 3974 3975 if (!dev_is_real_dma_subdevice(info->dev)) { 3976 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3977 intel_pasid_tear_down_entry(iommu, info->dev, 3978 PASID_RID2PASID, false); 3979 3980 iommu_disable_pci_caps(info); 3981 domain_context_clear(info); 3982 } 3983 3984 spin_lock_irqsave(&domain->lock, flags); 3985 list_del(&info->link); 3986 spin_unlock_irqrestore(&domain->lock, flags); 3987 3988 domain_detach_iommu(domain, iommu); 3989 info->domain = NULL; 3990 } 3991 3992 /* 3993 * Clear the page table pointer in context or pasid table entries so that 3994 * all DMA requests without PASID from the device are blocked. If the page 3995 * table has been set, clean up the data structures. 
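 * In scalable mode this tears down the RID2PASID entry; in legacy mode
 * the context entry itself is cleared.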
3996 */ 3997 static void device_block_translation(struct device *dev) 3998 { 3999 struct device_domain_info *info = dev_iommu_priv_get(dev); 4000 struct intel_iommu *iommu = info->iommu; 4001 unsigned long flags; 4002 4003 iommu_disable_pci_caps(info); 4004 if (!dev_is_real_dma_subdevice(dev)) { 4005 if (sm_supported(iommu)) 4006 intel_pasid_tear_down_entry(iommu, dev, 4007 PASID_RID2PASID, false); 4008 else 4009 domain_context_clear(info); 4010 } 4011 4012 if (!info->domain) 4013 return; 4014 4015 spin_lock_irqsave(&info->domain->lock, flags); 4016 list_del(&info->link); 4017 spin_unlock_irqrestore(&info->domain->lock, flags); 4018 4019 domain_detach_iommu(info->domain, iommu); 4020 info->domain = NULL; 4021 } 4022 4023 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4024 { 4025 int adjust_width; 4026 4027 /* calculate AGAW */ 4028 domain->gaw = guest_width; 4029 adjust_width = guestwidth_to_adjustwidth(guest_width); 4030 domain->agaw = width_to_agaw(adjust_width); 4031 4032 domain->iommu_coherency = false; 4033 domain->iommu_superpage = 0; 4034 domain->max_addr = 0; 4035 4036 /* always allocate the top pgd */ 4037 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 4038 if (!domain->pgd) 4039 return -ENOMEM; 4040 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4041 return 0; 4042 } 4043 4044 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4045 struct device *dev) 4046 { 4047 device_block_translation(dev); 4048 return 0; 4049 } 4050 4051 static struct iommu_domain blocking_domain = { 4052 .ops = &(const struct iommu_domain_ops) { 4053 .attach_dev = blocking_domain_attach_dev, 4054 .free = intel_iommu_domain_free 4055 } 4056 }; 4057 4058 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4059 { 4060 struct dmar_domain *dmar_domain; 4061 struct iommu_domain *domain; 4062 4063 switch (type) { 4064 case IOMMU_DOMAIN_BLOCKED: 4065 return &blocking_domain; 4066 case IOMMU_DOMAIN_DMA: 4067 case IOMMU_DOMAIN_DMA_FQ: 4068 case IOMMU_DOMAIN_UNMANAGED: 4069 dmar_domain = alloc_domain(type); 4070 if (!dmar_domain) { 4071 pr_err("Can't allocate dmar_domain\n"); 4072 return NULL; 4073 } 4074 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4075 pr_err("Domain initialization failed\n"); 4076 domain_exit(dmar_domain); 4077 return NULL; 4078 } 4079 4080 domain = &dmar_domain->domain; 4081 domain->geometry.aperture_start = 0; 4082 domain->geometry.aperture_end = 4083 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4084 domain->geometry.force_aperture = true; 4085 4086 return domain; 4087 case IOMMU_DOMAIN_IDENTITY: 4088 return &si_domain->domain; 4089 case IOMMU_DOMAIN_SVA: 4090 return intel_svm_domain_alloc(); 4091 default: 4092 return NULL; 4093 } 4094 4095 return NULL; 4096 } 4097 4098 static void intel_iommu_domain_free(struct iommu_domain *domain) 4099 { 4100 if (domain != &si_domain->domain && domain != &blocking_domain) 4101 domain_exit(to_dmar_domain(domain)); 4102 } 4103 4104 static int prepare_domain_attach_device(struct iommu_domain *domain, 4105 struct device *dev) 4106 { 4107 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4108 struct intel_iommu *iommu; 4109 int addr_width; 4110 4111 iommu = device_to_iommu(dev, NULL, NULL); 4112 if (!iommu) 4113 return -ENODEV; 4114 4115 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4116 return -EINVAL; 4117 4118 /* check if this iommu agaw is sufficient for max mapped address */ 4119 addr_width = agaw_to_width(iommu->agaw); 4120 if (addr_width > 
cap_mgaw(iommu->cap)) 4121 addr_width = cap_mgaw(iommu->cap); 4122 4123 if (dmar_domain->max_addr > (1LL << addr_width)) 4124 return -EINVAL; 4125 dmar_domain->gaw = addr_width; 4126 4127 /* 4128 * Knock out extra levels of page tables if necessary 4129 */ 4130 while (iommu->agaw < dmar_domain->agaw) { 4131 struct dma_pte *pte; 4132 4133 pte = dmar_domain->pgd; 4134 if (dma_pte_present(pte)) { 4135 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4136 free_pgtable_page(pte); 4137 } 4138 dmar_domain->agaw--; 4139 } 4140 4141 return 0; 4142 } 4143 4144 static int intel_iommu_attach_device(struct iommu_domain *domain, 4145 struct device *dev) 4146 { 4147 struct device_domain_info *info = dev_iommu_priv_get(dev); 4148 int ret; 4149 4150 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4151 device_is_rmrr_locked(dev)) { 4152 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4153 return -EPERM; 4154 } 4155 4156 if (info->domain) 4157 device_block_translation(dev); 4158 4159 ret = prepare_domain_attach_device(domain, dev); 4160 if (ret) 4161 return ret; 4162 4163 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4164 } 4165 4166 static int intel_iommu_map(struct iommu_domain *domain, 4167 unsigned long iova, phys_addr_t hpa, 4168 size_t size, int iommu_prot, gfp_t gfp) 4169 { 4170 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4171 u64 max_addr; 4172 int prot = 0; 4173 4174 if (iommu_prot & IOMMU_READ) 4175 prot |= DMA_PTE_READ; 4176 if (iommu_prot & IOMMU_WRITE) 4177 prot |= DMA_PTE_WRITE; 4178 if (dmar_domain->set_pte_snp) 4179 prot |= DMA_PTE_SNP; 4180 4181 max_addr = iova + size; 4182 if (dmar_domain->max_addr < max_addr) { 4183 u64 end; 4184 4185 /* check if minimum agaw is sufficient for mapped address */ 4186 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4187 if (end < max_addr) { 4188 pr_err("%s: iommu width (%d) is not " 4189 "sufficient for the mapped address (%llx)\n", 4190 __func__, dmar_domain->gaw, max_addr); 4191 return -EFAULT; 4192 } 4193 dmar_domain->max_addr = max_addr; 4194 } 4195 /* Round up size to next multiple of PAGE_SIZE, if it and 4196 the low bits of hpa would take us onto the next page */ 4197 size = aligned_nrpages(hpa, size); 4198 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4199 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4200 } 4201 4202 static int intel_iommu_map_pages(struct iommu_domain *domain, 4203 unsigned long iova, phys_addr_t paddr, 4204 size_t pgsize, size_t pgcount, 4205 int prot, gfp_t gfp, size_t *mapped) 4206 { 4207 unsigned long pgshift = __ffs(pgsize); 4208 size_t size = pgcount << pgshift; 4209 int ret; 4210 4211 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4212 return -EINVAL; 4213 4214 if (!IS_ALIGNED(iova | paddr, pgsize)) 4215 return -EINVAL; 4216 4217 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4218 if (!ret && mapped) 4219 *mapped = size; 4220 4221 return ret; 4222 } 4223 4224 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4225 unsigned long iova, size_t size, 4226 struct iommu_iotlb_gather *gather) 4227 { 4228 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4229 unsigned long start_pfn, last_pfn; 4230 int level = 0; 4231 4232 /* Cope with horrid API which requires us to unmap more than the 4233 size argument if it happens to be a large-page mapping. 
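We look up the PTE covering the requested IOVA to learn its level, then round the size up to the full extent of that (possibly large) page before unmapping.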
*/ 4234 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4235 &level, GFP_ATOMIC))) 4236 return 0; 4237 4238 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4239 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4240 4241 start_pfn = iova >> VTD_PAGE_SHIFT; 4242 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4243 4244 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4245 4246 if (dmar_domain->max_addr == iova + size) 4247 dmar_domain->max_addr = iova; 4248 4249 /* 4250 * We do not use page-selective IOTLB invalidation in flush queue, 4251 * so there is no need to track page and sync iotlb. 4252 */ 4253 if (!iommu_iotlb_gather_queued(gather)) 4254 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4255 4256 return size; 4257 } 4258 4259 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4260 unsigned long iova, 4261 size_t pgsize, size_t pgcount, 4262 struct iommu_iotlb_gather *gather) 4263 { 4264 unsigned long pgshift = __ffs(pgsize); 4265 size_t size = pgcount << pgshift; 4266 4267 return intel_iommu_unmap(domain, iova, size, gather); 4268 } 4269 4270 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4271 struct iommu_iotlb_gather *gather) 4272 { 4273 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4274 unsigned long iova_pfn = IOVA_PFN(gather->start); 4275 size_t size = gather->end - gather->start; 4276 struct iommu_domain_info *info; 4277 unsigned long start_pfn; 4278 unsigned long nrpages; 4279 unsigned long i; 4280 4281 nrpages = aligned_nrpages(gather->start, size); 4282 start_pfn = mm_to_dma_pfn(iova_pfn); 4283 4284 xa_for_each(&dmar_domain->iommu_array, i, info) 4285 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4286 start_pfn, nrpages, 4287 list_empty(&gather->freelist), 0); 4288 4289 put_pages_list(&gather->freelist); 4290 } 4291 4292 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4293 dma_addr_t iova) 4294 { 4295 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4296 struct dma_pte *pte; 4297 int level = 0; 4298 u64 phys = 0; 4299 4300 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4301 GFP_ATOMIC); 4302 if (pte && dma_pte_present(pte)) 4303 phys = dma_pte_addr(pte) + 4304 (iova & (BIT_MASK(level_to_offset_bits(level) + 4305 VTD_PAGE_SHIFT) - 1)); 4306 4307 return phys; 4308 } 4309 4310 static bool domain_support_force_snooping(struct dmar_domain *domain) 4311 { 4312 struct device_domain_info *info; 4313 bool support = true; 4314 4315 assert_spin_locked(&domain->lock); 4316 list_for_each_entry(info, &domain->devices, link) { 4317 if (!ecap_sc_support(info->iommu->ecap)) { 4318 support = false; 4319 break; 4320 } 4321 } 4322 4323 return support; 4324 } 4325 4326 static void domain_set_force_snooping(struct dmar_domain *domain) 4327 { 4328 struct device_domain_info *info; 4329 4330 assert_spin_locked(&domain->lock); 4331 /* 4332 * Second level page table supports per-PTE snoop control. The 4333 * iommu_map() interface will handle this by setting SNP bit. 
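 * First-level page tables have no such per-PTE bit, so for those domains snoop control is enforced through each device's PASID table entry instead (see intel_pasid_setup_page_snoop_control() below).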
4334 */ 4335 if (!domain->use_first_level) { 4336 domain->set_pte_snp = true; 4337 return; 4338 } 4339 4340 list_for_each_entry(info, &domain->devices, link) 4341 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4342 PASID_RID2PASID); 4343 } 4344 4345 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4346 { 4347 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4348 unsigned long flags; 4349 4350 if (dmar_domain->force_snooping) 4351 return true; 4352 4353 spin_lock_irqsave(&dmar_domain->lock, flags); 4354 if (!domain_support_force_snooping(dmar_domain)) { 4355 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4356 return false; 4357 } 4358 4359 domain_set_force_snooping(dmar_domain); 4360 dmar_domain->force_snooping = true; 4361 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4362 4363 return true; 4364 } 4365 4366 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4367 { 4368 struct device_domain_info *info = dev_iommu_priv_get(dev); 4369 4370 switch (cap) { 4371 case IOMMU_CAP_CACHE_COHERENCY: 4372 return true; 4373 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4374 return dmar_platform_optin(); 4375 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4376 return ecap_sc_support(info->iommu->ecap); 4377 default: 4378 return false; 4379 } 4380 } 4381 4382 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4383 { 4384 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4385 struct device_domain_info *info; 4386 struct intel_iommu *iommu; 4387 u8 bus, devfn; 4388 int ret; 4389 4390 iommu = device_to_iommu(dev, &bus, &devfn); 4391 if (!iommu || !iommu->iommu.ops) 4392 return ERR_PTR(-ENODEV); 4393 4394 info = kzalloc(sizeof(*info), GFP_KERNEL); 4395 if (!info) 4396 return ERR_PTR(-ENOMEM); 4397 4398 if (dev_is_real_dma_subdevice(dev)) { 4399 info->bus = pdev->bus->number; 4400 info->devfn = pdev->devfn; 4401 info->segment = pci_domain_nr(pdev->bus); 4402 } else { 4403 info->bus = bus; 4404 info->devfn = devfn; 4405 info->segment = iommu->segment; 4406 } 4407 4408 info->dev = dev; 4409 info->iommu = iommu; 4410 if (dev_is_pci(dev)) { 4411 if (ecap_dev_iotlb_support(iommu->ecap) && 4412 pci_ats_supported(pdev) && 4413 dmar_ats_supported(pdev, iommu)) { 4414 info->ats_supported = 1; 4415 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4416 4417 /* 4418 * For IOMMU that supports device IOTLB throttling 4419 * (DIT), we assign PFSID to the invalidation desc 4420 * of a VF such that IOMMU HW can gauge queue depth 4421 * at PF level. If DIT is not set, PFSID will be 4422 * treated as reserved, which should be set to 0. 
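 * pci_physfn() returns the owning PF for a VF (or the device itself otherwise), so its requester ID is what gets programmed as the PFSID below.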
4423 */ 4424 if (ecap_dit(iommu->ecap)) 4425 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4426 info->ats_qdep = pci_ats_queue_depth(pdev); 4427 } 4428 if (sm_supported(iommu)) { 4429 if (pasid_supported(iommu)) { 4430 int features = pci_pasid_features(pdev); 4431 4432 if (features >= 0) 4433 info->pasid_supported = features | 1; 4434 } 4435 4436 if (info->ats_supported && ecap_prs(iommu->ecap) && 4437 pci_pri_supported(pdev)) 4438 info->pri_supported = 1; 4439 } 4440 } 4441 4442 dev_iommu_priv_set(dev, info); 4443 4444 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4445 ret = intel_pasid_alloc_table(dev); 4446 if (ret) { 4447 dev_err(dev, "PASID table allocation failed\n"); 4448 dev_iommu_priv_set(dev, NULL); 4449 kfree(info); 4450 return ERR_PTR(ret); 4451 } 4452 } 4453 4454 return &iommu->iommu; 4455 } 4456 4457 static void intel_iommu_release_device(struct device *dev) 4458 { 4459 struct device_domain_info *info = dev_iommu_priv_get(dev); 4460 4461 dmar_remove_one_dev_info(dev); 4462 intel_pasid_free_table(dev); 4463 dev_iommu_priv_set(dev, NULL); 4464 kfree(info); 4465 set_dma_ops(dev, NULL); 4466 } 4467 4468 static void intel_iommu_probe_finalize(struct device *dev) 4469 { 4470 set_dma_ops(dev, NULL); 4471 iommu_setup_dma_ops(dev, 0, U64_MAX); 4472 } 4473 4474 static void intel_iommu_get_resv_regions(struct device *device, 4475 struct list_head *head) 4476 { 4477 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4478 struct iommu_resv_region *reg; 4479 struct dmar_rmrr_unit *rmrr; 4480 struct device *i_dev; 4481 int i; 4482 4483 rcu_read_lock(); 4484 for_each_rmrr_units(rmrr) { 4485 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4486 i, i_dev) { 4487 struct iommu_resv_region *resv; 4488 enum iommu_resv_type type; 4489 size_t length; 4490 4491 if (i_dev != device && 4492 !is_downstream_to_pci_bridge(device, i_dev)) 4493 continue; 4494 4495 length = rmrr->end_address - rmrr->base_address + 1; 4496 4497 type = device_rmrr_is_relaxable(device) ? 
4498 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4499 4500 resv = iommu_alloc_resv_region(rmrr->base_address, 4501 length, prot, type, 4502 GFP_ATOMIC); 4503 if (!resv) 4504 break; 4505 4506 list_add_tail(&resv->list, head); 4507 } 4508 } 4509 rcu_read_unlock(); 4510 4511 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4512 if (dev_is_pci(device)) { 4513 struct pci_dev *pdev = to_pci_dev(device); 4514 4515 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4516 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4517 IOMMU_RESV_DIRECT_RELAXABLE, 4518 GFP_KERNEL); 4519 if (reg) 4520 list_add_tail(&reg->list, head); 4521 } 4522 } 4523 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4524 4525 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4526 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4527 0, IOMMU_RESV_MSI, GFP_KERNEL); 4528 if (!reg) 4529 return; 4530 list_add_tail(&reg->list, head); 4531 } 4532 4533 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4534 { 4535 if (dev_is_pci(dev)) 4536 return pci_device_group(dev); 4537 return generic_device_group(dev); 4538 } 4539 4540 static int intel_iommu_enable_sva(struct device *dev) 4541 { 4542 struct device_domain_info *info = dev_iommu_priv_get(dev); 4543 struct intel_iommu *iommu; 4544 4545 if (!info || dmar_disabled) 4546 return -EINVAL; 4547 4548 iommu = info->iommu; 4549 if (!iommu) 4550 return -EINVAL; 4551 4552 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4553 return -ENODEV; 4554 4555 if (!info->pasid_enabled || !info->ats_enabled) 4556 return -EINVAL; 4557 4558 /* 4559 * Devices having device-specific I/O fault handling should not 4560 * support PCI/PRI. The IOMMU side has no means to check the 4561 * capability of device-specific IOPF. Therefore, the IOMMU can only 4562 * assume that if the device driver enables SVA on a non-PRI 4563 * device, it will handle IOPF in its own way. 4564 */ 4565 if (!info->pri_supported) 4566 return 0; 4567 4568 /* Devices supporting PRI should have it enabled. */ 4569 if (!info->pri_enabled) 4570 return -EINVAL; 4571 4572 return 0; 4573 } 4574 4575 static int intel_iommu_enable_iopf(struct device *dev) 4576 { 4577 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4578 struct device_domain_info *info = dev_iommu_priv_get(dev); 4579 struct intel_iommu *iommu; 4580 int ret; 4581 4582 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4583 return -ENODEV; 4584 4585 if (info->pri_enabled) 4586 return -EBUSY; 4587 4588 iommu = info->iommu; 4589 if (!iommu) 4590 return -EINVAL; 4591 4592 /* PASID is required in PRG Response Message.
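If the device has PASID enabled but does not advertise PRG Response PASID Required, IOPF cannot be enabled and -EINVAL is returned below.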
*/ 4593 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4594 return -EINVAL; 4595 4596 ret = pci_reset_pri(pdev); 4597 if (ret) 4598 return ret; 4599 4600 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4601 if (ret) 4602 return ret; 4603 4604 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4605 if (ret) 4606 goto iopf_remove_device; 4607 4608 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4609 if (ret) 4610 goto iopf_unregister_handler; 4611 info->pri_enabled = 1; 4612 4613 return 0; 4614 4615 iopf_unregister_handler: 4616 iommu_unregister_device_fault_handler(dev); 4617 iopf_remove_device: 4618 iopf_queue_remove_device(iommu->iopf_queue, dev); 4619 4620 return ret; 4621 } 4622 4623 static int intel_iommu_disable_iopf(struct device *dev) 4624 { 4625 struct device_domain_info *info = dev_iommu_priv_get(dev); 4626 struct intel_iommu *iommu = info->iommu; 4627 4628 if (!info->pri_enabled) 4629 return -EINVAL; 4630 4631 /* 4632 * PCIe spec states that by clearing PRI enable bit, the Page 4633 * Request Interface will not issue new page requests, but has 4634 * outstanding page requests that have been transmitted or are 4635 * queued for transmission. This is supposed to be called after 4636 * the device driver has stopped DMA, all PASIDs have been 4637 * unbound and the outstanding PRQs have been drained. 4638 */ 4639 pci_disable_pri(to_pci_dev(dev)); 4640 info->pri_enabled = 0; 4641 4642 /* 4643 * With PRI disabled and outstanding PRQs drained, unregistering 4644 * fault handler and removing device from iopf queue should never 4645 * fail. 4646 */ 4647 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4648 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4649 4650 return 0; 4651 } 4652 4653 static int 4654 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4655 { 4656 switch (feat) { 4657 case IOMMU_DEV_FEAT_IOPF: 4658 return intel_iommu_enable_iopf(dev); 4659 4660 case IOMMU_DEV_FEAT_SVA: 4661 return intel_iommu_enable_sva(dev); 4662 4663 default: 4664 return -ENODEV; 4665 } 4666 } 4667 4668 static int 4669 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4670 { 4671 switch (feat) { 4672 case IOMMU_DEV_FEAT_IOPF: 4673 return intel_iommu_disable_iopf(dev); 4674 4675 case IOMMU_DEV_FEAT_SVA: 4676 return 0; 4677 4678 default: 4679 return -ENODEV; 4680 } 4681 } 4682 4683 static bool intel_iommu_is_attach_deferred(struct device *dev) 4684 { 4685 struct device_domain_info *info = dev_iommu_priv_get(dev); 4686 4687 return translation_pre_enabled(info->iommu) && !info->domain; 4688 } 4689 4690 /* 4691 * Check that the device does not live on an external facing PCI port that is 4692 * marked as untrusted. Such devices should not be able to apply quirks and 4693 * thus not be able to bypass the IOMMU restrictions. 
4694 */ 4695 static bool risky_device(struct pci_dev *pdev) 4696 { 4697 if (pdev->untrusted) { 4698 pci_info(pdev, 4699 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4700 pdev->vendor, pdev->device); 4701 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4702 return true; 4703 } 4704 return false; 4705 } 4706 4707 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4708 unsigned long iova, size_t size) 4709 { 4710 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4711 unsigned long pages = aligned_nrpages(iova, size); 4712 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4713 struct iommu_domain_info *info; 4714 unsigned long i; 4715 4716 xa_for_each(&dmar_domain->iommu_array, i, info) 4717 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4718 } 4719 4720 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4721 { 4722 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); 4723 struct iommu_domain *domain; 4724 4725 /* Domain type specific cleanup: */ 4726 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4727 if (domain) { 4728 switch (domain->type) { 4729 case IOMMU_DOMAIN_SVA: 4730 intel_svm_remove_dev_pasid(dev, pasid); 4731 break; 4732 default: 4733 /* should never reach here */ 4734 WARN_ON(1); 4735 break; 4736 } 4737 } 4738 4739 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4740 } 4741 4742 const struct iommu_ops intel_iommu_ops = { 4743 .capable = intel_iommu_capable, 4744 .domain_alloc = intel_iommu_domain_alloc, 4745 .probe_device = intel_iommu_probe_device, 4746 .probe_finalize = intel_iommu_probe_finalize, 4747 .release_device = intel_iommu_release_device, 4748 .get_resv_regions = intel_iommu_get_resv_regions, 4749 .device_group = intel_iommu_device_group, 4750 .dev_enable_feat = intel_iommu_dev_enable_feat, 4751 .dev_disable_feat = intel_iommu_dev_disable_feat, 4752 .is_attach_deferred = intel_iommu_is_attach_deferred, 4753 .def_domain_type = device_def_domain_type, 4754 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4755 .pgsize_bitmap = SZ_4K, 4756 #ifdef CONFIG_INTEL_IOMMU_SVM 4757 .page_response = intel_svm_page_response, 4758 #endif 4759 .default_domain_ops = &(const struct iommu_domain_ops) { 4760 .attach_dev = intel_iommu_attach_device, 4761 .map_pages = intel_iommu_map_pages, 4762 .unmap_pages = intel_iommu_unmap_pages, 4763 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4764 .flush_iotlb_all = intel_flush_iotlb_all, 4765 .iotlb_sync = intel_iommu_tlb_sync, 4766 .iova_to_phys = intel_iommu_iova_to_phys, 4767 .free = intel_iommu_domain_free, 4768 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4769 } 4770 }; 4771 4772 static void quirk_iommu_igfx(struct pci_dev *dev) 4773 { 4774 if (risky_device(dev)) 4775 return; 4776 4777 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4778 dmar_map_gfx = 0; 4779 } 4780 4781 /* G4x/GM45 integrated gfx dmar support is totally busted. 
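The fixups below therefore clear dmar_map_gfx for the affected device IDs so the IOMMU is not used for their graphics devices.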
*/ 4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4789 4790 /* Broadwell igfx malfunctions with dmar */ 4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4815 4816 static void quirk_iommu_rwbf(struct pci_dev *dev) 4817 { 4818 if (risky_device(dev)) 4819 return; 4820 4821 /* 4822 * Mobile 4 Series Chipset neglects to set RWBF capability, 4823 * but needs it. Same seems to hold for the desktop versions. 
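 * Setting rwbf_quirk below makes the driver flush the write buffer even though the capability bit is not reported.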
4824 */ 4825 pci_info(dev, "Forcing write-buffer flush capability\n"); 4826 rwbf_quirk = 1; 4827 } 4828 4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4836 4837 #define GGC 0x52 4838 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4839 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4840 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4841 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4842 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4843 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4844 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4845 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4846 4847 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4848 { 4849 unsigned short ggc; 4850 4851 if (risky_device(dev)) 4852 return; 4853 4854 if (pci_read_config_word(dev, GGC, &ggc)) 4855 return; 4856 4857 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4858 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4859 dmar_map_gfx = 0; 4860 } else if (dmar_map_gfx) { 4861 /* we have to ensure the gfx device is idle before we flush */ 4862 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4863 iommu_set_dma_strict(); 4864 } 4865 } 4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4870 4871 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4872 { 4873 unsigned short ver; 4874 4875 if (!IS_GFX_DEVICE(dev)) 4876 return; 4877 4878 ver = (dev->device >> 8) & 0xff; 4879 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4880 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4881 ver != 0x9a && ver != 0xa7) 4882 return; 4883 4884 if (risky_device(dev)) 4885 return; 4886 4887 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4888 iommu_skip_te_disable = 1; 4889 } 4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4891 4892 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4893 ISOCH DMAR unit for the Azalia sound device, but not give it any 4894 TLB entries, which causes it to deadlock. Check for that. We do 4895 this in a function called from init_dmars(), instead of in a PCI 4896 quirk, because we don't want to print the obnoxious "BIOS broken" 4897 message if VT-d is actually disabled. 4898 */ 4899 static void __init check_tylersburg_isoch(void) 4900 { 4901 struct pci_dev *pdev; 4902 uint32_t vtisochctrl; 4903 4904 /* If there's no Azalia in the system anyway, forget it. */ 4905 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4906 if (!pdev) 4907 return; 4908 4909 if (risky_device(pdev)) { 4910 pci_dev_put(pdev); 4911 return; 4912 } 4913 4914 pci_dev_put(pdev); 4915 4916 /* System Management Registers. Might be hidden, in which case 4917 we can't do the sanity check. But that's OK, because the 4918 known-broken BIOSes _don't_ actually hide it, so far. 
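The dword read below from config offset 0x188 (vtisochctrl) tells us whether Azalia DMA is routed to the non-isoch DMAR unit and how many TLB entries the isoch unit was granted.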
*/ 4919 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4920 if (!pdev) 4921 return; 4922 4923 if (risky_device(pdev)) { 4924 pci_dev_put(pdev); 4925 return; 4926 } 4927 4928 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4929 pci_dev_put(pdev); 4930 return; 4931 } 4932 4933 pci_dev_put(pdev); 4934 4935 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 4936 if (vtisochctrl & 1) 4937 return; 4938 4939 /* Drop all bits other than the number of TLB entries */ 4940 vtisochctrl &= 0x1c; 4941 4942 /* If we have the recommended number of TLB entries (16), fine. */ 4943 if (vtisochctrl == 0x10) 4944 return; 4945 4946 /* Zero TLB entries? You get to ride the short bus to school. */ 4947 if (!vtisochctrl) { 4948 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 4949 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4950 dmi_get_system_info(DMI_BIOS_VENDOR), 4951 dmi_get_system_info(DMI_BIOS_VERSION), 4952 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4953 iommu_identity_mapping |= IDENTMAP_AZALIA; 4954 return; 4955 } 4956 4957 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 4958 vtisochctrl); 4959 } 4960 4961 /* 4962 * Here we deal with a device TLB defect where a device may inadvertently issue an ATS 4963 * invalidation completion before posted writes initiated with a translated address 4964 * that utilized translations matching the invalidation address range, violating 4965 * the invalidation completion ordering. 4966 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is 4967 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 4968 * under the control of the trusted/privileged host device driver must use this 4969 * quirk. 4970 * Device TLBs are invalidated under the following six conditions: 4971 * 1. Device driver does DMA API unmap IOVA 4972 * 2. Device driver unbinds a PASID from a process, sva_unbind_device() 4973 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 4974 * exit_mmap() due to crash 4975 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 4976 * VM has to free pages that were unmapped 4977 * 5. Userspace driver unmaps a DMA buffer 4978 * 6. Cache invalidation in vSVA usage (upcoming) 4979 * 4980 * For #1 and #2, device drivers are responsible for stopping DMA traffic 4981 * before unmap/unbind. For #3, the iommu driver gets the mmu_notifier to 4982 * invalidate the TLB the same way as a normal user unmap, which will use this quirk. 4983 * The dTLB invalidation after PASID cache flush does not need this quirk. 4984 * 4985 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 4986 */ 4987 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 4988 unsigned long address, unsigned long mask, 4989 u32 pasid, u16 qdep) 4990 { 4991 u16 sid; 4992 4993 if (likely(!info->dtlb_extra_inval)) 4994 return; 4995 4996 sid = PCI_DEVID(info->bus, info->devfn); 4997 if (pasid == PASID_RID2PASID) { 4998 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 4999 qdep, address, mask); 5000 } else { 5001 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5002 pasid, qdep, address, mask); 5003 } 5004 } 5005 5006 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5007 5008 /* 5009 * Function to submit a command to the enhanced command interface. The 5010 * valid enhanced command descriptions are defined in Table 47 of the 5011 * VT-d spec.
The VT-d hardware implementation may support some but not 5012 * all commands, which can be determined by checking the Enhanced 5013 * Command Capability Register. 5014 * 5015 * Return values: 5016 * - 0: Command successful without any error; 5017 * - Negative: software error value; 5018 * - Nonzero positive: failure status code defined in Table 48. 5019 */ 5020 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5021 { 5022 unsigned long flags; 5023 u64 res; 5024 int ret; 5025 5026 if (!cap_ecmds(iommu->cap)) 5027 return -ENODEV; 5028 5029 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5030 5031 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5032 if (res & DMA_ECMD_ECRSP_IP) { 5033 ret = -EBUSY; 5034 goto err; 5035 } 5036 5037 /* 5038 * Unconditionally write the operand B, because 5039 * - There is no side effect if an ecmd doesn't require an 5040 * operand B, but we set the register to some value. 5041 * - It's not invoked in any critical path. The extra MMIO 5042 * write doesn't bring any performance concerns. 5043 */ 5044 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5045 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5046 5047 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5048 !(res & DMA_ECMD_ECRSP_IP), res); 5049 5050 if (res & DMA_ECMD_ECRSP_IP) { 5051 ret = -ETIMEDOUT; 5052 goto err; 5053 } 5054 5055 ret = ecmd_get_status_code(res); 5056 err: 5057 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5058 5059 return ret; 5060 } 5061
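/*
 * Illustrative sketch (hypothetical, not part of the driver): a caller of
 * ecmd_submit_sync() distinguishes the three return classes documented above.
 * ECMD_EXAMPLE_OP stands in for an opcode from Table 47 that the hardware
 * advertises in the Enhanced Command Capability Register:
 *
 *	ret = ecmd_submit_sync(iommu, ECMD_EXAMPLE_OP, operand_a, 0);
 *	if (ret < 0)
 *		return ret;	// software error, e.g. -ENODEV, -EBUSY or -ETIMEDOUT
 *	else if (ret > 0)
 *		return -EIO;	// hardware failure status code (Table 48)
 *	// ret == 0: command completed successfully
 */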