// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/crash_dump.h>
#include <linux/dma-direct.h>
#include <linux/dmi.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <uapi/linux/iommufd.h>

#include "iommu.h"
#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-sva.h"
#include "pasid.h"
#include "cap_audit.h"
#include "perfmon.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work.
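   (mm_to_dma_pfn_start/_end() below rely on this: they convert MM pfns to
   VT-d pfns by shifting left by (PAGE_SHIFT - VTD_PAGE_SHIFT), which is only
   well defined when PAGE_SHIFT >= VTD_PAGE_SHIFT.)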
*/ 117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) 118 { 119 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 120 } 121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) 122 { 123 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; 124 } 125 static inline unsigned long page_to_dma_pfn(struct page *pg) 126 { 127 return mm_to_dma_pfn_start(page_to_pfn(pg)); 128 } 129 static inline unsigned long virt_to_dma_pfn(void *p) 130 { 131 return page_to_dma_pfn(virt_to_page(p)); 132 } 133 134 static void __init check_tylersburg_isoch(void); 135 static int rwbf_quirk; 136 137 /* 138 * set to 1 to panic kernel if can't successfully enable VT-d 139 * (used when kernel is launched w/ TXT) 140 */ 141 static int force_on = 0; 142 static int intel_iommu_tboot_noforce; 143 static int no_platform_optin; 144 145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 146 147 /* 148 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 149 * if marked present. 150 */ 151 static phys_addr_t root_entry_lctp(struct root_entry *re) 152 { 153 if (!(re->lo & 1)) 154 return 0; 155 156 return re->lo & VTD_PAGE_MASK; 157 } 158 159 /* 160 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 161 * if marked present. 162 */ 163 static phys_addr_t root_entry_uctp(struct root_entry *re) 164 { 165 if (!(re->hi & 1)) 166 return 0; 167 168 return re->hi & VTD_PAGE_MASK; 169 } 170 171 static inline void context_set_present(struct context_entry *context) 172 { 173 context->lo |= 1; 174 } 175 176 static inline void context_set_fault_enable(struct context_entry *context) 177 { 178 context->lo &= (((u64)-1) << 2) | 1; 179 } 180 181 static inline void context_set_translation_type(struct context_entry *context, 182 unsigned long value) 183 { 184 context->lo &= (((u64)-1) << 4) | 3; 185 context->lo |= (value & 3) << 2; 186 } 187 188 static inline void context_set_address_root(struct context_entry *context, 189 unsigned long value) 190 { 191 context->lo &= ~VTD_PAGE_MASK; 192 context->lo |= value & VTD_PAGE_MASK; 193 } 194 195 static inline void context_set_address_width(struct context_entry *context, 196 unsigned long value) 197 { 198 context->hi |= value & 7; 199 } 200 201 static inline void context_set_domain_id(struct context_entry *context, 202 unsigned long value) 203 { 204 context->hi |= (value & ((1 << 16) - 1)) << 8; 205 } 206 207 static inline void context_set_pasid(struct context_entry *context) 208 { 209 context->lo |= CONTEXT_PASIDE; 210 } 211 212 static inline int context_domain_id(struct context_entry *c) 213 { 214 return((c->hi >> 8) & 0xffff); 215 } 216 217 static inline void context_clear_entry(struct context_entry *context) 218 { 219 context->lo = 0; 220 context->hi = 0; 221 } 222 223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 224 { 225 if (!iommu->copied_tables) 226 return false; 227 228 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); 229 } 230 231 static inline void 232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 233 { 234 set_bit(((long)bus << 8) | devfn, iommu->copied_tables); 235 } 236 237 static inline void 238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) 239 { 240 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); 241 } 242 243 /* 244 * This domain is a statically identity mapping domain. 245 * 1. This domain creats a static 1:1 mapping to all usable memory. 246 * 2. It maps to each iommu if successful. 247 * 3. 
Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);

int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;

	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
			iommu_dma_forcedac = true;
		} else if (!strncmp(str, "strict", 6)) {
			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
			iommu_set_dma_strict();
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if hardware supports\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Scalable mode is 
disallowed\n"); 353 intel_iommu_sm = 0; 354 } else if (!strncmp(str, "tboot_noforce", 13)) { 355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 356 intel_iommu_tboot_noforce = 1; 357 } else { 358 pr_notice("Unknown option - '%s'\n", str); 359 } 360 361 str += strcspn(str, ","); 362 while (*str == ',') 363 str++; 364 } 365 366 return 1; 367 } 368 __setup("intel_iommu=", intel_iommu_setup); 369 370 void *alloc_pgtable_page(int node, gfp_t gfp) 371 { 372 struct page *page; 373 void *vaddr = NULL; 374 375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 376 if (page) 377 vaddr = page_address(page); 378 return vaddr; 379 } 380 381 void free_pgtable_page(void *vaddr) 382 { 383 free_page((unsigned long)vaddr); 384 } 385 386 static inline int domain_type_is_si(struct dmar_domain *domain) 387 { 388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 389 } 390 391 static inline int domain_pfn_supported(struct dmar_domain *domain, 392 unsigned long pfn) 393 { 394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 395 396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 397 } 398 399 /* 400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of 402 * the returned SAGAW. 403 */ 404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) 405 { 406 unsigned long fl_sagaw, sl_sagaw; 407 408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); 409 sl_sagaw = cap_sagaw(iommu->cap); 410 411 /* Second level only. */ 412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) 413 return sl_sagaw; 414 415 /* First level only. */ 416 if (!ecap_slts(iommu->ecap)) 417 return fl_sagaw; 418 419 return fl_sagaw & sl_sagaw; 420 } 421 422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 423 { 424 unsigned long sagaw; 425 int agaw; 426 427 sagaw = __iommu_calculate_sagaw(iommu); 428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { 429 if (test_bit(agaw, &sagaw)) 430 break; 431 } 432 433 return agaw; 434 } 435 436 /* 437 * Calculate max SAGAW for each iommu. 438 */ 439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 440 { 441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 442 } 443 444 /* 445 * calculate agaw for each iommu. 446 * "SAGAW" may be different across iommus, use a default agaw, and 447 * get a supported less agaw for iommus that don't support the default agaw. 448 */ 449 int iommu_calculate_agaw(struct intel_iommu *iommu) 450 { 451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 452 } 453 454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 455 { 456 return sm_supported(iommu) ? 
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 458 } 459 460 static void domain_update_iommu_coherency(struct dmar_domain *domain) 461 { 462 struct iommu_domain_info *info; 463 struct dmar_drhd_unit *drhd; 464 struct intel_iommu *iommu; 465 bool found = false; 466 unsigned long i; 467 468 domain->iommu_coherency = true; 469 xa_for_each(&domain->iommu_array, i, info) { 470 found = true; 471 if (!iommu_paging_structure_coherency(info->iommu)) { 472 domain->iommu_coherency = false; 473 break; 474 } 475 } 476 if (found) 477 return; 478 479 /* No hardware attached; use lowest common denominator */ 480 rcu_read_lock(); 481 for_each_active_iommu(iommu, drhd) { 482 if (!iommu_paging_structure_coherency(iommu)) { 483 domain->iommu_coherency = false; 484 break; 485 } 486 } 487 rcu_read_unlock(); 488 } 489 490 static int domain_update_iommu_superpage(struct dmar_domain *domain, 491 struct intel_iommu *skip) 492 { 493 struct dmar_drhd_unit *drhd; 494 struct intel_iommu *iommu; 495 int mask = 0x3; 496 497 if (!intel_iommu_superpage) 498 return 0; 499 500 /* set iommu_superpage to the smallest common denominator */ 501 rcu_read_lock(); 502 for_each_active_iommu(iommu, drhd) { 503 if (iommu != skip) { 504 if (domain && domain->use_first_level) { 505 if (!cap_fl1gp_support(iommu->cap)) 506 mask = 0x1; 507 } else { 508 mask &= cap_super_page_val(iommu->cap); 509 } 510 511 if (!mask) 512 break; 513 } 514 } 515 rcu_read_unlock(); 516 517 return fls(mask); 518 } 519 520 static int domain_update_device_node(struct dmar_domain *domain) 521 { 522 struct device_domain_info *info; 523 int nid = NUMA_NO_NODE; 524 unsigned long flags; 525 526 spin_lock_irqsave(&domain->lock, flags); 527 list_for_each_entry(info, &domain->devices, link) { 528 /* 529 * There could possibly be multiple device numa nodes as devices 530 * within the same domain may sit behind different IOMMUs. There 531 * isn't perfect answer in such situation, so we select first 532 * come first served policy. 533 */ 534 nid = dev_to_node(info->dev); 535 if (nid != NUMA_NO_NODE) 536 break; 537 } 538 spin_unlock_irqrestore(&domain->lock, flags); 539 540 return nid; 541 } 542 543 static void domain_update_iotlb(struct dmar_domain *domain); 544 545 /* Return the super pagesize bitmap if supported. */ 546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 547 { 548 unsigned long bitmap = 0; 549 550 /* 551 * 1-level super page supports page size of 2MiB, 2-level super page 552 * supports page size of both 2MiB and 1GiB. 553 */ 554 if (domain->iommu_superpage == 1) 555 bitmap |= SZ_2M; 556 else if (domain->iommu_superpage == 2) 557 bitmap |= SZ_2M | SZ_1G; 558 559 return bitmap; 560 } 561 562 /* Some capabilities may be different across iommus */ 563 static void domain_update_iommu_cap(struct dmar_domain *domain) 564 { 565 domain_update_iommu_coherency(domain); 566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 567 568 /* 569 * If RHSA is missing, we should default to the device numa domain 570 * as fall back. 571 */ 572 if (domain->nid == NUMA_NO_NODE) 573 domain->nid = domain_update_device_node(domain); 574 575 /* 576 * First-level translation restricts the input-address to a 577 * canonical address (i.e., address bits 63:N have the same 578 * value as address bit [N-1], where N is 48-bits with 4-level 579 * paging and 57-bits with 5-level paging). Hence, skip bit 580 * [N-1]. 
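	 * (e.g. with 4-level tables, gaw == 48, the first-level aperture set
	 * below ends at (1ULL << 47) - 1 rather than (1ULL << 48) - 1.)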
581 */ 582 if (domain->use_first_level) 583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 584 else 585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 586 587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 588 domain_update_iotlb(domain); 589 } 590 591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 592 u8 devfn, int alloc) 593 { 594 struct root_entry *root = &iommu->root_entry[bus]; 595 struct context_entry *context; 596 u64 *entry; 597 598 /* 599 * Except that the caller requested to allocate a new entry, 600 * returning a copied context entry makes no sense. 601 */ 602 if (!alloc && context_copied(iommu, bus, devfn)) 603 return NULL; 604 605 entry = &root->lo; 606 if (sm_supported(iommu)) { 607 if (devfn >= 0x80) { 608 devfn -= 0x80; 609 entry = &root->hi; 610 } 611 devfn *= 2; 612 } 613 if (*entry & 1) 614 context = phys_to_virt(*entry & VTD_PAGE_MASK); 615 else { 616 unsigned long phy_addr; 617 if (!alloc) 618 return NULL; 619 620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); 621 if (!context) 622 return NULL; 623 624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 625 phy_addr = virt_to_phys((void *)context); 626 *entry = phy_addr | 1; 627 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 628 } 629 return &context[devfn]; 630 } 631 632 /** 633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 634 * sub-hierarchy of a candidate PCI-PCI bridge 635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 636 * @bridge: the candidate PCI-PCI bridge 637 * 638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 639 */ 640 static bool 641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 642 { 643 struct pci_dev *pdev, *pbridge; 644 645 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 646 return false; 647 648 pdev = to_pci_dev(dev); 649 pbridge = to_pci_dev(bridge); 650 651 if (pbridge->subordinate && 652 pbridge->subordinate->number <= pdev->bus->number && 653 pbridge->subordinate->busn_res.end >= pdev->bus->number) 654 return true; 655 656 return false; 657 } 658 659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 660 { 661 struct dmar_drhd_unit *drhd; 662 u32 vtbar; 663 int rc; 664 665 /* We know that this device on this chipset has its own IOMMU. 666 * If we find it under a different IOMMU, then the BIOS is lying 667 * to us. Hope that the IOMMU for this device is actually 668 * disabled, and it needs no translation... 
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches.
*/ 743 if (pdev && pdev->is_virtfn) 744 goto got_pdev; 745 746 if (bus && devfn) { 747 *bus = drhd->devices[i].bus; 748 *devfn = drhd->devices[i].devfn; 749 } 750 goto out; 751 } 752 753 if (is_downstream_to_pci_bridge(dev, tmp)) 754 goto got_pdev; 755 } 756 757 if (pdev && drhd->include_all) { 758 got_pdev: 759 if (bus && devfn) { 760 *bus = pdev->bus->number; 761 *devfn = pdev->devfn; 762 } 763 goto out; 764 } 765 } 766 iommu = NULL; 767 out: 768 if (iommu_is_dummy(iommu, dev)) 769 iommu = NULL; 770 771 rcu_read_unlock(); 772 773 return iommu; 774 } 775 776 static void domain_flush_cache(struct dmar_domain *domain, 777 void *addr, int size) 778 { 779 if (!domain->iommu_coherency) 780 clflush_cache_range(addr, size); 781 } 782 783 static void free_context_table(struct intel_iommu *iommu) 784 { 785 struct context_entry *context; 786 int i; 787 788 if (!iommu->root_entry) 789 return; 790 791 for (i = 0; i < ROOT_ENTRY_NR; i++) { 792 context = iommu_context_addr(iommu, i, 0, 0); 793 if (context) 794 free_pgtable_page(context); 795 796 if (!sm_supported(iommu)) 797 continue; 798 799 context = iommu_context_addr(iommu, i, 0x80, 0); 800 if (context) 801 free_pgtable_page(context); 802 } 803 804 free_pgtable_page(iommu->root_entry); 805 iommu->root_entry = NULL; 806 } 807 808 #ifdef CONFIG_DMAR_DEBUG 809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, 810 u8 bus, u8 devfn, struct dma_pte *parent, int level) 811 { 812 struct dma_pte *pte; 813 int offset; 814 815 while (1) { 816 offset = pfn_level_offset(pfn, level); 817 pte = &parent[offset]; 818 819 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 820 821 if (!dma_pte_present(pte)) { 822 pr_info("page table not present at level %d\n", level - 1); 823 break; 824 } 825 826 if (level == 1 || dma_pte_superpage(pte)) 827 break; 828 829 parent = phys_to_virt(dma_pte_addr(pte)); 830 level--; 831 } 832 } 833 834 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 835 unsigned long long addr, u32 pasid) 836 { 837 struct pasid_dir_entry *dir, *pde; 838 struct pasid_entry *entries, *pte; 839 struct context_entry *ctx_entry; 840 struct root_entry *rt_entry; 841 int i, dir_index, index, level; 842 u8 devfn = source_id & 0xff; 843 u8 bus = source_id >> 8; 844 struct dma_pte *pgtable; 845 846 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 847 848 /* root entry dump */ 849 if (!iommu->root_entry) { 850 pr_info("root table is not present\n"); 851 return; 852 } 853 rt_entry = &iommu->root_entry[bus]; 854 855 if (sm_supported(iommu)) 856 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 857 rt_entry->hi, rt_entry->lo); 858 else 859 pr_info("root entry: 0x%016llx", rt_entry->lo); 860 861 /* context entry dump */ 862 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 863 if (!ctx_entry) { 864 pr_info("context table is not present\n"); 865 return; 866 } 867 868 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 869 ctx_entry->hi, ctx_entry->lo); 870 871 /* legacy mode does not require PASID entries */ 872 if (!sm_supported(iommu)) { 873 if (!context_present(ctx_entry)) { 874 pr_info("legacy mode page table is not present\n"); 875 return; 876 } 877 level = agaw_to_level(ctx_entry->hi & 7); 878 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 879 goto pgtable_walk; 880 } 881 882 if (!context_present(ctx_entry)) { 883 pr_info("pasid directory table is not present\n"); 884 return; 885 } 886 887 /* get the pointer to pasid directory entry */ 888 dir = 
phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 889 890 /* For request-without-pasid, get the pasid from context entry */ 891 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) 892 pasid = IOMMU_NO_PASID; 893 894 dir_index = pasid >> PASID_PDE_SHIFT; 895 pde = &dir[dir_index]; 896 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 897 898 /* get the pointer to the pasid table entry */ 899 entries = get_pasid_table_from_pde(pde); 900 if (!entries) { 901 pr_info("pasid table is not present\n"); 902 return; 903 } 904 index = pasid & PASID_PTE_MASK; 905 pte = &entries[index]; 906 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 907 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 908 909 if (!pasid_pte_is_present(pte)) { 910 pr_info("scalable mode page table is not present\n"); 911 return; 912 } 913 914 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { 915 level = pte->val[2] & BIT_ULL(2) ? 5 : 4; 916 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); 917 } else { 918 level = agaw_to_level((pte->val[0] >> 2) & 0x7); 919 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); 920 } 921 922 pgtable_walk: 923 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); 924 } 925 #endif 926 927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 928 unsigned long pfn, int *target_level, 929 gfp_t gfp) 930 { 931 struct dma_pte *parent, *pte; 932 int level = agaw_to_level(domain->agaw); 933 int offset; 934 935 if (!domain_pfn_supported(domain, pfn)) 936 /* Address beyond IOMMU's addressing capabilities. */ 937 return NULL; 938 939 parent = domain->pgd; 940 941 while (1) { 942 void *tmp_page; 943 944 offset = pfn_level_offset(pfn, level); 945 pte = &parent[offset]; 946 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 947 break; 948 if (level == *target_level) 949 break; 950 951 if (!dma_pte_present(pte)) { 952 uint64_t pteval; 953 954 tmp_page = alloc_pgtable_page(domain->nid, gfp); 955 956 if (!tmp_page) 957 return NULL; 958 959 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 960 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 961 if (domain->use_first_level) 962 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 963 964 if (cmpxchg64(&pte->val, 0ULL, pteval)) 965 /* Someone else set it while we were thinking; use theirs. 
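				 * (a non-zero cmpxchg64() return means the slot was
				 * already populated, so we free our new page and reuse
				 * the existing entry.)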
*/ 966 free_pgtable_page(tmp_page); 967 else 968 domain_flush_cache(domain, pte, sizeof(*pte)); 969 } 970 if (level == 1) 971 break; 972 973 parent = phys_to_virt(dma_pte_addr(pte)); 974 level--; 975 } 976 977 if (!*target_level) 978 *target_level = level; 979 980 return pte; 981 } 982 983 /* return address's pte at specific level */ 984 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 985 unsigned long pfn, 986 int level, int *large_page) 987 { 988 struct dma_pte *parent, *pte; 989 int total = agaw_to_level(domain->agaw); 990 int offset; 991 992 parent = domain->pgd; 993 while (level <= total) { 994 offset = pfn_level_offset(pfn, total); 995 pte = &parent[offset]; 996 if (level == total) 997 return pte; 998 999 if (!dma_pte_present(pte)) { 1000 *large_page = total; 1001 break; 1002 } 1003 1004 if (dma_pte_superpage(pte)) { 1005 *large_page = total; 1006 return pte; 1007 } 1008 1009 parent = phys_to_virt(dma_pte_addr(pte)); 1010 total--; 1011 } 1012 return NULL; 1013 } 1014 1015 /* clear last level pte, a tlb flush should be followed */ 1016 static void dma_pte_clear_range(struct dmar_domain *domain, 1017 unsigned long start_pfn, 1018 unsigned long last_pfn) 1019 { 1020 unsigned int large_page; 1021 struct dma_pte *first_pte, *pte; 1022 1023 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || 1024 WARN_ON(start_pfn > last_pfn)) 1025 return; 1026 1027 /* we don't need lock here; nobody else touches the iova range */ 1028 do { 1029 large_page = 1; 1030 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1031 if (!pte) { 1032 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1033 continue; 1034 } 1035 do { 1036 dma_clear_pte(pte); 1037 start_pfn += lvl_to_nr_pages(large_page); 1038 pte++; 1039 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1040 1041 domain_flush_cache(domain, first_pte, 1042 (void *)pte - (void *)first_pte); 1043 1044 } while (start_pfn && start_pfn <= last_pfn); 1045 } 1046 1047 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1048 int retain_level, struct dma_pte *pte, 1049 unsigned long pfn, unsigned long start_pfn, 1050 unsigned long last_pfn) 1051 { 1052 pfn = max(start_pfn, pfn); 1053 pte = &pte[pfn_level_offset(pfn, level)]; 1054 1055 do { 1056 unsigned long level_pfn; 1057 struct dma_pte *level_pte; 1058 1059 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1060 goto next; 1061 1062 level_pfn = pfn & level_mask(level); 1063 level_pte = phys_to_virt(dma_pte_addr(pte)); 1064 1065 if (level > 2) { 1066 dma_pte_free_level(domain, level - 1, retain_level, 1067 level_pte, level_pfn, start_pfn, 1068 last_pfn); 1069 } 1070 1071 /* 1072 * Free the page table if we're below the level we want to 1073 * retain and the range covers the entire table. 1074 */ 1075 if (level < retain_level && !(start_pfn > level_pfn || 1076 last_pfn < level_pfn + level_size(level) - 1)) { 1077 dma_clear_pte(pte); 1078 domain_flush_cache(domain, pte, sizeof(*pte)); 1079 free_pgtable_page(level_pte); 1080 } 1081 next: 1082 pfn += level_size(level); 1083 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1084 } 1085 1086 /* 1087 * clear last level (leaf) ptes and free page table pages below the 1088 * level we wish to keep intact. 
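 * Only page-table pages strictly below retain_level whose pfn range is
 * entirely covered by [start_pfn, last_pfn] are freed; tables at or above
 * retain_level are left in place for reuse.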
1089 */ 1090 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1091 unsigned long start_pfn, 1092 unsigned long last_pfn, 1093 int retain_level) 1094 { 1095 dma_pte_clear_range(domain, start_pfn, last_pfn); 1096 1097 /* We don't need lock here; nobody else touches the iova range */ 1098 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1099 domain->pgd, 0, start_pfn, last_pfn); 1100 1101 /* free pgd */ 1102 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1103 free_pgtable_page(domain->pgd); 1104 domain->pgd = NULL; 1105 } 1106 } 1107 1108 /* When a page at a given level is being unlinked from its parent, we don't 1109 need to *modify* it at all. All we need to do is make a list of all the 1110 pages which can be freed just as soon as we've flushed the IOTLB and we 1111 know the hardware page-walk will no longer touch them. 1112 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1113 be freed. */ 1114 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1115 int level, struct dma_pte *pte, 1116 struct list_head *freelist) 1117 { 1118 struct page *pg; 1119 1120 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1121 list_add_tail(&pg->lru, freelist); 1122 1123 if (level == 1) 1124 return; 1125 1126 pte = page_address(pg); 1127 do { 1128 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1129 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1130 pte++; 1131 } while (!first_pte_in_page(pte)); 1132 } 1133 1134 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1135 struct dma_pte *pte, unsigned long pfn, 1136 unsigned long start_pfn, unsigned long last_pfn, 1137 struct list_head *freelist) 1138 { 1139 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1140 1141 pfn = max(start_pfn, pfn); 1142 pte = &pte[pfn_level_offset(pfn, level)]; 1143 1144 do { 1145 unsigned long level_pfn = pfn & level_mask(level); 1146 1147 if (!dma_pte_present(pte)) 1148 goto next; 1149 1150 /* If range covers entire pagetable, free it */ 1151 if (start_pfn <= level_pfn && 1152 last_pfn >= level_pfn + level_size(level) - 1) { 1153 /* These suborbinate page tables are going away entirely. Don't 1154 bother to clear them; we're just going to *free* them. */ 1155 if (level > 1 && !dma_pte_superpage(pte)) 1156 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1157 1158 dma_clear_pte(pte); 1159 if (!first_pte) 1160 first_pte = pte; 1161 last_pte = pte; 1162 } else if (level > 1) { 1163 /* Recurse down into a level that isn't *entirely* obsolete */ 1164 dma_pte_clear_level(domain, level - 1, 1165 phys_to_virt(dma_pte_addr(pte)), 1166 level_pfn, start_pfn, last_pfn, 1167 freelist); 1168 } 1169 next: 1170 pfn = level_pfn + level_size(level); 1171 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1172 1173 if (first_pte) 1174 domain_flush_cache(domain, first_pte, 1175 (void *)++last_pte - (void *)first_pte); 1176 } 1177 1178 /* We can't just free the pages because the IOMMU may still be walking 1179 the page tables, and may have cached the intermediate levels. The 1180 pages can only be freed after the IOTLB flush has been done. 
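   (domain_unmap() below therefore only collects the pages on the caller's
   freelist; domain_exit() hands that list to put_pages_list() once it is
   safe to free them.)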
 */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;

	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);
	iommu->root_entry = root;

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
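	 * (Only when the ESRTPS capability is reported, though; otherwise the
	 * global context, PASID and IOTLB invalidations below are still needed.)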
1242 */ 1243 if (cap_esrtps(iommu->cap)) 1244 return; 1245 1246 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1247 if (sm_supported(iommu)) 1248 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1249 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1250 } 1251 1252 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1253 { 1254 u32 val; 1255 unsigned long flag; 1256 1257 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1258 return; 1259 1260 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1261 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1262 1263 /* Make sure hardware complete it */ 1264 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1265 readl, (!(val & DMA_GSTS_WBFS)), val); 1266 1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1268 } 1269 1270 /* return value determine if we need a write buffer flush */ 1271 static void __iommu_flush_context(struct intel_iommu *iommu, 1272 u16 did, u16 source_id, u8 function_mask, 1273 u64 type) 1274 { 1275 u64 val = 0; 1276 unsigned long flag; 1277 1278 switch (type) { 1279 case DMA_CCMD_GLOBAL_INVL: 1280 val = DMA_CCMD_GLOBAL_INVL; 1281 break; 1282 case DMA_CCMD_DOMAIN_INVL: 1283 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1284 break; 1285 case DMA_CCMD_DEVICE_INVL: 1286 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1287 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1288 break; 1289 default: 1290 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", 1291 iommu->name, type); 1292 return; 1293 } 1294 val |= DMA_CCMD_ICC; 1295 1296 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1297 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1298 1299 /* Make sure hardware complete it */ 1300 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1301 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1302 1303 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1304 } 1305 1306 /* return value determine if we need a write buffer flush */ 1307 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1308 u64 addr, unsigned int size_order, u64 type) 1309 { 1310 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1311 u64 val = 0, val_iva = 0; 1312 unsigned long flag; 1313 1314 switch (type) { 1315 case DMA_TLB_GLOBAL_FLUSH: 1316 /* global flush doesn't need set IVA_REG */ 1317 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1318 break; 1319 case DMA_TLB_DSI_FLUSH: 1320 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1321 break; 1322 case DMA_TLB_PSI_FLUSH: 1323 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1324 /* IH bit is passed in as part of address */ 1325 val_iva = size_order | addr; 1326 break; 1327 default: 1328 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", 1329 iommu->name, type); 1330 return; 1331 } 1332 1333 if (cap_write_drain(iommu->cap)) 1334 val |= DMA_TLB_WRITE_DRAIN; 1335 1336 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1337 /* Note: Only uses first TLB reg currently */ 1338 if (val_iva) 1339 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1340 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1341 1342 /* Make sure hardware complete it */ 1343 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1344 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1345 1346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1347 1348 /* check IOTLB invalidation granularity */ 1349 if (DMA_TLB_IAIG(val) == 0) 1350 pr_err("Flush IOTLB failed\n"); 1351 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1352 pr_debug("TLB flush request %Lx, actual %Lx\n", 1353 (unsigned long long)DMA_TLB_IIRG(type), 
1354 (unsigned long long)DMA_TLB_IAIG(val)); 1355 } 1356 1357 static struct device_domain_info * 1358 domain_lookup_dev_info(struct dmar_domain *domain, 1359 struct intel_iommu *iommu, u8 bus, u8 devfn) 1360 { 1361 struct device_domain_info *info; 1362 unsigned long flags; 1363 1364 spin_lock_irqsave(&domain->lock, flags); 1365 list_for_each_entry(info, &domain->devices, link) { 1366 if (info->iommu == iommu && info->bus == bus && 1367 info->devfn == devfn) { 1368 spin_unlock_irqrestore(&domain->lock, flags); 1369 return info; 1370 } 1371 } 1372 spin_unlock_irqrestore(&domain->lock, flags); 1373 1374 return NULL; 1375 } 1376 1377 static void domain_update_iotlb(struct dmar_domain *domain) 1378 { 1379 struct dev_pasid_info *dev_pasid; 1380 struct device_domain_info *info; 1381 bool has_iotlb_device = false; 1382 unsigned long flags; 1383 1384 spin_lock_irqsave(&domain->lock, flags); 1385 list_for_each_entry(info, &domain->devices, link) { 1386 if (info->ats_enabled) { 1387 has_iotlb_device = true; 1388 break; 1389 } 1390 } 1391 1392 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1393 info = dev_iommu_priv_get(dev_pasid->dev); 1394 if (info->ats_enabled) { 1395 has_iotlb_device = true; 1396 break; 1397 } 1398 } 1399 domain->has_iotlb_device = has_iotlb_device; 1400 spin_unlock_irqrestore(&domain->lock, flags); 1401 } 1402 1403 /* 1404 * The extra devTLB flush quirk impacts those QAT devices with PCI device 1405 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() 1406 * check because it applies only to the built-in QAT devices and it doesn't 1407 * grant additional privileges. 1408 */ 1409 #define BUGGY_QAT_DEVID_MASK 0x4940 1410 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) 1411 { 1412 if (pdev->vendor != PCI_VENDOR_ID_INTEL) 1413 return false; 1414 1415 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) 1416 return false; 1417 1418 return true; 1419 } 1420 1421 static void iommu_enable_pci_caps(struct device_domain_info *info) 1422 { 1423 struct pci_dev *pdev; 1424 1425 if (!dev_is_pci(info->dev)) 1426 return; 1427 1428 pdev = to_pci_dev(info->dev); 1429 1430 /* The PCIe spec, in its wisdom, declares that the behaviour of 1431 the device if you enable PASID support after ATS support is 1432 undefined. So always enable PASID support on devices which 1433 have it, even if we can't yet know if we're ever going to 1434 use it. 
*/ 1435 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1436 info->pasid_enabled = 1; 1437 1438 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1439 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1440 info->ats_enabled = 1; 1441 domain_update_iotlb(info->domain); 1442 } 1443 } 1444 1445 static void iommu_disable_pci_caps(struct device_domain_info *info) 1446 { 1447 struct pci_dev *pdev; 1448 1449 if (!dev_is_pci(info->dev)) 1450 return; 1451 1452 pdev = to_pci_dev(info->dev); 1453 1454 if (info->ats_enabled) { 1455 pci_disable_ats(pdev); 1456 info->ats_enabled = 0; 1457 domain_update_iotlb(info->domain); 1458 } 1459 1460 if (info->pasid_enabled) { 1461 pci_disable_pasid(pdev); 1462 info->pasid_enabled = 0; 1463 } 1464 } 1465 1466 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1467 u64 addr, unsigned int mask) 1468 { 1469 u16 sid, qdep; 1470 1471 if (!info || !info->ats_enabled) 1472 return; 1473 1474 sid = info->bus << 8 | info->devfn; 1475 qdep = info->ats_qdep; 1476 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1477 qdep, addr, mask); 1478 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); 1479 } 1480 1481 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1482 u64 addr, unsigned mask) 1483 { 1484 struct dev_pasid_info *dev_pasid; 1485 struct device_domain_info *info; 1486 unsigned long flags; 1487 1488 if (!domain->has_iotlb_device) 1489 return; 1490 1491 spin_lock_irqsave(&domain->lock, flags); 1492 list_for_each_entry(info, &domain->devices, link) 1493 __iommu_flush_dev_iotlb(info, addr, mask); 1494 1495 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 1496 info = dev_iommu_priv_get(dev_pasid->dev); 1497 1498 if (!info->ats_enabled) 1499 continue; 1500 1501 qi_flush_dev_iotlb_pasid(info->iommu, 1502 PCI_DEVID(info->bus, info->devfn), 1503 info->pfsid, dev_pasid->pasid, 1504 info->ats_qdep, addr, 1505 mask); 1506 } 1507 spin_unlock_irqrestore(&domain->lock, flags); 1508 } 1509 1510 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu, 1511 struct dmar_domain *domain, u64 addr, 1512 unsigned long npages, bool ih) 1513 { 1514 u16 did = domain_id_iommu(domain, iommu); 1515 struct dev_pasid_info *dev_pasid; 1516 unsigned long flags; 1517 1518 spin_lock_irqsave(&domain->lock, flags); 1519 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) 1520 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih); 1521 1522 if (!list_empty(&domain->devices)) 1523 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih); 1524 spin_unlock_irqrestore(&domain->lock, flags); 1525 } 1526 1527 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1528 struct dmar_domain *domain, 1529 unsigned long pfn, unsigned int pages, 1530 int ih, int map) 1531 { 1532 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1533 unsigned int mask = ilog2(aligned_pages); 1534 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1535 u16 did = domain_id_iommu(domain, iommu); 1536 1537 if (WARN_ON(!pages)) 1538 return; 1539 1540 if (ih) 1541 ih = 1 << 6; 1542 1543 if (domain->use_first_level) { 1544 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih); 1545 } else { 1546 unsigned long bitmask = aligned_pages - 1; 1547 1548 /* 1549 * PSI masks the low order bits of the base address. If the 1550 * address isn't aligned to the mask, then compute a mask value 1551 * needed to ensure the target range is flushed. 
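		 * e.g. flushing two pages at pfn 0x11d: with mask 1 the hardware
		 * would invalidate the aligned pair 0x11c-0x11d and miss 0x11e,
		 * so the mask is widened to 2 and the aligned block 0x11c-0x11f
		 * is flushed instead.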
1552 */ 1553 if (unlikely(bitmask & pfn)) { 1554 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1555 1556 /* 1557 * Since end_pfn <= pfn + bitmask, the only way bits 1558 * higher than bitmask can differ in pfn and end_pfn is 1559 * by carrying. This means after masking out bitmask, 1560 * high bits starting with the first set bit in 1561 * shared_bits are all equal in both pfn and end_pfn. 1562 */ 1563 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1564 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1565 } 1566 1567 /* 1568 * Fallback to domain selective flush if no PSI support or 1569 * the size is too big. 1570 */ 1571 if (!cap_pgsel_inv(iommu->cap) || 1572 mask > cap_max_amask_val(iommu->cap)) 1573 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1574 DMA_TLB_DSI_FLUSH); 1575 else 1576 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1577 DMA_TLB_PSI_FLUSH); 1578 } 1579 1580 /* 1581 * In caching mode, changes of pages from non-present to present require 1582 * flush. However, device IOTLB doesn't need to be flushed in this case. 1583 */ 1584 if (!cap_caching_mode(iommu->cap) || !map) 1585 iommu_flush_dev_iotlb(domain, addr, mask); 1586 } 1587 1588 /* Notification for newly created mappings */ 1589 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1590 struct dmar_domain *domain, 1591 unsigned long pfn, unsigned int pages) 1592 { 1593 /* 1594 * It's a non-present to present mapping. Only flush if caching mode 1595 * and second level. 1596 */ 1597 if (cap_caching_mode(iommu->cap) && !domain->use_first_level) 1598 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1599 else 1600 iommu_flush_write_buffer(iommu); 1601 } 1602 1603 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1604 { 1605 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1606 struct iommu_domain_info *info; 1607 unsigned long idx; 1608 1609 xa_for_each(&dmar_domain->iommu_array, idx, info) { 1610 struct intel_iommu *iommu = info->iommu; 1611 u16 did = domain_id_iommu(dmar_domain, iommu); 1612 1613 if (dmar_domain->use_first_level) 1614 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0); 1615 else 1616 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1617 DMA_TLB_DSI_FLUSH); 1618 1619 if (!cap_caching_mode(iommu->cap)) 1620 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1621 } 1622 } 1623 1624 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1625 { 1626 u32 pmen; 1627 unsigned long flags; 1628 1629 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1630 return; 1631 1632 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1633 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1634 pmen &= ~DMA_PMEN_EPM; 1635 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1636 1637 /* wait for the protected region status bit to clear */ 1638 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1639 readl, !(pmen & DMA_PMEN_PRS), pmen); 1640 1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1642 } 1643 1644 static void iommu_enable_translation(struct intel_iommu *iommu) 1645 { 1646 u32 sts; 1647 unsigned long flags; 1648 1649 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1650 iommu->gcmd |= DMA_GCMD_TE; 1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1652 1653 /* Make sure hardware complete it */ 1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1655 readl, (sts & DMA_GSTS_TES), sts); 1656 1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1658 } 1659 1660 static void iommu_disable_translation(struct intel_iommu *iommu) 1661 { 1662 u32 sts; 1663 
unsigned long flag; 1664 1665 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1666 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1667 return; 1668 1669 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1670 iommu->gcmd &= ~DMA_GCMD_TE; 1671 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1672 1673 /* Make sure hardware complete it */ 1674 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1675 readl, (!(sts & DMA_GSTS_TES)), sts); 1676 1677 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1678 } 1679 1680 static int iommu_init_domains(struct intel_iommu *iommu) 1681 { 1682 u32 ndomains; 1683 1684 ndomains = cap_ndoms(iommu->cap); 1685 pr_debug("%s: Number of Domains supported <%d>\n", 1686 iommu->name, ndomains); 1687 1688 spin_lock_init(&iommu->lock); 1689 1690 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1691 if (!iommu->domain_ids) 1692 return -ENOMEM; 1693 1694 /* 1695 * If Caching mode is set, then invalid translations are tagged 1696 * with domain-id 0, hence we need to pre-allocate it. We also 1697 * use domain-id 0 as a marker for non-allocated domain-id, so 1698 * make sure it is not used for a real domain. 1699 */ 1700 set_bit(0, iommu->domain_ids); 1701 1702 /* 1703 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1704 * entry for first-level or pass-through translation modes should 1705 * be programmed with a domain id different from those used for 1706 * second-level or nested translation. We reserve a domain id for 1707 * this purpose. This domain id is also used for identity domain 1708 * in legacy mode. 1709 */ 1710 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1711 1712 return 0; 1713 } 1714 1715 static void disable_dmar_iommu(struct intel_iommu *iommu) 1716 { 1717 if (!iommu->domain_ids) 1718 return; 1719 1720 /* 1721 * All iommu domains must have been detached from the devices, 1722 * hence there should be no domain IDs in use. 1723 */ 1724 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) 1725 > NUM_RESERVED_DID)) 1726 return; 1727 1728 if (iommu->gcmd & DMA_GCMD_TE) 1729 iommu_disable_translation(iommu); 1730 } 1731 1732 static void free_dmar_iommu(struct intel_iommu *iommu) 1733 { 1734 if (iommu->domain_ids) { 1735 bitmap_free(iommu->domain_ids); 1736 iommu->domain_ids = NULL; 1737 } 1738 1739 if (iommu->copied_tables) { 1740 bitmap_free(iommu->copied_tables); 1741 iommu->copied_tables = NULL; 1742 } 1743 1744 /* free context mapping */ 1745 free_context_table(iommu); 1746 1747 #ifdef CONFIG_INTEL_IOMMU_SVM 1748 if (pasid_supported(iommu)) { 1749 if (ecap_prs(iommu->ecap)) 1750 intel_svm_finish_prq(iommu); 1751 } 1752 #endif 1753 } 1754 1755 /* 1756 * Check and return whether first level is used by default for 1757 * DMA translation. 
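 * (When both levels are usable, kernel-managed DMA domains get first-level
 * translation while IOMMU_DOMAIN_UNMANAGED domains keep second-level; see
 * the domain type check at the end of the function.)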
1758 */ 1759 static bool first_level_by_default(unsigned int type) 1760 { 1761 /* Only SL is available in legacy mode */ 1762 if (!scalable_mode_support()) 1763 return false; 1764 1765 /* Only level (either FL or SL) is available, just use it */ 1766 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1767 return intel_cap_flts_sanity(); 1768 1769 /* Both levels are available, decide it based on domain type */ 1770 return type != IOMMU_DOMAIN_UNMANAGED; 1771 } 1772 1773 static struct dmar_domain *alloc_domain(unsigned int type) 1774 { 1775 struct dmar_domain *domain; 1776 1777 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1778 if (!domain) 1779 return NULL; 1780 1781 domain->nid = NUMA_NO_NODE; 1782 if (first_level_by_default(type)) 1783 domain->use_first_level = true; 1784 domain->has_iotlb_device = false; 1785 INIT_LIST_HEAD(&domain->devices); 1786 INIT_LIST_HEAD(&domain->dev_pasids); 1787 spin_lock_init(&domain->lock); 1788 xa_init(&domain->iommu_array); 1789 1790 return domain; 1791 } 1792 1793 static int domain_attach_iommu(struct dmar_domain *domain, 1794 struct intel_iommu *iommu) 1795 { 1796 struct iommu_domain_info *info, *curr; 1797 unsigned long ndomains; 1798 int num, ret = -ENOSPC; 1799 1800 info = kzalloc(sizeof(*info), GFP_KERNEL); 1801 if (!info) 1802 return -ENOMEM; 1803 1804 spin_lock(&iommu->lock); 1805 curr = xa_load(&domain->iommu_array, iommu->seq_id); 1806 if (curr) { 1807 curr->refcnt++; 1808 spin_unlock(&iommu->lock); 1809 kfree(info); 1810 return 0; 1811 } 1812 1813 ndomains = cap_ndoms(iommu->cap); 1814 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1815 if (num >= ndomains) { 1816 pr_err("%s: No free domain ids\n", iommu->name); 1817 goto err_unlock; 1818 } 1819 1820 set_bit(num, iommu->domain_ids); 1821 info->refcnt = 1; 1822 info->did = num; 1823 info->iommu = iommu; 1824 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, 1825 NULL, info, GFP_ATOMIC); 1826 if (curr) { 1827 ret = xa_err(curr) ? : -EBUSY; 1828 goto err_clear; 1829 } 1830 domain_update_iommu_cap(domain); 1831 1832 spin_unlock(&iommu->lock); 1833 return 0; 1834 1835 err_clear: 1836 clear_bit(info->did, iommu->domain_ids); 1837 err_unlock: 1838 spin_unlock(&iommu->lock); 1839 kfree(info); 1840 return ret; 1841 } 1842 1843 static void domain_detach_iommu(struct dmar_domain *domain, 1844 struct intel_iommu *iommu) 1845 { 1846 struct iommu_domain_info *info; 1847 1848 spin_lock(&iommu->lock); 1849 info = xa_load(&domain->iommu_array, iommu->seq_id); 1850 if (--info->refcnt == 0) { 1851 clear_bit(info->did, iommu->domain_ids); 1852 xa_erase(&domain->iommu_array, iommu->seq_id); 1853 domain->nid = NUMA_NO_NODE; 1854 domain_update_iommu_cap(domain); 1855 kfree(info); 1856 } 1857 spin_unlock(&iommu->lock); 1858 } 1859 1860 static inline int guestwidth_to_adjustwidth(int gaw) 1861 { 1862 int agaw; 1863 int r = (gaw - 12) % 9; 1864 1865 if (r == 0) 1866 agaw = gaw; 1867 else 1868 agaw = gaw + 9 - r; 1869 if (agaw > 64) 1870 agaw = 64; 1871 return agaw; 1872 } 1873 1874 static void domain_exit(struct dmar_domain *domain) 1875 { 1876 if (domain->pgd) { 1877 LIST_HEAD(freelist); 1878 1879 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1880 put_pages_list(&freelist); 1881 } 1882 1883 if (WARN_ON(!list_empty(&domain->devices))) 1884 return; 1885 1886 kfree(domain); 1887 } 1888 1889 /* 1890 * Get the PASID directory size for scalable mode context entry. 1891 * Value of X in the PDTS field of a scalable mode context entry 1892 * indicates PASID directory with 2^(X + 7) entries. 
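 * e.g. the minimum encoding 0 describes 128 directory entries and the
 * maximum encoding 7 describes 1 << 14 entries, enough to cover the full
 * 20-bit PASID space.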
1893 */ 1894 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1895 { 1896 unsigned long pds, max_pde; 1897 1898 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1899 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1900 if (pds < 7) 1901 return 0; 1902 1903 return pds - 7; 1904 } 1905 1906 /* 1907 * Set the RID_PASID field of a scalable mode context entry. The 1908 * IOMMU hardware will use the PASID value set in this field for 1909 * DMA translations of DMA requests without PASID. 1910 */ 1911 static inline void 1912 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1913 { 1914 context->hi |= pasid & ((1 << 20) - 1); 1915 } 1916 1917 /* 1918 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1919 * entry. 1920 */ 1921 static inline void context_set_sm_dte(struct context_entry *context) 1922 { 1923 context->lo |= BIT_ULL(2); 1924 } 1925 1926 /* 1927 * Set the PRE(Page Request Enable) field of a scalable mode context 1928 * entry. 1929 */ 1930 static inline void context_set_sm_pre(struct context_entry *context) 1931 { 1932 context->lo |= BIT_ULL(4); 1933 } 1934 1935 /* Convert value to context PASID directory size field coding. */ 1936 #define context_pdts(pds) (((pds) & 0x7) << 9) 1937 1938 static int domain_context_mapping_one(struct dmar_domain *domain, 1939 struct intel_iommu *iommu, 1940 struct pasid_table *table, 1941 u8 bus, u8 devfn) 1942 { 1943 struct device_domain_info *info = 1944 domain_lookup_dev_info(domain, iommu, bus, devfn); 1945 u16 did = domain_id_iommu(domain, iommu); 1946 int translation = CONTEXT_TT_MULTI_LEVEL; 1947 struct context_entry *context; 1948 int ret; 1949 1950 if (hw_pass_through && domain_type_is_si(domain)) 1951 translation = CONTEXT_TT_PASS_THROUGH; 1952 1953 pr_debug("Set context mapping for %02x:%02x.%d\n", 1954 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1955 1956 spin_lock(&iommu->lock); 1957 ret = -ENOMEM; 1958 context = iommu_context_addr(iommu, bus, devfn, 1); 1959 if (!context) 1960 goto out_unlock; 1961 1962 ret = 0; 1963 if (context_present(context) && !context_copied(iommu, bus, devfn)) 1964 goto out_unlock; 1965 1966 /* 1967 * For kdump cases, old valid entries may be cached due to the 1968 * in-flight DMA and copied pgtable, but there is no unmapping 1969 * behaviour for them, thus we need an explicit cache flush for 1970 * the newly-mapped device. For kdump, at this point, the device 1971 * is supposed to finish reset at its driver probe stage, so no 1972 * in-flight DMA will exist, and we don't need to worry anymore 1973 * hereafter. 
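	 * (The flush below therefore targets the stale domain id still present
	 * in the copied entry before the entry is rewritten.)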
1974 */ 1975 if (context_copied(iommu, bus, devfn)) { 1976 u16 did_old = context_domain_id(context); 1977 1978 if (did_old < cap_ndoms(iommu->cap)) { 1979 iommu->flush.flush_context(iommu, did_old, 1980 (((u16)bus) << 8) | devfn, 1981 DMA_CCMD_MASK_NOBIT, 1982 DMA_CCMD_DEVICE_INVL); 1983 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1984 DMA_TLB_DSI_FLUSH); 1985 } 1986 1987 clear_context_copied(iommu, bus, devfn); 1988 } 1989 1990 context_clear_entry(context); 1991 1992 if (sm_supported(iommu)) { 1993 unsigned long pds; 1994 1995 /* Setup the PASID DIR pointer: */ 1996 pds = context_get_sm_pds(table); 1997 context->lo = (u64)virt_to_phys(table->table) | 1998 context_pdts(pds); 1999 2000 /* Setup the RID_PASID field: */ 2001 context_set_sm_rid2pasid(context, IOMMU_NO_PASID); 2002 2003 /* 2004 * Setup the Device-TLB enable bit and Page request 2005 * Enable bit: 2006 */ 2007 if (info && info->ats_supported) 2008 context_set_sm_dte(context); 2009 if (info && info->pri_supported) 2010 context_set_sm_pre(context); 2011 if (info && info->pasid_supported) 2012 context_set_pasid(context); 2013 } else { 2014 struct dma_pte *pgd = domain->pgd; 2015 int agaw; 2016 2017 context_set_domain_id(context, did); 2018 2019 if (translation != CONTEXT_TT_PASS_THROUGH) { 2020 /* 2021 * Skip top levels of page tables for iommu which has 2022 * less agaw than default. Unnecessary for PT mode. 2023 */ 2024 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2025 ret = -ENOMEM; 2026 pgd = phys_to_virt(dma_pte_addr(pgd)); 2027 if (!dma_pte_present(pgd)) 2028 goto out_unlock; 2029 } 2030 2031 if (info && info->ats_supported) 2032 translation = CONTEXT_TT_DEV_IOTLB; 2033 else 2034 translation = CONTEXT_TT_MULTI_LEVEL; 2035 2036 context_set_address_root(context, virt_to_phys(pgd)); 2037 context_set_address_width(context, agaw); 2038 } else { 2039 /* 2040 * In pass through mode, AW must be programmed to 2041 * indicate the largest AGAW value supported by 2042 * hardware. And ASR is ignored by hardware. 2043 */ 2044 context_set_address_width(context, iommu->msagaw); 2045 } 2046 2047 context_set_translation_type(context, translation); 2048 } 2049 2050 context_set_fault_enable(context); 2051 context_set_present(context); 2052 if (!ecap_coherent(iommu->ecap)) 2053 clflush_cache_range(context, sizeof(*context)); 2054 2055 /* 2056 * It's a non-present to present mapping. If hardware doesn't cache 2057 * non-present entry we only need to flush the write-buffer. 
If the 2058 * _does_ cache non-present entries, then it does so in the special 2059 * domain #0, which we have to flush: 2060 */ 2061 if (cap_caching_mode(iommu->cap)) { 2062 iommu->flush.flush_context(iommu, 0, 2063 (((u16)bus) << 8) | devfn, 2064 DMA_CCMD_MASK_NOBIT, 2065 DMA_CCMD_DEVICE_INVL); 2066 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2067 } else { 2068 iommu_flush_write_buffer(iommu); 2069 } 2070 2071 ret = 0; 2072 2073 out_unlock: 2074 spin_unlock(&iommu->lock); 2075 2076 return ret; 2077 } 2078 2079 struct domain_context_mapping_data { 2080 struct dmar_domain *domain; 2081 struct intel_iommu *iommu; 2082 struct pasid_table *table; 2083 }; 2084 2085 static int domain_context_mapping_cb(struct pci_dev *pdev, 2086 u16 alias, void *opaque) 2087 { 2088 struct domain_context_mapping_data *data = opaque; 2089 2090 return domain_context_mapping_one(data->domain, data->iommu, 2091 data->table, PCI_BUS_NUM(alias), 2092 alias & 0xff); 2093 } 2094 2095 static int 2096 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2097 { 2098 struct domain_context_mapping_data data; 2099 struct pasid_table *table; 2100 struct intel_iommu *iommu; 2101 u8 bus, devfn; 2102 2103 iommu = device_to_iommu(dev, &bus, &devfn); 2104 if (!iommu) 2105 return -ENODEV; 2106 2107 table = intel_pasid_get_table(dev); 2108 2109 if (!dev_is_pci(dev)) 2110 return domain_context_mapping_one(domain, iommu, table, 2111 bus, devfn); 2112 2113 data.domain = domain; 2114 data.iommu = iommu; 2115 data.table = table; 2116 2117 return pci_for_each_dma_alias(to_pci_dev(dev), 2118 &domain_context_mapping_cb, &data); 2119 } 2120 2121 /* Returns a number of VTD pages, but aligned to MM page size */ 2122 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2123 size_t size) 2124 { 2125 host_addr &= ~PAGE_MASK; 2126 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2127 } 2128 2129 /* Return largest possible superpage level for a given mapping */ 2130 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2131 unsigned long iov_pfn, 2132 unsigned long phy_pfn, 2133 unsigned long pages) 2134 { 2135 int support, level = 1; 2136 unsigned long pfnmerge; 2137 2138 support = domain->iommu_superpage; 2139 2140 /* To use a large page, the virtual *and* physical addresses 2141 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2142 of them will mean we have to use smaller pages. So just 2143 merge them and check both at once. */ 2144 pfnmerge = iov_pfn | phy_pfn; 2145 2146 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2147 pages >>= VTD_STRIDE_SHIFT; 2148 if (!pages) 2149 break; 2150 pfnmerge >>= VTD_STRIDE_SHIFT; 2151 level++; 2152 support--; 2153 } 2154 return level; 2155 } 2156 2157 /* 2158 * Ensure that old small page tables are removed to make room for superpage(s). 2159 * We're going to add new large pages, so make sure we don't remove their parent 2160 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
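 *
 * Illustrative numbers only: promoting a range to a level-2
 * superpage means each PTE at that level spans lvl_to_nr_pages(2) =
 * 512 VT-d pages (2MiB with 4KiB VT-d pages), so the now-redundant
 * lower-level table under each such PTE is freed and the affected
 * range is flushed on every IOMMU the domain is attached to.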
2161 */ 2162 static void switch_to_super_page(struct dmar_domain *domain, 2163 unsigned long start_pfn, 2164 unsigned long end_pfn, int level) 2165 { 2166 unsigned long lvl_pages = lvl_to_nr_pages(level); 2167 struct iommu_domain_info *info; 2168 struct dma_pte *pte = NULL; 2169 unsigned long i; 2170 2171 while (start_pfn <= end_pfn) { 2172 if (!pte) 2173 pte = pfn_to_dma_pte(domain, start_pfn, &level, 2174 GFP_ATOMIC); 2175 2176 if (dma_pte_present(pte)) { 2177 dma_pte_free_pagetable(domain, start_pfn, 2178 start_pfn + lvl_pages - 1, 2179 level + 1); 2180 2181 xa_for_each(&domain->iommu_array, i, info) 2182 iommu_flush_iotlb_psi(info->iommu, domain, 2183 start_pfn, lvl_pages, 2184 0, 0); 2185 } 2186 2187 pte++; 2188 start_pfn += lvl_pages; 2189 if (first_pte_in_page(pte)) 2190 pte = NULL; 2191 } 2192 } 2193 2194 static int 2195 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2196 unsigned long phys_pfn, unsigned long nr_pages, int prot, 2197 gfp_t gfp) 2198 { 2199 struct dma_pte *first_pte = NULL, *pte = NULL; 2200 unsigned int largepage_lvl = 0; 2201 unsigned long lvl_pages = 0; 2202 phys_addr_t pteval; 2203 u64 attr; 2204 2205 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) 2206 return -EINVAL; 2207 2208 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2209 return -EINVAL; 2210 2211 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2212 attr |= DMA_FL_PTE_PRESENT; 2213 if (domain->use_first_level) { 2214 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2215 if (prot & DMA_PTE_WRITE) 2216 attr |= DMA_FL_PTE_DIRTY; 2217 } 2218 2219 domain->has_mappings = true; 2220 2221 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2222 2223 while (nr_pages > 0) { 2224 uint64_t tmp; 2225 2226 if (!pte) { 2227 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2228 phys_pfn, nr_pages); 2229 2230 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, 2231 gfp); 2232 if (!pte) 2233 return -ENOMEM; 2234 first_pte = pte; 2235 2236 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2237 2238 /* It is large page*/ 2239 if (largepage_lvl > 1) { 2240 unsigned long end_pfn; 2241 unsigned long pages_to_remove; 2242 2243 pteval |= DMA_PTE_LARGE_PAGE; 2244 pages_to_remove = min_t(unsigned long, nr_pages, 2245 nr_pte_to_next_page(pte) * lvl_pages); 2246 end_pfn = iov_pfn + pages_to_remove - 1; 2247 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2248 } else { 2249 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2250 } 2251 2252 } 2253 /* We don't need lock here, nobody else 2254 * touches the iova range 2255 */ 2256 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2257 if (tmp) { 2258 static int dumps = 5; 2259 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2260 iov_pfn, tmp, (unsigned long long)pteval); 2261 if (dumps) { 2262 dumps--; 2263 debug_dma_dump_mappings(NULL); 2264 } 2265 WARN_ON(1); 2266 } 2267 2268 nr_pages -= lvl_pages; 2269 iov_pfn += lvl_pages; 2270 phys_pfn += lvl_pages; 2271 pteval += lvl_pages * VTD_PAGE_SIZE; 2272 2273 /* If the next PTE would be the first in a new page, then we 2274 * need to flush the cache on the entries we've just written. 2275 * And then we'll need to recalculate 'pte', so clear it and 2276 * let it get set again in the if (!pte) block above. 2277 * 2278 * If we're done (!nr_pages) we need to flush the cache too. 
2279 * 2280 * Also if we've been setting superpages, we may need to 2281 * recalculate 'pte' and switch back to smaller pages for the 2282 * end of the mapping, if the trailing size is not enough to 2283 * use another superpage (i.e. nr_pages < lvl_pages). 2284 */ 2285 pte++; 2286 if (!nr_pages || first_pte_in_page(pte) || 2287 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2288 domain_flush_cache(domain, first_pte, 2289 (void *)pte - (void *)first_pte); 2290 pte = NULL; 2291 } 2292 } 2293 2294 return 0; 2295 } 2296 2297 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2298 { 2299 struct intel_iommu *iommu = info->iommu; 2300 struct context_entry *context; 2301 u16 did_old; 2302 2303 if (!iommu) 2304 return; 2305 2306 spin_lock(&iommu->lock); 2307 context = iommu_context_addr(iommu, bus, devfn, 0); 2308 if (!context) { 2309 spin_unlock(&iommu->lock); 2310 return; 2311 } 2312 2313 if (sm_supported(iommu)) { 2314 if (hw_pass_through && domain_type_is_si(info->domain)) 2315 did_old = FLPT_DEFAULT_DID; 2316 else 2317 did_old = domain_id_iommu(info->domain, iommu); 2318 } else { 2319 did_old = context_domain_id(context); 2320 } 2321 2322 context_clear_entry(context); 2323 __iommu_flush_cache(iommu, context, sizeof(*context)); 2324 spin_unlock(&iommu->lock); 2325 iommu->flush.flush_context(iommu, 2326 did_old, 2327 (((u16)bus) << 8) | devfn, 2328 DMA_CCMD_MASK_NOBIT, 2329 DMA_CCMD_DEVICE_INVL); 2330 2331 if (sm_supported(iommu)) 2332 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2333 2334 iommu->flush.flush_iotlb(iommu, 2335 did_old, 2336 0, 2337 0, 2338 DMA_TLB_DSI_FLUSH); 2339 2340 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2341 } 2342 2343 static int domain_setup_first_level(struct intel_iommu *iommu, 2344 struct dmar_domain *domain, 2345 struct device *dev, 2346 u32 pasid) 2347 { 2348 struct dma_pte *pgd = domain->pgd; 2349 int agaw, level; 2350 int flags = 0; 2351 2352 /* 2353 * Skip top levels of page tables for iommu which has 2354 * less agaw than default. Unnecessary for PT mode. 
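 *
 * For example (assuming agaw_to_level() of agaw + 2): a domain built
 * with a 5-level, 57-bit table (agaw 3) but attached to an IOMMU
 * whose agaw is 2 descends one level here and hands a 4-level,
 * 48-bit table to the PASID entry; anything that ends up below
 * 4 levels is rejected right after this loop.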
2355 */ 2356 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2357 pgd = phys_to_virt(dma_pte_addr(pgd)); 2358 if (!dma_pte_present(pgd)) 2359 return -ENOMEM; 2360 } 2361 2362 level = agaw_to_level(agaw); 2363 if (level != 4 && level != 5) 2364 return -EINVAL; 2365 2366 if (level == 5) 2367 flags |= PASID_FLAG_FL5LP; 2368 2369 if (domain->force_snooping) 2370 flags |= PASID_FLAG_PAGE_SNOOP; 2371 2372 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2373 domain_id_iommu(domain, iommu), 2374 flags); 2375 } 2376 2377 static bool dev_is_real_dma_subdevice(struct device *dev) 2378 { 2379 return dev && dev_is_pci(dev) && 2380 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2381 } 2382 2383 static int iommu_domain_identity_map(struct dmar_domain *domain, 2384 unsigned long first_vpfn, 2385 unsigned long last_vpfn) 2386 { 2387 /* 2388 * RMRR range might have overlap with physical memory range, 2389 * clear it first 2390 */ 2391 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2392 2393 return __domain_mapping(domain, first_vpfn, 2394 first_vpfn, last_vpfn - first_vpfn + 1, 2395 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); 2396 } 2397 2398 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2399 2400 static int __init si_domain_init(int hw) 2401 { 2402 struct dmar_rmrr_unit *rmrr; 2403 struct device *dev; 2404 int i, nid, ret; 2405 2406 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2407 if (!si_domain) 2408 return -EFAULT; 2409 2410 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2411 domain_exit(si_domain); 2412 si_domain = NULL; 2413 return -EFAULT; 2414 } 2415 2416 if (hw) 2417 return 0; 2418 2419 for_each_online_node(nid) { 2420 unsigned long start_pfn, end_pfn; 2421 int i; 2422 2423 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2424 ret = iommu_domain_identity_map(si_domain, 2425 mm_to_dma_pfn_start(start_pfn), 2426 mm_to_dma_pfn_end(end_pfn-1)); 2427 if (ret) 2428 return ret; 2429 } 2430 } 2431 2432 /* 2433 * Identity map the RMRRs so that devices with RMRRs could also use 2434 * the si_domain. 2435 */ 2436 for_each_rmrr_units(rmrr) { 2437 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2438 i, dev) { 2439 unsigned long long start = rmrr->base_address; 2440 unsigned long long end = rmrr->end_address; 2441 2442 if (WARN_ON(end < start || 2443 end >> agaw_to_width(si_domain->agaw))) 2444 continue; 2445 2446 ret = iommu_domain_identity_map(si_domain, 2447 mm_to_dma_pfn_start(start >> PAGE_SHIFT), 2448 mm_to_dma_pfn_end(end >> PAGE_SHIFT)); 2449 if (ret) 2450 return ret; 2451 } 2452 } 2453 2454 return 0; 2455 } 2456 2457 static int dmar_domain_attach_device(struct dmar_domain *domain, 2458 struct device *dev) 2459 { 2460 struct device_domain_info *info = dev_iommu_priv_get(dev); 2461 struct intel_iommu *iommu; 2462 unsigned long flags; 2463 u8 bus, devfn; 2464 int ret; 2465 2466 iommu = device_to_iommu(dev, &bus, &devfn); 2467 if (!iommu) 2468 return -ENODEV; 2469 2470 ret = domain_attach_iommu(domain, iommu); 2471 if (ret) 2472 return ret; 2473 info->domain = domain; 2474 spin_lock_irqsave(&domain->lock, flags); 2475 list_add(&info->link, &domain->devices); 2476 spin_unlock_irqrestore(&domain->lock, flags); 2477 2478 /* PASID table is mandatory for a PCI device in scalable mode. 
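 * The block below then picks one of three setups for requests
 * without PASID: pass-through for the identity domain when hardware
 * pass-through is available, a first-level table when the domain
 * uses first-level translation, and a second-level table otherwise.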
*/ 2479 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2480 /* Setup the PASID entry for requests without PASID: */ 2481 if (hw_pass_through && domain_type_is_si(domain)) 2482 ret = intel_pasid_setup_pass_through(iommu, domain, 2483 dev, IOMMU_NO_PASID); 2484 else if (domain->use_first_level) 2485 ret = domain_setup_first_level(iommu, domain, dev, 2486 IOMMU_NO_PASID); 2487 else 2488 ret = intel_pasid_setup_second_level(iommu, domain, 2489 dev, IOMMU_NO_PASID); 2490 if (ret) { 2491 dev_err(dev, "Setup RID2PASID failed\n"); 2492 device_block_translation(dev); 2493 return ret; 2494 } 2495 } 2496 2497 ret = domain_context_mapping(domain, dev); 2498 if (ret) { 2499 dev_err(dev, "Domain context map failed\n"); 2500 device_block_translation(dev); 2501 return ret; 2502 } 2503 2504 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) 2505 iommu_enable_pci_caps(info); 2506 2507 return 0; 2508 } 2509 2510 /** 2511 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2512 * is relaxable (ie. is allowed to be not enforced under some conditions) 2513 * @dev: device handle 2514 * 2515 * We assume that PCI USB devices with RMRRs have them largely 2516 * for historical reasons and that the RMRR space is not actively used post 2517 * boot. This exclusion may change if vendors begin to abuse it. 2518 * 2519 * The same exception is made for graphics devices, with the requirement that 2520 * any use of the RMRR regions will be torn down before assigning the device 2521 * to a guest. 2522 * 2523 * Return: true if the RMRR is relaxable, false otherwise 2524 */ 2525 static bool device_rmrr_is_relaxable(struct device *dev) 2526 { 2527 struct pci_dev *pdev; 2528 2529 if (!dev_is_pci(dev)) 2530 return false; 2531 2532 pdev = to_pci_dev(dev); 2533 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2534 return true; 2535 else 2536 return false; 2537 } 2538 2539 /* 2540 * Return the required default domain type for a specific device. 2541 * 2542 * @dev: the device in query 2543 * @startup: true if this is during early boot 2544 * 2545 * Returns: 2546 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2547 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2548 * - 0: both identity and dynamic domains work for this device 2549 */ 2550 static int device_def_domain_type(struct device *dev) 2551 { 2552 if (dev_is_pci(dev)) { 2553 struct pci_dev *pdev = to_pci_dev(dev); 2554 2555 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2556 return IOMMU_DOMAIN_IDENTITY; 2557 2558 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2559 return IOMMU_DOMAIN_IDENTITY; 2560 } 2561 2562 return 0; 2563 } 2564 2565 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2566 { 2567 /* 2568 * Start from the sane iommu hardware state. 2569 * If the queued invalidation is already initialized by us 2570 * (for example, while enabling interrupt-remapping) then 2571 * we got the things already rolling from a sane state. 2572 */ 2573 if (!iommu->qi) { 2574 /* 2575 * Clear any previous faults. 2576 */ 2577 dmar_fault(-1, iommu); 2578 /* 2579 * Disable queued invalidation if supported and already enabled 2580 * before OS handover. 
2581 */ 2582 dmar_disable_qi(iommu); 2583 } 2584 2585 if (dmar_enable_qi(iommu)) { 2586 /* 2587 * Queued Invalidate not enabled, use Register Based Invalidate 2588 */ 2589 iommu->flush.flush_context = __iommu_flush_context; 2590 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2591 pr_info("%s: Using Register based invalidation\n", 2592 iommu->name); 2593 } else { 2594 iommu->flush.flush_context = qi_flush_context; 2595 iommu->flush.flush_iotlb = qi_flush_iotlb; 2596 pr_info("%s: Using Queued invalidation\n", iommu->name); 2597 } 2598 } 2599 2600 static int copy_context_table(struct intel_iommu *iommu, 2601 struct root_entry *old_re, 2602 struct context_entry **tbl, 2603 int bus, bool ext) 2604 { 2605 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2606 struct context_entry *new_ce = NULL, ce; 2607 struct context_entry *old_ce = NULL; 2608 struct root_entry re; 2609 phys_addr_t old_ce_phys; 2610 2611 tbl_idx = ext ? bus * 2 : bus; 2612 memcpy(&re, old_re, sizeof(re)); 2613 2614 for (devfn = 0; devfn < 256; devfn++) { 2615 /* First calculate the correct index */ 2616 idx = (ext ? devfn * 2 : devfn) % 256; 2617 2618 if (idx == 0) { 2619 /* First save what we may have and clean up */ 2620 if (new_ce) { 2621 tbl[tbl_idx] = new_ce; 2622 __iommu_flush_cache(iommu, new_ce, 2623 VTD_PAGE_SIZE); 2624 pos = 1; 2625 } 2626 2627 if (old_ce) 2628 memunmap(old_ce); 2629 2630 ret = 0; 2631 if (devfn < 0x80) 2632 old_ce_phys = root_entry_lctp(&re); 2633 else 2634 old_ce_phys = root_entry_uctp(&re); 2635 2636 if (!old_ce_phys) { 2637 if (ext && devfn == 0) { 2638 /* No LCTP, try UCTP */ 2639 devfn = 0x7f; 2640 continue; 2641 } else { 2642 goto out; 2643 } 2644 } 2645 2646 ret = -ENOMEM; 2647 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2648 MEMREMAP_WB); 2649 if (!old_ce) 2650 goto out; 2651 2652 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); 2653 if (!new_ce) 2654 goto out_unmap; 2655 2656 ret = 0; 2657 } 2658 2659 /* Now copy the context entry */ 2660 memcpy(&ce, old_ce + idx, sizeof(ce)); 2661 2662 if (!context_present(&ce)) 2663 continue; 2664 2665 did = context_domain_id(&ce); 2666 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2667 set_bit(did, iommu->domain_ids); 2668 2669 set_context_copied(iommu, bus, devfn); 2670 new_ce[idx] = ce; 2671 } 2672 2673 tbl[tbl_idx + pos] = new_ce; 2674 2675 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2676 2677 out_unmap: 2678 memunmap(old_ce); 2679 2680 out: 2681 return ret; 2682 } 2683 2684 static int copy_translation_tables(struct intel_iommu *iommu) 2685 { 2686 struct context_entry **ctxt_tbls; 2687 struct root_entry *old_rt; 2688 phys_addr_t old_rt_phys; 2689 int ctxt_table_entries; 2690 u64 rtaddr_reg; 2691 int bus, ret; 2692 bool new_ext, ext; 2693 2694 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2695 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 2696 new_ext = !!sm_supported(iommu); 2697 2698 /* 2699 * The RTT bit can only be changed when translation is disabled, 2700 * but disabling translation means to open a window for data 2701 * corruption. So bail out and don't copy anything if we would 2702 * have to change the bit. 
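 *
 * (Here "ext" reflects whether the old kernel's root table is in
 * scalable-mode format, as read back from DMAR_RTADDR_REG, while
 * "new_ext" reflects whether this kernel would use that format; the
 * copy is only attempted when the two agree.)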
2703 */ 2704 if (new_ext != ext) 2705 return -EINVAL; 2706 2707 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); 2708 if (!iommu->copied_tables) 2709 return -ENOMEM; 2710 2711 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2712 if (!old_rt_phys) 2713 return -EINVAL; 2714 2715 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2716 if (!old_rt) 2717 return -ENOMEM; 2718 2719 /* This is too big for the stack - allocate it from slab */ 2720 ctxt_table_entries = ext ? 512 : 256; 2721 ret = -ENOMEM; 2722 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2723 if (!ctxt_tbls) 2724 goto out_unmap; 2725 2726 for (bus = 0; bus < 256; bus++) { 2727 ret = copy_context_table(iommu, &old_rt[bus], 2728 ctxt_tbls, bus, ext); 2729 if (ret) { 2730 pr_err("%s: Failed to copy context table for bus %d\n", 2731 iommu->name, bus); 2732 continue; 2733 } 2734 } 2735 2736 spin_lock(&iommu->lock); 2737 2738 /* Context tables are copied, now write them to the root_entry table */ 2739 for (bus = 0; bus < 256; bus++) { 2740 int idx = ext ? bus * 2 : bus; 2741 u64 val; 2742 2743 if (ctxt_tbls[idx]) { 2744 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2745 iommu->root_entry[bus].lo = val; 2746 } 2747 2748 if (!ext || !ctxt_tbls[idx + 1]) 2749 continue; 2750 2751 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2752 iommu->root_entry[bus].hi = val; 2753 } 2754 2755 spin_unlock(&iommu->lock); 2756 2757 kfree(ctxt_tbls); 2758 2759 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2760 2761 ret = 0; 2762 2763 out_unmap: 2764 memunmap(old_rt); 2765 2766 return ret; 2767 } 2768 2769 static int __init init_dmars(void) 2770 { 2771 struct dmar_drhd_unit *drhd; 2772 struct intel_iommu *iommu; 2773 int ret; 2774 2775 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 2776 if (ret) 2777 goto free_iommu; 2778 2779 for_each_iommu(iommu, drhd) { 2780 if (drhd->ignored) { 2781 iommu_disable_translation(iommu); 2782 continue; 2783 } 2784 2785 /* 2786 * Find the max pasid size of all IOMMU's in the system. 2787 * We need to ensure the system pasid table is no bigger 2788 * than the smallest supported. 2789 */ 2790 if (pasid_supported(iommu)) { 2791 u32 temp = 2 << ecap_pss(iommu->ecap); 2792 2793 intel_pasid_max_id = min_t(u32, temp, 2794 intel_pasid_max_id); 2795 } 2796 2797 intel_iommu_init_qi(iommu); 2798 2799 ret = iommu_init_domains(iommu); 2800 if (ret) 2801 goto free_iommu; 2802 2803 init_translation_status(iommu); 2804 2805 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 2806 iommu_disable_translation(iommu); 2807 clear_translation_pre_enabled(iommu); 2808 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 2809 iommu->name); 2810 } 2811 2812 /* 2813 * TBD: 2814 * we could share the same root & context tables 2815 * among all IOMMU's. Need to Split it later. 2816 */ 2817 ret = iommu_alloc_root_entry(iommu); 2818 if (ret) 2819 goto free_iommu; 2820 2821 if (translation_pre_enabled(iommu)) { 2822 pr_info("Translation already enabled - trying to copy translation structures\n"); 2823 2824 ret = copy_translation_tables(iommu); 2825 if (ret) { 2826 /* 2827 * We found the IOMMU with translation 2828 * enabled - but failed to copy over the 2829 * old root-entry table. Try to proceed 2830 * by disabling translation now and 2831 * allocating a clean root-entry table. 2832 * This might cause DMAR faults, but 2833 * probably the dump will still succeed. 
2834 */ 2835 pr_err("Failed to copy translation tables from previous kernel for %s\n", 2836 iommu->name); 2837 iommu_disable_translation(iommu); 2838 clear_translation_pre_enabled(iommu); 2839 } else { 2840 pr_info("Copied translation tables from previous kernel for %s\n", 2841 iommu->name); 2842 } 2843 } 2844 2845 if (!ecap_pass_through(iommu->ecap)) 2846 hw_pass_through = 0; 2847 intel_svm_check(iommu); 2848 } 2849 2850 /* 2851 * Now that qi is enabled on all iommus, set the root entry and flush 2852 * caches. This is required on some Intel X58 chipsets, otherwise the 2853 * flush_context function will loop forever and the boot hangs. 2854 */ 2855 for_each_active_iommu(iommu, drhd) { 2856 iommu_flush_write_buffer(iommu); 2857 iommu_set_root_entry(iommu); 2858 } 2859 2860 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 2861 dmar_map_gfx = 0; 2862 #endif 2863 2864 if (!dmar_map_gfx) 2865 iommu_identity_mapping |= IDENTMAP_GFX; 2866 2867 check_tylersburg_isoch(); 2868 2869 ret = si_domain_init(hw_pass_through); 2870 if (ret) 2871 goto free_iommu; 2872 2873 /* 2874 * for each drhd 2875 * enable fault log 2876 * global invalidate context cache 2877 * global invalidate iotlb 2878 * enable translation 2879 */ 2880 for_each_iommu(iommu, drhd) { 2881 if (drhd->ignored) { 2882 /* 2883 * we always have to disable PMRs or DMA may fail on 2884 * this device 2885 */ 2886 if (force_on) 2887 iommu_disable_protect_mem_regions(iommu); 2888 continue; 2889 } 2890 2891 iommu_flush_write_buffer(iommu); 2892 2893 #ifdef CONFIG_INTEL_IOMMU_SVM 2894 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 2895 /* 2896 * Call dmar_alloc_hwirq() with dmar_global_lock held, 2897 * could cause possible lock race condition. 2898 */ 2899 up_write(&dmar_global_lock); 2900 ret = intel_svm_enable_prq(iommu); 2901 down_write(&dmar_global_lock); 2902 if (ret) 2903 goto free_iommu; 2904 } 2905 #endif 2906 ret = dmar_set_interrupt(iommu); 2907 if (ret) 2908 goto free_iommu; 2909 } 2910 2911 return 0; 2912 2913 free_iommu: 2914 for_each_active_iommu(iommu, drhd) { 2915 disable_dmar_iommu(iommu); 2916 free_dmar_iommu(iommu); 2917 } 2918 if (si_domain) { 2919 domain_exit(si_domain); 2920 si_domain = NULL; 2921 } 2922 2923 return ret; 2924 } 2925 2926 static void __init init_no_remapping_devices(void) 2927 { 2928 struct dmar_drhd_unit *drhd; 2929 struct device *dev; 2930 int i; 2931 2932 for_each_drhd_unit(drhd) { 2933 if (!drhd->include_all) { 2934 for_each_active_dev_scope(drhd->devices, 2935 drhd->devices_cnt, i, dev) 2936 break; 2937 /* ignore DMAR unit if no devices exist */ 2938 if (i == drhd->devices_cnt) 2939 drhd->ignored = 1; 2940 } 2941 } 2942 2943 for_each_active_drhd_unit(drhd) { 2944 if (drhd->include_all) 2945 continue; 2946 2947 for_each_active_dev_scope(drhd->devices, 2948 drhd->devices_cnt, i, dev) 2949 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 2950 break; 2951 if (i < drhd->devices_cnt) 2952 continue; 2953 2954 /* This IOMMU has *only* gfx devices. 
Either bypass it or 2955 set the gfx_mapped flag, as appropriate */ 2956 drhd->gfx_dedicated = 1; 2957 if (!dmar_map_gfx) 2958 drhd->ignored = 1; 2959 } 2960 } 2961 2962 #ifdef CONFIG_SUSPEND 2963 static int init_iommu_hw(void) 2964 { 2965 struct dmar_drhd_unit *drhd; 2966 struct intel_iommu *iommu = NULL; 2967 int ret; 2968 2969 for_each_active_iommu(iommu, drhd) { 2970 if (iommu->qi) { 2971 ret = dmar_reenable_qi(iommu); 2972 if (ret) 2973 return ret; 2974 } 2975 } 2976 2977 for_each_iommu(iommu, drhd) { 2978 if (drhd->ignored) { 2979 /* 2980 * we always have to disable PMRs or DMA may fail on 2981 * this device 2982 */ 2983 if (force_on) 2984 iommu_disable_protect_mem_regions(iommu); 2985 continue; 2986 } 2987 2988 iommu_flush_write_buffer(iommu); 2989 iommu_set_root_entry(iommu); 2990 iommu_enable_translation(iommu); 2991 iommu_disable_protect_mem_regions(iommu); 2992 } 2993 2994 return 0; 2995 } 2996 2997 static void iommu_flush_all(void) 2998 { 2999 struct dmar_drhd_unit *drhd; 3000 struct intel_iommu *iommu; 3001 3002 for_each_active_iommu(iommu, drhd) { 3003 iommu->flush.flush_context(iommu, 0, 0, 0, 3004 DMA_CCMD_GLOBAL_INVL); 3005 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3006 DMA_TLB_GLOBAL_FLUSH); 3007 } 3008 } 3009 3010 static int iommu_suspend(void) 3011 { 3012 struct dmar_drhd_unit *drhd; 3013 struct intel_iommu *iommu = NULL; 3014 unsigned long flag; 3015 3016 iommu_flush_all(); 3017 3018 for_each_active_iommu(iommu, drhd) { 3019 iommu_disable_translation(iommu); 3020 3021 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3022 3023 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3024 readl(iommu->reg + DMAR_FECTL_REG); 3025 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3026 readl(iommu->reg + DMAR_FEDATA_REG); 3027 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3028 readl(iommu->reg + DMAR_FEADDR_REG); 3029 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3030 readl(iommu->reg + DMAR_FEUADDR_REG); 3031 3032 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3033 } 3034 return 0; 3035 } 3036 3037 static void iommu_resume(void) 3038 { 3039 struct dmar_drhd_unit *drhd; 3040 struct intel_iommu *iommu = NULL; 3041 unsigned long flag; 3042 3043 if (init_iommu_hw()) { 3044 if (force_on) 3045 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3046 else 3047 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3048 return; 3049 } 3050 3051 for_each_active_iommu(iommu, drhd) { 3052 3053 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3054 3055 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3056 iommu->reg + DMAR_FECTL_REG); 3057 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3058 iommu->reg + DMAR_FEDATA_REG); 3059 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3060 iommu->reg + DMAR_FEADDR_REG); 3061 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3062 iommu->reg + DMAR_FEUADDR_REG); 3063 3064 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3065 } 3066 } 3067 3068 static struct syscore_ops iommu_syscore_ops = { 3069 .resume = iommu_resume, 3070 .suspend = iommu_suspend, 3071 }; 3072 3073 static void __init init_iommu_pm_ops(void) 3074 { 3075 register_syscore_ops(&iommu_syscore_ops); 3076 } 3077 3078 #else 3079 static inline void init_iommu_pm_ops(void) {} 3080 #endif /* CONFIG_PM */ 3081 3082 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3083 { 3084 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3085 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3086 rmrr->end_address <= rmrr->base_address || 3087 arch_rmrr_sanity_check(rmrr)) 3088 return 
-EINVAL; 3089 3090 return 0; 3091 } 3092 3093 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3094 { 3095 struct acpi_dmar_reserved_memory *rmrr; 3096 struct dmar_rmrr_unit *rmrru; 3097 3098 rmrr = (struct acpi_dmar_reserved_memory *)header; 3099 if (rmrr_sanity_check(rmrr)) { 3100 pr_warn(FW_BUG 3101 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3103 rmrr->base_address, rmrr->end_address, 3104 dmi_get_system_info(DMI_BIOS_VENDOR), 3105 dmi_get_system_info(DMI_BIOS_VERSION), 3106 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3107 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3108 } 3109 3110 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3111 if (!rmrru) 3112 goto out; 3113 3114 rmrru->hdr = header; 3115 3116 rmrru->base_address = rmrr->base_address; 3117 rmrru->end_address = rmrr->end_address; 3118 3119 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3120 ((void *)rmrr) + rmrr->header.length, 3121 &rmrru->devices_cnt); 3122 if (rmrru->devices_cnt && rmrru->devices == NULL) 3123 goto free_rmrru; 3124 3125 list_add(&rmrru->list, &dmar_rmrr_units); 3126 3127 return 0; 3128 free_rmrru: 3129 kfree(rmrru); 3130 out: 3131 return -ENOMEM; 3132 } 3133 3134 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3135 { 3136 struct dmar_atsr_unit *atsru; 3137 struct acpi_dmar_atsr *tmp; 3138 3139 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3140 dmar_rcu_check()) { 3141 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3142 if (atsr->segment != tmp->segment) 3143 continue; 3144 if (atsr->header.length != tmp->header.length) 3145 continue; 3146 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3147 return atsru; 3148 } 3149 3150 return NULL; 3151 } 3152 3153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3154 { 3155 struct acpi_dmar_atsr *atsr; 3156 struct dmar_atsr_unit *atsru; 3157 3158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3159 return 0; 3160 3161 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3162 atsru = dmar_find_atsr(atsr); 3163 if (atsru) 3164 return 0; 3165 3166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3167 if (!atsru) 3168 return -ENOMEM; 3169 3170 /* 3171 * If memory is allocated from slab by ACPI _DSM method, we need to 3172 * copy the memory content because the memory buffer will be freed 3173 * on return. 
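 *
 * (That is why the header is duplicated into the tail of the atsru
 * allocation below instead of being referenced in place.)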
3174 */ 3175 atsru->hdr = (void *)(atsru + 1); 3176 memcpy(atsru->hdr, hdr, hdr->length); 3177 atsru->include_all = atsr->flags & 0x1; 3178 if (!atsru->include_all) { 3179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3180 (void *)atsr + atsr->header.length, 3181 &atsru->devices_cnt); 3182 if (atsru->devices_cnt && atsru->devices == NULL) { 3183 kfree(atsru); 3184 return -ENOMEM; 3185 } 3186 } 3187 3188 list_add_rcu(&atsru->list, &dmar_atsr_units); 3189 3190 return 0; 3191 } 3192 3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3194 { 3195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3196 kfree(atsru); 3197 } 3198 3199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3200 { 3201 struct acpi_dmar_atsr *atsr; 3202 struct dmar_atsr_unit *atsru; 3203 3204 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3205 atsru = dmar_find_atsr(atsr); 3206 if (atsru) { 3207 list_del_rcu(&atsru->list); 3208 synchronize_rcu(); 3209 intel_iommu_free_atsr(atsru); 3210 } 3211 3212 return 0; 3213 } 3214 3215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3216 { 3217 int i; 3218 struct device *dev; 3219 struct acpi_dmar_atsr *atsr; 3220 struct dmar_atsr_unit *atsru; 3221 3222 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3223 atsru = dmar_find_atsr(atsr); 3224 if (!atsru) 3225 return 0; 3226 3227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3229 i, dev) 3230 return -EBUSY; 3231 } 3232 3233 return 0; 3234 } 3235 3236 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3237 { 3238 struct dmar_satc_unit *satcu; 3239 struct acpi_dmar_satc *tmp; 3240 3241 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3242 dmar_rcu_check()) { 3243 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3244 if (satc->segment != tmp->segment) 3245 continue; 3246 if (satc->header.length != tmp->header.length) 3247 continue; 3248 if (memcmp(satc, tmp, satc->header.length) == 0) 3249 return satcu; 3250 } 3251 3252 return NULL; 3253 } 3254 3255 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3256 { 3257 struct acpi_dmar_satc *satc; 3258 struct dmar_satc_unit *satcu; 3259 3260 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3261 return 0; 3262 3263 satc = container_of(hdr, struct acpi_dmar_satc, header); 3264 satcu = dmar_find_satc(satc); 3265 if (satcu) 3266 return 0; 3267 3268 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3269 if (!satcu) 3270 return -ENOMEM; 3271 3272 satcu->hdr = (void *)(satcu + 1); 3273 memcpy(satcu->hdr, hdr, hdr->length); 3274 satcu->atc_required = satc->flags & 0x1; 3275 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3276 (void *)satc + satc->header.length, 3277 &satcu->devices_cnt); 3278 if (satcu->devices_cnt && !satcu->devices) { 3279 kfree(satcu); 3280 return -ENOMEM; 3281 } 3282 list_add_rcu(&satcu->list, &dmar_satc_units); 3283 3284 return 0; 3285 } 3286 3287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3288 { 3289 int sp, ret; 3290 struct intel_iommu *iommu = dmaru->iommu; 3291 3292 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3293 if (ret) 3294 goto out; 3295 3296 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3297 pr_warn("%s: Doesn't support hardware pass through.\n", 3298 iommu->name); 3299 return -ENXIO; 3300 } 3301 3302 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3303 if (sp >= 0 && 
!(cap_super_page_val(iommu->cap) & (1 << sp))) { 3304 pr_warn("%s: Doesn't support large page.\n", 3305 iommu->name); 3306 return -ENXIO; 3307 } 3308 3309 /* 3310 * Disable translation if already enabled prior to OS handover. 3311 */ 3312 if (iommu->gcmd & DMA_GCMD_TE) 3313 iommu_disable_translation(iommu); 3314 3315 ret = iommu_init_domains(iommu); 3316 if (ret == 0) 3317 ret = iommu_alloc_root_entry(iommu); 3318 if (ret) 3319 goto out; 3320 3321 intel_svm_check(iommu); 3322 3323 if (dmaru->ignored) { 3324 /* 3325 * we always have to disable PMRs or DMA may fail on this device 3326 */ 3327 if (force_on) 3328 iommu_disable_protect_mem_regions(iommu); 3329 return 0; 3330 } 3331 3332 intel_iommu_init_qi(iommu); 3333 iommu_flush_write_buffer(iommu); 3334 3335 #ifdef CONFIG_INTEL_IOMMU_SVM 3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3337 ret = intel_svm_enable_prq(iommu); 3338 if (ret) 3339 goto disable_iommu; 3340 } 3341 #endif 3342 ret = dmar_set_interrupt(iommu); 3343 if (ret) 3344 goto disable_iommu; 3345 3346 iommu_set_root_entry(iommu); 3347 iommu_enable_translation(iommu); 3348 3349 iommu_disable_protect_mem_regions(iommu); 3350 return 0; 3351 3352 disable_iommu: 3353 disable_dmar_iommu(iommu); 3354 out: 3355 free_dmar_iommu(iommu); 3356 return ret; 3357 } 3358 3359 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3360 { 3361 int ret = 0; 3362 struct intel_iommu *iommu = dmaru->iommu; 3363 3364 if (!intel_iommu_enabled) 3365 return 0; 3366 if (iommu == NULL) 3367 return -EINVAL; 3368 3369 if (insert) { 3370 ret = intel_iommu_add(dmaru); 3371 } else { 3372 disable_dmar_iommu(iommu); 3373 free_dmar_iommu(iommu); 3374 } 3375 3376 return ret; 3377 } 3378 3379 static void intel_iommu_free_dmars(void) 3380 { 3381 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3382 struct dmar_atsr_unit *atsru, *atsr_n; 3383 struct dmar_satc_unit *satcu, *satc_n; 3384 3385 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3386 list_del(&rmrru->list); 3387 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3388 kfree(rmrru); 3389 } 3390 3391 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3392 list_del(&atsru->list); 3393 intel_iommu_free_atsr(atsru); 3394 } 3395 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3396 list_del(&satcu->list); 3397 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3398 kfree(satcu); 3399 } 3400 } 3401 3402 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3403 { 3404 struct dmar_satc_unit *satcu; 3405 struct acpi_dmar_satc *satc; 3406 struct device *tmp; 3407 int i; 3408 3409 dev = pci_physfn(dev); 3410 rcu_read_lock(); 3411 3412 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3413 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3414 if (satc->segment != pci_domain_nr(dev->bus)) 3415 continue; 3416 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3417 if (to_pci_dev(tmp) == dev) 3418 goto out; 3419 } 3420 satcu = NULL; 3421 out: 3422 rcu_read_unlock(); 3423 return satcu; 3424 } 3425 3426 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3427 { 3428 int i, ret = 1; 3429 struct pci_bus *bus; 3430 struct pci_dev *bridge = NULL; 3431 struct device *tmp; 3432 struct acpi_dmar_atsr *atsr; 3433 struct dmar_atsr_unit *atsru; 3434 struct dmar_satc_unit *satcu; 3435 3436 dev = pci_physfn(dev); 3437 satcu = dmar_find_matched_satc_unit(dev); 3438 if (satcu) 3439 /* 3440 * This device supports ATS as it is in 
SATC table. 3441 * When IOMMU is in legacy mode, enabling ATS is done 3442 * automatically by HW for the device that requires 3443 * ATS, hence OS should not enable this device ATS 3444 * to avoid duplicated TLB invalidation. 3445 */ 3446 return !(satcu->atc_required && !sm_supported(iommu)); 3447 3448 for (bus = dev->bus; bus; bus = bus->parent) { 3449 bridge = bus->self; 3450 /* If it's an integrated device, allow ATS */ 3451 if (!bridge) 3452 return 1; 3453 /* Connected via non-PCIe: no ATS */ 3454 if (!pci_is_pcie(bridge) || 3455 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3456 return 0; 3457 /* If we found the root port, look it up in the ATSR */ 3458 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3459 break; 3460 } 3461 3462 rcu_read_lock(); 3463 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3464 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3465 if (atsr->segment != pci_domain_nr(dev->bus)) 3466 continue; 3467 3468 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3469 if (tmp == &bridge->dev) 3470 goto out; 3471 3472 if (atsru->include_all) 3473 goto out; 3474 } 3475 ret = 0; 3476 out: 3477 rcu_read_unlock(); 3478 3479 return ret; 3480 } 3481 3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3483 { 3484 int ret; 3485 struct dmar_rmrr_unit *rmrru; 3486 struct dmar_atsr_unit *atsru; 3487 struct dmar_satc_unit *satcu; 3488 struct acpi_dmar_atsr *atsr; 3489 struct acpi_dmar_reserved_memory *rmrr; 3490 struct acpi_dmar_satc *satc; 3491 3492 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3493 return 0; 3494 3495 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3496 rmrr = container_of(rmrru->hdr, 3497 struct acpi_dmar_reserved_memory, header); 3498 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3499 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3500 ((void *)rmrr) + rmrr->header.length, 3501 rmrr->segment, rmrru->devices, 3502 rmrru->devices_cnt); 3503 if (ret < 0) 3504 return ret; 3505 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3506 dmar_remove_dev_scope(info, rmrr->segment, 3507 rmrru->devices, rmrru->devices_cnt); 3508 } 3509 } 3510 3511 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3512 if (atsru->include_all) 3513 continue; 3514 3515 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3516 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3517 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3518 (void *)atsr + atsr->header.length, 3519 atsr->segment, atsru->devices, 3520 atsru->devices_cnt); 3521 if (ret > 0) 3522 break; 3523 else if (ret < 0) 3524 return ret; 3525 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3526 if (dmar_remove_dev_scope(info, atsr->segment, 3527 atsru->devices, atsru->devices_cnt)) 3528 break; 3529 } 3530 } 3531 list_for_each_entry(satcu, &dmar_satc_units, list) { 3532 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3533 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3534 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3535 (void *)satc + satc->header.length, 3536 satc->segment, satcu->devices, 3537 satcu->devices_cnt); 3538 if (ret > 0) 3539 break; 3540 else if (ret < 0) 3541 return ret; 3542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3543 if (dmar_remove_dev_scope(info, satc->segment, 3544 satcu->devices, satcu->devices_cnt)) 3545 break; 3546 } 3547 } 3548 3549 return 0; 3550 } 3551 3552 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3553 unsigned long val, void *v) 3554 { 
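/*
 * Memory hotplug notifier for the static identity domain: extend the
 * 1:1 map when a range goes online, and unmap plus flush it when the
 * range goes offline or the online operation is cancelled.
 */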
3555 struct memory_notify *mhp = v; 3556 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); 3557 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + 3558 mhp->nr_pages - 1); 3559 3560 switch (val) { 3561 case MEM_GOING_ONLINE: 3562 if (iommu_domain_identity_map(si_domain, 3563 start_vpfn, last_vpfn)) { 3564 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3565 start_vpfn, last_vpfn); 3566 return NOTIFY_BAD; 3567 } 3568 break; 3569 3570 case MEM_OFFLINE: 3571 case MEM_CANCEL_ONLINE: 3572 { 3573 struct dmar_drhd_unit *drhd; 3574 struct intel_iommu *iommu; 3575 LIST_HEAD(freelist); 3576 3577 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3578 3579 rcu_read_lock(); 3580 for_each_active_iommu(iommu, drhd) 3581 iommu_flush_iotlb_psi(iommu, si_domain, 3582 start_vpfn, mhp->nr_pages, 3583 list_empty(&freelist), 0); 3584 rcu_read_unlock(); 3585 put_pages_list(&freelist); 3586 } 3587 break; 3588 } 3589 3590 return NOTIFY_OK; 3591 } 3592 3593 static struct notifier_block intel_iommu_memory_nb = { 3594 .notifier_call = intel_iommu_memory_notifier, 3595 .priority = 0 3596 }; 3597 3598 static void intel_disable_iommus(void) 3599 { 3600 struct intel_iommu *iommu = NULL; 3601 struct dmar_drhd_unit *drhd; 3602 3603 for_each_iommu(iommu, drhd) 3604 iommu_disable_translation(iommu); 3605 } 3606 3607 void intel_iommu_shutdown(void) 3608 { 3609 struct dmar_drhd_unit *drhd; 3610 struct intel_iommu *iommu = NULL; 3611 3612 if (no_iommu || dmar_disabled) 3613 return; 3614 3615 down_write(&dmar_global_lock); 3616 3617 /* Disable PMRs explicitly here. */ 3618 for_each_iommu(iommu, drhd) 3619 iommu_disable_protect_mem_regions(iommu); 3620 3621 /* Make sure the IOMMUs are switched off */ 3622 intel_disable_iommus(); 3623 3624 up_write(&dmar_global_lock); 3625 } 3626 3627 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3628 { 3629 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3630 3631 return container_of(iommu_dev, struct intel_iommu, iommu); 3632 } 3633 3634 static ssize_t version_show(struct device *dev, 3635 struct device_attribute *attr, char *buf) 3636 { 3637 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3638 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3639 return sysfs_emit(buf, "%d:%d\n", 3640 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3641 } 3642 static DEVICE_ATTR_RO(version); 3643 3644 static ssize_t address_show(struct device *dev, 3645 struct device_attribute *attr, char *buf) 3646 { 3647 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3648 return sysfs_emit(buf, "%llx\n", iommu->reg_phys); 3649 } 3650 static DEVICE_ATTR_RO(address); 3651 3652 static ssize_t cap_show(struct device *dev, 3653 struct device_attribute *attr, char *buf) 3654 { 3655 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3656 return sysfs_emit(buf, "%llx\n", iommu->cap); 3657 } 3658 static DEVICE_ATTR_RO(cap); 3659 3660 static ssize_t ecap_show(struct device *dev, 3661 struct device_attribute *attr, char *buf) 3662 { 3663 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3664 return sysfs_emit(buf, "%llx\n", iommu->ecap); 3665 } 3666 static DEVICE_ATTR_RO(ecap); 3667 3668 static ssize_t domains_supported_show(struct device *dev, 3669 struct device_attribute *attr, char *buf) 3670 { 3671 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3672 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); 3673 } 3674 static DEVICE_ATTR_RO(domains_supported); 3675 3676 static ssize_t domains_used_show(struct device *dev, 3677 struct 
device_attribute *attr, char *buf) 3678 { 3679 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3680 return sysfs_emit(buf, "%d\n", 3681 bitmap_weight(iommu->domain_ids, 3682 cap_ndoms(iommu->cap))); 3683 } 3684 static DEVICE_ATTR_RO(domains_used); 3685 3686 static struct attribute *intel_iommu_attrs[] = { 3687 &dev_attr_version.attr, 3688 &dev_attr_address.attr, 3689 &dev_attr_cap.attr, 3690 &dev_attr_ecap.attr, 3691 &dev_attr_domains_supported.attr, 3692 &dev_attr_domains_used.attr, 3693 NULL, 3694 }; 3695 3696 static struct attribute_group intel_iommu_group = { 3697 .name = "intel-iommu", 3698 .attrs = intel_iommu_attrs, 3699 }; 3700 3701 const struct attribute_group *intel_iommu_groups[] = { 3702 &intel_iommu_group, 3703 NULL, 3704 }; 3705 3706 static inline bool has_external_pci(void) 3707 { 3708 struct pci_dev *pdev = NULL; 3709 3710 for_each_pci_dev(pdev) 3711 if (pdev->external_facing) { 3712 pci_dev_put(pdev); 3713 return true; 3714 } 3715 3716 return false; 3717 } 3718 3719 static int __init platform_optin_force_iommu(void) 3720 { 3721 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3722 return 0; 3723 3724 if (no_iommu || dmar_disabled) 3725 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3726 3727 /* 3728 * If Intel-IOMMU is disabled by default, we will apply identity 3729 * map for all devices except those marked as being untrusted. 3730 */ 3731 if (dmar_disabled) 3732 iommu_set_default_passthrough(false); 3733 3734 dmar_disabled = 0; 3735 no_iommu = 0; 3736 3737 return 1; 3738 } 3739 3740 static int __init probe_acpi_namespace_devices(void) 3741 { 3742 struct dmar_drhd_unit *drhd; 3743 /* To avoid a -Wunused-but-set-variable warning. */ 3744 struct intel_iommu *iommu __maybe_unused; 3745 struct device *dev; 3746 int i, ret = 0; 3747 3748 for_each_active_iommu(iommu, drhd) { 3749 for_each_active_dev_scope(drhd->devices, 3750 drhd->devices_cnt, i, dev) { 3751 struct acpi_device_physical_node *pn; 3752 struct acpi_device *adev; 3753 3754 if (dev->bus != &acpi_bus_type) 3755 continue; 3756 3757 adev = to_acpi_device(dev); 3758 mutex_lock(&adev->physical_node_lock); 3759 list_for_each_entry(pn, 3760 &adev->physical_node_list, node) { 3761 ret = iommu_probe_device(pn->dev); 3762 if (ret) 3763 break; 3764 } 3765 mutex_unlock(&adev->physical_node_lock); 3766 3767 if (ret) 3768 return ret; 3769 } 3770 } 3771 3772 return 0; 3773 } 3774 3775 static __init int tboot_force_iommu(void) 3776 { 3777 if (!tboot_enabled()) 3778 return 0; 3779 3780 if (no_iommu || dmar_disabled) 3781 pr_warn("Forcing Intel-IOMMU to enabled\n"); 3782 3783 dmar_disabled = 0; 3784 no_iommu = 0; 3785 3786 return 1; 3787 } 3788 3789 int __init intel_iommu_init(void) 3790 { 3791 int ret = -ENODEV; 3792 struct dmar_drhd_unit *drhd; 3793 struct intel_iommu *iommu; 3794 3795 /* 3796 * Intel IOMMU is required for a TXT/tboot launch or platform 3797 * opt in, so enforce that. 
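 *
 * (force_on also covers the DMAR platform opt-in path handled by
 * platform_optin_force_iommu() above: when the firmware sets the
 * opt-in flag and an external-facing PCI device is present, the
 * IOMMU is turned back on even if it was disabled by default or on
 * the command line.)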
3798 */ 3799 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 3800 platform_optin_force_iommu(); 3801 3802 down_write(&dmar_global_lock); 3803 if (dmar_table_init()) { 3804 if (force_on) 3805 panic("tboot: Failed to initialize DMAR table\n"); 3806 goto out_free_dmar; 3807 } 3808 3809 if (dmar_dev_scope_init() < 0) { 3810 if (force_on) 3811 panic("tboot: Failed to initialize DMAR device scope\n"); 3812 goto out_free_dmar; 3813 } 3814 3815 up_write(&dmar_global_lock); 3816 3817 /* 3818 * The bus notifier takes the dmar_global_lock, so lockdep will 3819 * complain later when we register it under the lock. 3820 */ 3821 dmar_register_bus_notifier(); 3822 3823 down_write(&dmar_global_lock); 3824 3825 if (!no_iommu) 3826 intel_iommu_debugfs_init(); 3827 3828 if (no_iommu || dmar_disabled) { 3829 /* 3830 * We exit the function here to ensure IOMMU's remapping and 3831 * mempool aren't setup, which means that the IOMMU's PMRs 3832 * won't be disabled via the call to init_dmars(). So disable 3833 * it explicitly here. The PMRs were setup by tboot prior to 3834 * calling SENTER, but the kernel is expected to reset/tear 3835 * down the PMRs. 3836 */ 3837 if (intel_iommu_tboot_noforce) { 3838 for_each_iommu(iommu, drhd) 3839 iommu_disable_protect_mem_regions(iommu); 3840 } 3841 3842 /* 3843 * Make sure the IOMMUs are switched off, even when we 3844 * boot into a kexec kernel and the previous kernel left 3845 * them enabled 3846 */ 3847 intel_disable_iommus(); 3848 goto out_free_dmar; 3849 } 3850 3851 if (list_empty(&dmar_rmrr_units)) 3852 pr_info("No RMRR found\n"); 3853 3854 if (list_empty(&dmar_atsr_units)) 3855 pr_info("No ATSR found\n"); 3856 3857 if (list_empty(&dmar_satc_units)) 3858 pr_info("No SATC found\n"); 3859 3860 init_no_remapping_devices(); 3861 3862 ret = init_dmars(); 3863 if (ret) { 3864 if (force_on) 3865 panic("tboot: Failed to initialize DMARs\n"); 3866 pr_err("Initialization failed\n"); 3867 goto out_free_dmar; 3868 } 3869 up_write(&dmar_global_lock); 3870 3871 init_iommu_pm_ops(); 3872 3873 down_read(&dmar_global_lock); 3874 for_each_active_iommu(iommu, drhd) { 3875 /* 3876 * The flush queue implementation does not perform 3877 * page-selective invalidations that are required for efficient 3878 * TLB flushes in virtual environments. The benefit of batching 3879 * is likely to be much lower than the overhead of synchronizing 3880 * the virtual and physical IOMMU page-tables. 3881 */ 3882 if (cap_caching_mode(iommu->cap) && 3883 !first_level_by_default(IOMMU_DOMAIN_DMA)) { 3884 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 3885 iommu_set_dma_strict(); 3886 } 3887 iommu_device_sysfs_add(&iommu->iommu, NULL, 3888 intel_iommu_groups, 3889 "%s", iommu->name); 3890 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 3891 3892 iommu_pmu_register(iommu); 3893 } 3894 up_read(&dmar_global_lock); 3895 3896 if (si_domain && !hw_pass_through) 3897 register_memory_notifier(&intel_iommu_memory_nb); 3898 3899 down_read(&dmar_global_lock); 3900 if (probe_acpi_namespace_devices()) 3901 pr_warn("ACPI name space devices didn't probe correctly\n"); 3902 3903 /* Finally, we enable the DMA remapping hardware. 
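 * Translation is only turned on for units that are not ignored and
 * were not already enabled before handover; PMRs are disabled for
 * every unit either way.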
*/ 3904 for_each_iommu(iommu, drhd) { 3905 if (!drhd->ignored && !translation_pre_enabled(iommu)) 3906 iommu_enable_translation(iommu); 3907 3908 iommu_disable_protect_mem_regions(iommu); 3909 } 3910 up_read(&dmar_global_lock); 3911 3912 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 3913 3914 intel_iommu_enabled = 1; 3915 3916 return 0; 3917 3918 out_free_dmar: 3919 intel_iommu_free_dmars(); 3920 up_write(&dmar_global_lock); 3921 return ret; 3922 } 3923 3924 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 3925 { 3926 struct device_domain_info *info = opaque; 3927 3928 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 3929 return 0; 3930 } 3931 3932 /* 3933 * NB - intel-iommu lacks any sort of reference counting for the users of 3934 * dependent devices. If multiple endpoints have intersecting dependent 3935 * devices, unbinding the driver from any one of them will possibly leave 3936 * the others unable to operate. 3937 */ 3938 static void domain_context_clear(struct device_domain_info *info) 3939 { 3940 if (!dev_is_pci(info->dev)) { 3941 domain_context_clear_one(info, info->bus, info->devfn); 3942 return; 3943 } 3944 3945 pci_for_each_dma_alias(to_pci_dev(info->dev), 3946 &domain_context_clear_one_cb, info); 3947 } 3948 3949 static void dmar_remove_one_dev_info(struct device *dev) 3950 { 3951 struct device_domain_info *info = dev_iommu_priv_get(dev); 3952 struct dmar_domain *domain = info->domain; 3953 struct intel_iommu *iommu = info->iommu; 3954 unsigned long flags; 3955 3956 if (!dev_is_real_dma_subdevice(info->dev)) { 3957 if (dev_is_pci(info->dev) && sm_supported(iommu)) 3958 intel_pasid_tear_down_entry(iommu, info->dev, 3959 IOMMU_NO_PASID, false); 3960 3961 iommu_disable_pci_caps(info); 3962 domain_context_clear(info); 3963 } 3964 3965 spin_lock_irqsave(&domain->lock, flags); 3966 list_del(&info->link); 3967 spin_unlock_irqrestore(&domain->lock, flags); 3968 3969 domain_detach_iommu(domain, iommu); 3970 info->domain = NULL; 3971 } 3972 3973 /* 3974 * Clear the page table pointer in context or pasid table entries so that 3975 * all DMA requests without PASID from the device are blocked. If the page 3976 * table has been set, clean up the data structures. 
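 *
 * (In scalable mode this tears down the PASID-table entry used for
 * requests without PASID; in legacy mode the context entry itself is
 * cleared. Real DMA sub-devices skip that step entirely.)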
3977 */ 3978 static void device_block_translation(struct device *dev) 3979 { 3980 struct device_domain_info *info = dev_iommu_priv_get(dev); 3981 struct intel_iommu *iommu = info->iommu; 3982 unsigned long flags; 3983 3984 iommu_disable_pci_caps(info); 3985 if (!dev_is_real_dma_subdevice(dev)) { 3986 if (sm_supported(iommu)) 3987 intel_pasid_tear_down_entry(iommu, dev, 3988 IOMMU_NO_PASID, false); 3989 else 3990 domain_context_clear(info); 3991 } 3992 3993 if (!info->domain) 3994 return; 3995 3996 spin_lock_irqsave(&info->domain->lock, flags); 3997 list_del(&info->link); 3998 spin_unlock_irqrestore(&info->domain->lock, flags); 3999 4000 domain_detach_iommu(info->domain, iommu); 4001 info->domain = NULL; 4002 } 4003 4004 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4005 { 4006 int adjust_width; 4007 4008 /* calculate AGAW */ 4009 domain->gaw = guest_width; 4010 adjust_width = guestwidth_to_adjustwidth(guest_width); 4011 domain->agaw = width_to_agaw(adjust_width); 4012 4013 domain->iommu_coherency = false; 4014 domain->iommu_superpage = 0; 4015 domain->max_addr = 0; 4016 4017 /* always allocate the top pgd */ 4018 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); 4019 if (!domain->pgd) 4020 return -ENOMEM; 4021 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4022 return 0; 4023 } 4024 4025 static int blocking_domain_attach_dev(struct iommu_domain *domain, 4026 struct device *dev) 4027 { 4028 device_block_translation(dev); 4029 return 0; 4030 } 4031 4032 static struct iommu_domain blocking_domain = { 4033 .ops = &(const struct iommu_domain_ops) { 4034 .attach_dev = blocking_domain_attach_dev, 4035 .free = intel_iommu_domain_free 4036 } 4037 }; 4038 4039 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4040 { 4041 struct dmar_domain *dmar_domain; 4042 struct iommu_domain *domain; 4043 4044 switch (type) { 4045 case IOMMU_DOMAIN_BLOCKED: 4046 return &blocking_domain; 4047 case IOMMU_DOMAIN_DMA: 4048 case IOMMU_DOMAIN_UNMANAGED: 4049 dmar_domain = alloc_domain(type); 4050 if (!dmar_domain) { 4051 pr_err("Can't allocate dmar_domain\n"); 4052 return NULL; 4053 } 4054 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4055 pr_err("Domain initialization failed\n"); 4056 domain_exit(dmar_domain); 4057 return NULL; 4058 } 4059 4060 domain = &dmar_domain->domain; 4061 domain->geometry.aperture_start = 0; 4062 domain->geometry.aperture_end = 4063 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4064 domain->geometry.force_aperture = true; 4065 4066 return domain; 4067 case IOMMU_DOMAIN_IDENTITY: 4068 return &si_domain->domain; 4069 case IOMMU_DOMAIN_SVA: 4070 return intel_svm_domain_alloc(); 4071 default: 4072 return NULL; 4073 } 4074 4075 return NULL; 4076 } 4077 4078 static void intel_iommu_domain_free(struct iommu_domain *domain) 4079 { 4080 if (domain != &si_domain->domain && domain != &blocking_domain) 4081 domain_exit(to_dmar_domain(domain)); 4082 } 4083 4084 static int prepare_domain_attach_device(struct iommu_domain *domain, 4085 struct device *dev) 4086 { 4087 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4088 struct intel_iommu *iommu; 4089 int addr_width; 4090 4091 iommu = device_to_iommu(dev, NULL, NULL); 4092 if (!iommu) 4093 return -ENODEV; 4094 4095 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4096 return -EINVAL; 4097 4098 /* check if this iommu agaw is sufficient for max mapped address */ 4099 addr_width = agaw_to_width(iommu->agaw); 4100 if (addr_width > cap_mgaw(iommu->cap)) 4101 addr_width = 
cap_mgaw(iommu->cap); 4102 4103 if (dmar_domain->max_addr > (1LL << addr_width)) 4104 return -EINVAL; 4105 dmar_domain->gaw = addr_width; 4106 4107 /* 4108 * Knock out extra levels of page tables if necessary 4109 */ 4110 while (iommu->agaw < dmar_domain->agaw) { 4111 struct dma_pte *pte; 4112 4113 pte = dmar_domain->pgd; 4114 if (dma_pte_present(pte)) { 4115 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4116 free_pgtable_page(pte); 4117 } 4118 dmar_domain->agaw--; 4119 } 4120 4121 return 0; 4122 } 4123 4124 static int intel_iommu_attach_device(struct iommu_domain *domain, 4125 struct device *dev) 4126 { 4127 struct device_domain_info *info = dev_iommu_priv_get(dev); 4128 int ret; 4129 4130 if (info->domain) 4131 device_block_translation(dev); 4132 4133 ret = prepare_domain_attach_device(domain, dev); 4134 if (ret) 4135 return ret; 4136 4137 return dmar_domain_attach_device(to_dmar_domain(domain), dev); 4138 } 4139 4140 static int intel_iommu_map(struct iommu_domain *domain, 4141 unsigned long iova, phys_addr_t hpa, 4142 size_t size, int iommu_prot, gfp_t gfp) 4143 { 4144 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4145 u64 max_addr; 4146 int prot = 0; 4147 4148 if (iommu_prot & IOMMU_READ) 4149 prot |= DMA_PTE_READ; 4150 if (iommu_prot & IOMMU_WRITE) 4151 prot |= DMA_PTE_WRITE; 4152 if (dmar_domain->set_pte_snp) 4153 prot |= DMA_PTE_SNP; 4154 4155 max_addr = iova + size; 4156 if (dmar_domain->max_addr < max_addr) { 4157 u64 end; 4158 4159 /* check if minimum agaw is sufficient for mapped address */ 4160 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4161 if (end < max_addr) { 4162 pr_err("%s: iommu width (%d) is not " 4163 "sufficient for the mapped address (%llx)\n", 4164 __func__, dmar_domain->gaw, max_addr); 4165 return -EFAULT; 4166 } 4167 dmar_domain->max_addr = max_addr; 4168 } 4169 /* Round up size to next multiple of PAGE_SIZE, if it and 4170 the low bits of hpa would take us onto the next page */ 4171 size = aligned_nrpages(hpa, size); 4172 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4173 hpa >> VTD_PAGE_SHIFT, size, prot, gfp); 4174 } 4175 4176 static int intel_iommu_map_pages(struct iommu_domain *domain, 4177 unsigned long iova, phys_addr_t paddr, 4178 size_t pgsize, size_t pgcount, 4179 int prot, gfp_t gfp, size_t *mapped) 4180 { 4181 unsigned long pgshift = __ffs(pgsize); 4182 size_t size = pgcount << pgshift; 4183 int ret; 4184 4185 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4186 return -EINVAL; 4187 4188 if (!IS_ALIGNED(iova | paddr, pgsize)) 4189 return -EINVAL; 4190 4191 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4192 if (!ret && mapped) 4193 *mapped = size; 4194 4195 return ret; 4196 } 4197 4198 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4199 unsigned long iova, size_t size, 4200 struct iommu_iotlb_gather *gather) 4201 { 4202 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4203 unsigned long start_pfn, last_pfn; 4204 int level = 0; 4205 4206 /* Cope with horrid API which requires us to unmap more than the 4207 size argument if it happens to be a large-page mapping. 
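For example, if the IOVA falls inside a 2MiB superpage mapping, the whole 2MiB range is torn down and the returned size reflects that, not the size the caller asked for.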
*/ 4208 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 4209 &level, GFP_ATOMIC))) 4210 return 0; 4211 4212 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4213 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4214 4215 start_pfn = iova >> VTD_PAGE_SHIFT; 4216 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4217 4218 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4219 4220 if (dmar_domain->max_addr == iova + size) 4221 dmar_domain->max_addr = iova; 4222 4223 /* 4224 * We do not use page-selective IOTLB invalidation in flush queue, 4225 * so there is no need to track page and sync iotlb. 4226 */ 4227 if (!iommu_iotlb_gather_queued(gather)) 4228 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4229 4230 return size; 4231 } 4232 4233 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4234 unsigned long iova, 4235 size_t pgsize, size_t pgcount, 4236 struct iommu_iotlb_gather *gather) 4237 { 4238 unsigned long pgshift = __ffs(pgsize); 4239 size_t size = pgcount << pgshift; 4240 4241 return intel_iommu_unmap(domain, iova, size, gather); 4242 } 4243 4244 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4245 struct iommu_iotlb_gather *gather) 4246 { 4247 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4248 unsigned long iova_pfn = IOVA_PFN(gather->start); 4249 size_t size = gather->end - gather->start; 4250 struct iommu_domain_info *info; 4251 unsigned long start_pfn; 4252 unsigned long nrpages; 4253 unsigned long i; 4254 4255 nrpages = aligned_nrpages(gather->start, size); 4256 start_pfn = mm_to_dma_pfn_start(iova_pfn); 4257 4258 xa_for_each(&dmar_domain->iommu_array, i, info) 4259 iommu_flush_iotlb_psi(info->iommu, dmar_domain, 4260 start_pfn, nrpages, 4261 list_empty(&gather->freelist), 0); 4262 4263 put_pages_list(&gather->freelist); 4264 } 4265 4266 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4267 dma_addr_t iova) 4268 { 4269 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4270 struct dma_pte *pte; 4271 int level = 0; 4272 u64 phys = 0; 4273 4274 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, 4275 GFP_ATOMIC); 4276 if (pte && dma_pte_present(pte)) 4277 phys = dma_pte_addr(pte) + 4278 (iova & (BIT_MASK(level_to_offset_bits(level) + 4279 VTD_PAGE_SHIFT) - 1)); 4280 4281 return phys; 4282 } 4283 4284 static bool domain_support_force_snooping(struct dmar_domain *domain) 4285 { 4286 struct device_domain_info *info; 4287 bool support = true; 4288 4289 assert_spin_locked(&domain->lock); 4290 list_for_each_entry(info, &domain->devices, link) { 4291 if (!ecap_sc_support(info->iommu->ecap)) { 4292 support = false; 4293 break; 4294 } 4295 } 4296 4297 return support; 4298 } 4299 4300 static void domain_set_force_snooping(struct dmar_domain *domain) 4301 { 4302 struct device_domain_info *info; 4303 4304 assert_spin_locked(&domain->lock); 4305 /* 4306 * Second level page table supports per-PTE snoop control. The 4307 * iommu_map() interface will handle this by setting SNP bit. 
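* First-level page tables have no per-PTE snoop control, so for those domains snooping is instead enforced per PASID via intel_pasid_setup_page_snoop_control() below.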
4308 */ 4309 if (!domain->use_first_level) { 4310 domain->set_pte_snp = true; 4311 return; 4312 } 4313 4314 list_for_each_entry(info, &domain->devices, link) 4315 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4316 IOMMU_NO_PASID); 4317 } 4318 4319 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4320 { 4321 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4322 unsigned long flags; 4323 4324 if (dmar_domain->force_snooping) 4325 return true; 4326 4327 spin_lock_irqsave(&dmar_domain->lock, flags); 4328 if (!domain_support_force_snooping(dmar_domain) || 4329 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { 4330 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4331 return false; 4332 } 4333 4334 domain_set_force_snooping(dmar_domain); 4335 dmar_domain->force_snooping = true; 4336 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4337 4338 return true; 4339 } 4340 4341 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) 4342 { 4343 struct device_domain_info *info = dev_iommu_priv_get(dev); 4344 4345 switch (cap) { 4346 case IOMMU_CAP_CACHE_COHERENCY: 4347 case IOMMU_CAP_DEFERRED_FLUSH: 4348 return true; 4349 case IOMMU_CAP_PRE_BOOT_PROTECTION: 4350 return dmar_platform_optin(); 4351 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 4352 return ecap_sc_support(info->iommu->ecap); 4353 default: 4354 return false; 4355 } 4356 } 4357 4358 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4359 { 4360 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4361 struct device_domain_info *info; 4362 struct intel_iommu *iommu; 4363 u8 bus, devfn; 4364 int ret; 4365 4366 iommu = device_to_iommu(dev, &bus, &devfn); 4367 if (!iommu || !iommu->iommu.ops) 4368 return ERR_PTR(-ENODEV); 4369 4370 info = kzalloc(sizeof(*info), GFP_KERNEL); 4371 if (!info) 4372 return ERR_PTR(-ENOMEM); 4373 4374 if (dev_is_real_dma_subdevice(dev)) { 4375 info->bus = pdev->bus->number; 4376 info->devfn = pdev->devfn; 4377 info->segment = pci_domain_nr(pdev->bus); 4378 } else { 4379 info->bus = bus; 4380 info->devfn = devfn; 4381 info->segment = iommu->segment; 4382 } 4383 4384 info->dev = dev; 4385 info->iommu = iommu; 4386 if (dev_is_pci(dev)) { 4387 if (ecap_dev_iotlb_support(iommu->ecap) && 4388 pci_ats_supported(pdev) && 4389 dmar_ats_supported(pdev, iommu)) { 4390 info->ats_supported = 1; 4391 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); 4392 4393 /* 4394 * For IOMMU that supports device IOTLB throttling 4395 * (DIT), we assign PFSID to the invalidation desc 4396 * of a VF such that IOMMU HW can gauge queue depth 4397 * at PF level. If DIT is not set, PFSID will be 4398 * treated as reserved, which should be set to 0. 
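* That is what pci_physfn() and pci_dev_id() below compute: a VF is resolved to its PF and the PF's bus/devfn is packed into the 16-bit PFSID.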
4399 */ 4400 if (ecap_dit(iommu->ecap)) 4401 info->pfsid = pci_dev_id(pci_physfn(pdev)); 4402 info->ats_qdep = pci_ats_queue_depth(pdev); 4403 } 4404 if (sm_supported(iommu)) { 4405 if (pasid_supported(iommu)) { 4406 int features = pci_pasid_features(pdev); 4407 4408 if (features >= 0) 4409 info->pasid_supported = features | 1; 4410 } 4411 4412 if (info->ats_supported && ecap_prs(iommu->ecap) && 4413 pci_pri_supported(pdev)) 4414 info->pri_supported = 1; 4415 } 4416 } 4417 4418 dev_iommu_priv_set(dev, info); 4419 4420 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 4421 ret = intel_pasid_alloc_table(dev); 4422 if (ret) { 4423 dev_err(dev, "PASID table allocation failed\n"); 4424 dev_iommu_priv_set(dev, NULL); 4425 kfree(info); 4426 return ERR_PTR(ret); 4427 } 4428 } 4429 4430 return &iommu->iommu; 4431 } 4432 4433 static void intel_iommu_release_device(struct device *dev) 4434 { 4435 struct device_domain_info *info = dev_iommu_priv_get(dev); 4436 4437 dmar_remove_one_dev_info(dev); 4438 intel_pasid_free_table(dev); 4439 dev_iommu_priv_set(dev, NULL); 4440 kfree(info); 4441 set_dma_ops(dev, NULL); 4442 } 4443 4444 static void intel_iommu_probe_finalize(struct device *dev) 4445 { 4446 set_dma_ops(dev, NULL); 4447 iommu_setup_dma_ops(dev, 0, U64_MAX); 4448 } 4449 4450 static void intel_iommu_get_resv_regions(struct device *device, 4451 struct list_head *head) 4452 { 4453 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4454 struct iommu_resv_region *reg; 4455 struct dmar_rmrr_unit *rmrr; 4456 struct device *i_dev; 4457 int i; 4458 4459 rcu_read_lock(); 4460 for_each_rmrr_units(rmrr) { 4461 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4462 i, i_dev) { 4463 struct iommu_resv_region *resv; 4464 enum iommu_resv_type type; 4465 size_t length; 4466 4467 if (i_dev != device && 4468 !is_downstream_to_pci_bridge(device, i_dev)) 4469 continue; 4470 4471 length = rmrr->end_address - rmrr->base_address + 1; 4472 4473 type = device_rmrr_is_relaxable(device) ? 
4474 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4475 4476 resv = iommu_alloc_resv_region(rmrr->base_address, 4477 length, prot, type, 4478 GFP_ATOMIC); 4479 if (!resv) 4480 break; 4481 4482 list_add_tail(&resv->list, head); 4483 } 4484 } 4485 rcu_read_unlock(); 4486 4487 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4488 if (dev_is_pci(device)) { 4489 struct pci_dev *pdev = to_pci_dev(device); 4490 4491 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4492 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4493 IOMMU_RESV_DIRECT_RELAXABLE, 4494 GFP_KERNEL); 4495 if (reg) 4496 list_add_tail(&reg->list, head); 4497 } 4498 } 4499 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4500 4501 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4502 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4503 0, IOMMU_RESV_MSI, GFP_KERNEL); 4504 if (!reg) 4505 return; 4506 list_add_tail(&reg->list, head); 4507 } 4508 4509 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4510 { 4511 if (dev_is_pci(dev)) 4512 return pci_device_group(dev); 4513 return generic_device_group(dev); 4514 } 4515 4516 static int intel_iommu_enable_sva(struct device *dev) 4517 { 4518 struct device_domain_info *info = dev_iommu_priv_get(dev); 4519 struct intel_iommu *iommu; 4520 4521 if (!info || dmar_disabled) 4522 return -EINVAL; 4523 4524 iommu = info->iommu; 4525 if (!iommu) 4526 return -EINVAL; 4527 4528 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4529 return -ENODEV; 4530 4531 if (!info->pasid_enabled || !info->ats_enabled) 4532 return -EINVAL; 4533 4534 /* 4535 * Devices with device-specific I/O fault handling should not 4536 * support PCI/PRI. The IOMMU side has no way to check whether a 4537 * device implements device-specific IOPF, so it can only assume 4538 * that if a driver enables SVA on a non-PRI device, that driver 4539 * will handle IOPF in its own way. 4540 */ 4541 if (!info->pri_supported) 4542 return 0; 4543 4544 /* Devices supporting PRI should have it enabled. */ 4545 if (!info->pri_enabled) 4546 return -EINVAL; 4547 4548 return 0; 4549 } 4550 4551 static int intel_iommu_enable_iopf(struct device *dev) 4552 { 4553 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4554 struct device_domain_info *info = dev_iommu_priv_get(dev); 4555 struct intel_iommu *iommu; 4556 int ret; 4557 4558 if (!pdev || !info || !info->ats_enabled || !info->pri_supported) 4559 return -ENODEV; 4560 4561 if (info->pri_enabled) 4562 return -EBUSY; 4563 4564 iommu = info->iommu; 4565 if (!iommu) 4566 return -EINVAL; 4567 4568 /* PASID is required in PRG Response Message.
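The check below therefore refuses IOPF when PASID is enabled but the device does not advertise the PRG Response PASID Required bit.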
*/ 4569 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) 4570 return -EINVAL; 4571 4572 ret = pci_reset_pri(pdev); 4573 if (ret) 4574 return ret; 4575 4576 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4577 if (ret) 4578 return ret; 4579 4580 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4581 if (ret) 4582 goto iopf_remove_device; 4583 4584 ret = pci_enable_pri(pdev, PRQ_DEPTH); 4585 if (ret) 4586 goto iopf_unregister_handler; 4587 info->pri_enabled = 1; 4588 4589 return 0; 4590 4591 iopf_unregister_handler: 4592 iommu_unregister_device_fault_handler(dev); 4593 iopf_remove_device: 4594 iopf_queue_remove_device(iommu->iopf_queue, dev); 4595 4596 return ret; 4597 } 4598 4599 static int intel_iommu_disable_iopf(struct device *dev) 4600 { 4601 struct device_domain_info *info = dev_iommu_priv_get(dev); 4602 struct intel_iommu *iommu = info->iommu; 4603 4604 if (!info->pri_enabled) 4605 return -EINVAL; 4606 4607 /* 4608 * PCIe spec states that by clearing PRI enable bit, the Page 4609 * Request Interface will not issue new page requests, but has 4610 * outstanding page requests that have been transmitted or are 4611 * queued for transmission. This is supposed to be called after 4612 * the device driver has stopped DMA, all PASIDs have been 4613 * unbound and the outstanding PRQs have been drained. 4614 */ 4615 pci_disable_pri(to_pci_dev(dev)); 4616 info->pri_enabled = 0; 4617 4618 /* 4619 * With PRI disabled and outstanding PRQs drained, unregistering 4620 * fault handler and removing device from iopf queue should never 4621 * fail. 4622 */ 4623 WARN_ON(iommu_unregister_device_fault_handler(dev)); 4624 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); 4625 4626 return 0; 4627 } 4628 4629 static int 4630 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4631 { 4632 switch (feat) { 4633 case IOMMU_DEV_FEAT_IOPF: 4634 return intel_iommu_enable_iopf(dev); 4635 4636 case IOMMU_DEV_FEAT_SVA: 4637 return intel_iommu_enable_sva(dev); 4638 4639 default: 4640 return -ENODEV; 4641 } 4642 } 4643 4644 static int 4645 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4646 { 4647 switch (feat) { 4648 case IOMMU_DEV_FEAT_IOPF: 4649 return intel_iommu_disable_iopf(dev); 4650 4651 case IOMMU_DEV_FEAT_SVA: 4652 return 0; 4653 4654 default: 4655 return -ENODEV; 4656 } 4657 } 4658 4659 static bool intel_iommu_is_attach_deferred(struct device *dev) 4660 { 4661 struct device_domain_info *info = dev_iommu_priv_get(dev); 4662 4663 return translation_pre_enabled(info->iommu) && !info->domain; 4664 } 4665 4666 /* 4667 * Check that the device does not live on an external facing PCI port that is 4668 * marked as untrusted. Such devices should not be able to apply quirks and 4669 * thus not be able to bypass the IOMMU restrictions. 
4670 */ 4671 static bool risky_device(struct pci_dev *pdev) 4672 { 4673 if (pdev->untrusted) { 4674 pci_info(pdev, 4675 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4676 pdev->vendor, pdev->device); 4677 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4678 return true; 4679 } 4680 return false; 4681 } 4682 4683 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4684 unsigned long iova, size_t size) 4685 { 4686 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4687 unsigned long pages = aligned_nrpages(iova, size); 4688 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4689 struct iommu_domain_info *info; 4690 unsigned long i; 4691 4692 xa_for_each(&dmar_domain->iommu_array, i, info) 4693 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); 4694 } 4695 4696 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) 4697 { 4698 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); 4699 struct dev_pasid_info *curr, *dev_pasid = NULL; 4700 struct dmar_domain *dmar_domain; 4701 struct iommu_domain *domain; 4702 unsigned long flags; 4703 4704 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); 4705 if (WARN_ON_ONCE(!domain)) 4706 goto out_tear_down; 4707 4708 /* 4709 * The SVA implementation needs to handle its own stuff, such as mm 4710 * notifications. Until that code is consolidated into the iommu core, 4711 * let the Intel SVA code handle it. 4712 */ 4713 if (domain->type == IOMMU_DOMAIN_SVA) { 4714 intel_svm_remove_dev_pasid(dev, pasid); 4715 goto out_tear_down; 4716 } 4717 4718 dmar_domain = to_dmar_domain(domain); 4719 spin_lock_irqsave(&dmar_domain->lock, flags); 4720 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { 4721 if (curr->dev == dev && curr->pasid == pasid) { 4722 list_del(&curr->link_domain); 4723 dev_pasid = curr; 4724 break; 4725 } 4726 } 4727 WARN_ON_ONCE(!dev_pasid); 4728 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4729 4730 domain_detach_iommu(dmar_domain, iommu); 4731 kfree(dev_pasid); 4732 out_tear_down: 4733 intel_pasid_tear_down_entry(iommu, dev, pasid, false); 4734 intel_drain_pasid_prq(dev, pasid); 4735 } 4736 4737 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, 4738 struct device *dev, ioasid_t pasid) 4739 { 4740 struct device_domain_info *info = dev_iommu_priv_get(dev); 4741 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4742 struct intel_iommu *iommu = info->iommu; 4743 struct dev_pasid_info *dev_pasid; 4744 unsigned long flags; 4745 int ret; 4746 4747 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 4748 return -EOPNOTSUPP; 4749 4750 if (context_copied(iommu, info->bus, info->devfn)) 4751 return -EBUSY; 4752 4753 ret = prepare_domain_attach_device(domain, dev); 4754 if (ret) 4755 return ret; 4756 4757 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); 4758 if (!dev_pasid) 4759 return -ENOMEM; 4760 4761 ret = domain_attach_iommu(dmar_domain, iommu); 4762 if (ret) 4763 goto out_free; 4764 4765 if (domain_type_is_si(dmar_domain)) 4766 ret = intel_pasid_setup_pass_through(iommu, dmar_domain, 4767 dev, pasid); 4768 else if (dmar_domain->use_first_level) 4769 ret = domain_setup_first_level(iommu, dmar_domain, 4770 dev, pasid); 4771 else 4772 ret = intel_pasid_setup_second_level(iommu, dmar_domain, 4773 dev, pasid); 4774 if (ret) 4775 goto out_detach_iommu; 4776 4777 dev_pasid->dev = dev; 4778 dev_pasid->pasid = pasid; 4779 spin_lock_irqsave(&dmar_domain->lock, flags); 4780
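/* Publish the binding so intel_iommu_remove_dev_pasid() can find and release it. */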
list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); 4781 spin_unlock_irqrestore(&dmar_domain->lock, flags); 4782 4783 return 0; 4784 out_detach_iommu: 4785 domain_detach_iommu(dmar_domain, iommu); 4786 out_free: 4787 kfree(dev_pasid); 4788 return ret; 4789 } 4790 4791 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) 4792 { 4793 struct device_domain_info *info = dev_iommu_priv_get(dev); 4794 struct intel_iommu *iommu = info->iommu; 4795 struct iommu_hw_info_vtd *vtd; 4796 4797 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); 4798 if (!vtd) 4799 return ERR_PTR(-ENOMEM); 4800 4801 vtd->cap_reg = iommu->cap; 4802 vtd->ecap_reg = iommu->ecap; 4803 *length = sizeof(*vtd); 4804 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; 4805 return vtd; 4806 } 4807 4808 const struct iommu_ops intel_iommu_ops = { 4809 .capable = intel_iommu_capable, 4810 .hw_info = intel_iommu_hw_info, 4811 .domain_alloc = intel_iommu_domain_alloc, 4812 .probe_device = intel_iommu_probe_device, 4813 .probe_finalize = intel_iommu_probe_finalize, 4814 .release_device = intel_iommu_release_device, 4815 .get_resv_regions = intel_iommu_get_resv_regions, 4816 .device_group = intel_iommu_device_group, 4817 .dev_enable_feat = intel_iommu_dev_enable_feat, 4818 .dev_disable_feat = intel_iommu_dev_disable_feat, 4819 .is_attach_deferred = intel_iommu_is_attach_deferred, 4820 .def_domain_type = device_def_domain_type, 4821 .remove_dev_pasid = intel_iommu_remove_dev_pasid, 4822 .pgsize_bitmap = SZ_4K, 4823 #ifdef CONFIG_INTEL_IOMMU_SVM 4824 .page_response = intel_svm_page_response, 4825 #endif 4826 .default_domain_ops = &(const struct iommu_domain_ops) { 4827 .attach_dev = intel_iommu_attach_device, 4828 .set_dev_pasid = intel_iommu_set_dev_pasid, 4829 .map_pages = intel_iommu_map_pages, 4830 .unmap_pages = intel_iommu_unmap_pages, 4831 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4832 .flush_iotlb_all = intel_flush_iotlb_all, 4833 .iotlb_sync = intel_iommu_tlb_sync, 4834 .iova_to_phys = intel_iommu_iova_to_phys, 4835 .free = intel_iommu_domain_free, 4836 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4837 } 4838 }; 4839 4840 static void quirk_iommu_igfx(struct pci_dev *dev) 4841 { 4842 if (risky_device(dev)) 4843 return; 4844 4845 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4846 dmar_map_gfx = 0; 4847 } 4848 4849 /* G4x/GM45 integrated gfx dmar support is totally busted. 
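The quirk above keeps DMA remapping enabled for the rest of the platform and only excludes the integrated graphics by clearing dmar_map_gfx.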
*/ 4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4857 4858 /* Broadwell igfx malfunctions with dmar */ 4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4883 4884 static void quirk_iommu_rwbf(struct pci_dev *dev) 4885 { 4886 if (risky_device(dev)) 4887 return; 4888 4889 /* 4890 * Mobile 4 Series Chipset neglects to set RWBF capability, 4891 * but needs it. Same seems to hold for the desktop versions. 
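* Setting rwbf_quirk below makes the driver flush the write buffer as if the RWBF capability bit were set.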
4892 */ 4893 pci_info(dev, "Forcing write-buffer flush capability\n"); 4894 rwbf_quirk = 1; 4895 } 4896 4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4904 4905 #define GGC 0x52 4906 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4907 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4908 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4909 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4910 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4911 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4912 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4913 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4914 4915 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4916 { 4917 unsigned short ggc; 4918 4919 if (risky_device(dev)) 4920 return; 4921 4922 if (pci_read_config_word(dev, GGC, &ggc)) 4923 return; 4924 4925 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 4926 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 4927 dmar_map_gfx = 0; 4928 } else if (dmar_map_gfx) { 4929 /* we have to ensure the gfx device is idle before we flush */ 4930 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 4931 iommu_set_dma_strict(); 4932 } 4933 } 4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 4938 4939 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 4940 { 4941 unsigned short ver; 4942 4943 if (!IS_GFX_DEVICE(dev)) 4944 return; 4945 4946 ver = (dev->device >> 8) & 0xff; 4947 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 4948 ver != 0x4e && ver != 0x8a && ver != 0x98 && 4949 ver != 0x9a && ver != 0xa7 && ver != 0x7d) 4950 return; 4951 4952 if (risky_device(dev)) 4953 return; 4954 4955 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 4956 iommu_skip_te_disable = 1; 4957 } 4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 4959 4960 /* On Tylersburg chipsets, some BIOSes have been known to enable the 4961 ISOCH DMAR unit for the Azalia sound device, but not give it any 4962 TLB entries, which causes it to deadlock. Check for that. We do 4963 this in a function called from init_dmars(), instead of in a PCI 4964 quirk, because we don't want to print the obnoxious "BIOS broken" 4965 message if VT-d is actually disabled. 4966 */ 4967 static void __init check_tylersburg_isoch(void) 4968 { 4969 struct pci_dev *pdev; 4970 uint32_t vtisochctrl; 4971 4972 /* If there's no Azalia in the system anyway, forget it. */ 4973 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 4974 if (!pdev) 4975 return; 4976 4977 if (risky_device(pdev)) { 4978 pci_dev_put(pdev); 4979 return; 4980 } 4981 4982 pci_dev_put(pdev); 4983 4984 /* System Management Registers. Might be hidden, in which case 4985 we can't do the sanity check. 
But that's OK, because the 4986 known-broken BIOSes _don't_ actually hide it, so far. */ 4987 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 4988 if (!pdev) 4989 return; 4990 4991 if (risky_device(pdev)) { 4992 pci_dev_put(pdev); 4993 return; 4994 } 4995 4996 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 4997 pci_dev_put(pdev); 4998 return; 4999 } 5000 5001 pci_dev_put(pdev); 5002 5003 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5004 if (vtisochctrl & 1) 5005 return; 5006 5007 /* Drop all bits other than the number of TLB entries */ 5008 vtisochctrl &= 0x1c; 5009 5010 /* If we have the recommended number of TLB entries (16), fine. */ 5011 if (vtisochctrl == 0x10) 5012 return; 5013 5014 /* Zero TLB entries? You get to ride the short bus to school. */ 5015 if (!vtisochctrl) { 5016 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5017 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5018 dmi_get_system_info(DMI_BIOS_VENDOR), 5019 dmi_get_system_info(DMI_BIOS_VERSION), 5020 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5021 iommu_identity_mapping |= IDENTMAP_AZALIA; 5022 return; 5023 } 5024 5025 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5026 vtisochctrl); 5027 } 5028 5029 /* 5030 * Here we deal with a device TLB defect where device may inadvertently issue ATS 5031 * invalidation completion before posted writes initiated with translated address 5032 * that utilized translations matching the invalidation address range, violating 5033 * the invalidation completion ordering. 5034 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is 5035 * vulnerable to this defect. In other words, any dTLB invalidation initiated not 5036 * under the control of the trusted/privileged host device driver must use this 5037 * quirk. 5038 * Device TLBs are invalidated under the following six conditions: 5039 * 1. Device driver does DMA API unmap IOVA 5040 * 2. Device driver unbind a PASID from a process, sva_unbind_device() 5041 * 3. PASID is torn down, after PASID cache is flushed. e.g. process 5042 * exit_mmap() due to crash 5043 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where 5044 * VM has to free pages that were unmapped 5045 * 5. Userspace driver unmaps a DMA buffer 5046 * 6. Cache invalidation in vSVA usage (upcoming) 5047 * 5048 * For #1 and #2, device drivers are responsible for stopping DMA traffic 5049 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to 5050 * invalidate TLB the same way as normal user unmap which will use this quirk. 5051 * The dTLB invalidation after PASID cache flush does not need this quirk. 5052 * 5053 * As a reminder, #6 will *NEED* this quirk as we enable nested translation. 5054 */ 5055 void quirk_extra_dev_tlb_flush(struct device_domain_info *info, 5056 unsigned long address, unsigned long mask, 5057 u32 pasid, u16 qdep) 5058 { 5059 u16 sid; 5060 5061 if (likely(!info->dtlb_extra_inval)) 5062 return; 5063 5064 sid = PCI_DEVID(info->bus, info->devfn); 5065 if (pasid == IOMMU_NO_PASID) { 5066 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 5067 qdep, address, mask); 5068 } else { 5069 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, 5070 pasid, qdep, address, mask); 5071 } 5072 } 5073 5074 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) 5075 5076 /* 5077 * Function to submit a command to the enhanced command interface. 
The 5078 * valid enhanced command descriptions are defined in Table 47 of the 5079 * VT-d spec. The VT-d hardware implementation may support some but not 5080 * all commands, which can be determined by checking the Enhanced 5081 * Command Capability Register. 5082 * 5083 * Return values: 5084 * - 0: Command successful without any error; 5085 * - Negative: software error value; 5086 * - Nonzero positive: failure status code defined in Table 48. 5087 */ 5088 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) 5089 { 5090 unsigned long flags; 5091 u64 res; 5092 int ret; 5093 5094 if (!cap_ecmds(iommu->cap)) 5095 return -ENODEV; 5096 5097 raw_spin_lock_irqsave(&iommu->register_lock, flags); 5098 5099 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 5100 if (res & DMA_ECMD_ECRSP_IP) { 5101 ret = -EBUSY; 5102 goto err; 5103 } 5104 5105 /* 5106 * Unconditionally write the operand B, because 5107 * - There is no side effect if an ecmd doesn't require an 5108 * operand B, but we set the register to some value. 5109 * - It's not invoked in any critical path. The extra MMIO 5110 * write doesn't bring any performance concerns. 5111 */ 5112 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 5113 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 5114 5115 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 5116 !(res & DMA_ECMD_ECRSP_IP), res); 5117 5118 if (res & DMA_ECMD_ECRSP_IP) { 5119 ret = -ETIMEDOUT; 5120 goto err; 5121 } 5122 5123 ret = ecmd_get_status_code(res); 5124 err: 5125 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 5126 5127 return ret; 5128 } 5129
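/*
 * Illustrative sketch, not part of the driver: how a caller might submit an
 * enhanced command and act on the tri-state return value of ecmd_submit_sync()
 * described above. ECMD_EXAMPLE_OP and the operand values are hypothetical
 * placeholders; real opcodes are listed in Table 47 of the VT-d spec and must
 * be advertised by the Enhanced Command Capability Register before use.
 *
 *	ret = ecmd_submit_sync(iommu, ECMD_EXAMPLE_OP, oa, 0);
 *	if (ret < 0)
 *		return ret;				(software error)
 *	if (ret > 0)
 *		pr_warn("ecmd status %d\n", ret);	(Table 48 status code)
 */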