1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dma-iommu.h> 19 #include <linux/dmi.h> 20 #include <linux/intel-iommu.h> 21 #include <linux/intel-svm.h> 22 #include <linux/memory.h> 23 #include <linux/pci.h> 24 #include <linux/pci-ats.h> 25 #include <linux/spinlock.h> 26 #include <linux/syscore_ops.h> 27 #include <linux/tboot.h> 28 29 #include "../irq_remapping.h" 30 #include "../iommu-sva-lib.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 /* global iommu list, set NULL for ignored DMAR units */ 130 static struct intel_iommu **g_iommus; 131 132 static void __init check_tylersburg_isoch(void); 133 static int rwbf_quirk; 134 static inline struct device_domain_info * 135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn); 136 137 /* 138 * set to 1 to panic kernel if can't successfully enable VT-d 139 * (used when kernel is launched w/ TXT) 140 */ 141 static int force_on = 0; 142 static int intel_iommu_tboot_noforce; 143 static int no_platform_optin; 144 145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 146 147 /* 148 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 149 * if marked present. 150 */ 151 static phys_addr_t root_entry_lctp(struct root_entry *re) 152 { 153 if (!(re->lo & 1)) 154 return 0; 155 156 return re->lo & VTD_PAGE_MASK; 157 } 158 159 /* 160 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 161 * if marked present. 162 */ 163 static phys_addr_t root_entry_uctp(struct root_entry *re) 164 { 165 if (!(re->hi & 1)) 166 return 0; 167 168 return re->hi & VTD_PAGE_MASK; 169 } 170 171 static inline void context_clear_pasid_enable(struct context_entry *context) 172 { 173 context->lo &= ~(1ULL << 11); 174 } 175 176 static inline bool context_pasid_enabled(struct context_entry *context) 177 { 178 return !!(context->lo & (1ULL << 11)); 179 } 180 181 static inline void context_set_copied(struct context_entry *context) 182 { 183 context->hi |= (1ull << 3); 184 } 185 186 static inline bool context_copied(struct context_entry *context) 187 { 188 return !!(context->hi & (1ULL << 3)); 189 } 190 191 static inline bool __context_present(struct context_entry *context) 192 { 193 return (context->lo & 1); 194 } 195 196 bool context_present(struct context_entry *context) 197 { 198 return context_pasid_enabled(context) ? 
		__context_present(context) :
		__context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address */
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);

int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int intel_iommu_superpage = 1; 311 static int iommu_identity_mapping; 312 static int iommu_skip_te_disable; 313 314 #define IDENTMAP_GFX 2 315 #define IDENTMAP_AZALIA 4 316 317 int intel_iommu_gfx_mapped; 318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 319 320 DEFINE_SPINLOCK(device_domain_lock); 321 static LIST_HEAD(device_domain_list); 322 323 const struct iommu_ops intel_iommu_ops; 324 325 static bool translation_pre_enabled(struct intel_iommu *iommu) 326 { 327 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 328 } 329 330 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 331 { 332 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 333 } 334 335 static void init_translation_status(struct intel_iommu *iommu) 336 { 337 u32 gsts; 338 339 gsts = readl(iommu->reg + DMAR_GSTS_REG); 340 if (gsts & DMA_GSTS_TES) 341 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 342 } 343 344 static int __init intel_iommu_setup(char *str) 345 { 346 if (!str) 347 return -EINVAL; 348 349 while (*str) { 350 if (!strncmp(str, "on", 2)) { 351 dmar_disabled = 0; 352 pr_info("IOMMU enabled\n"); 353 } else if (!strncmp(str, "off", 3)) { 354 dmar_disabled = 1; 355 no_platform_optin = 1; 356 pr_info("IOMMU disabled\n"); 357 } else if (!strncmp(str, "igfx_off", 8)) { 358 dmar_map_gfx = 0; 359 pr_info("Disable GFX device mapping\n"); 360 } else if (!strncmp(str, "forcedac", 8)) { 361 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 362 iommu_dma_forcedac = true; 363 } else if (!strncmp(str, "strict", 6)) { 364 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 365 iommu_set_dma_strict(); 366 } else if (!strncmp(str, "sp_off", 6)) { 367 pr_info("Disable supported super page\n"); 368 intel_iommu_superpage = 0; 369 } else if (!strncmp(str, "sm_on", 5)) { 370 pr_info("Enable scalable mode if hardware supports\n"); 371 intel_iommu_sm = 1; 372 } else if (!strncmp(str, "sm_off", 6)) { 373 pr_info("Scalable mode is disallowed\n"); 374 intel_iommu_sm = 0; 375 } else if (!strncmp(str, "tboot_noforce", 13)) { 376 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else {
			pr_notice("Unknown option - '%s'\n", str);
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}

	return 1;
}
__setup("intel_iommu=", intel_iommu_setup);

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
479 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 480 } 481 482 static void domain_update_iommu_coherency(struct dmar_domain *domain) 483 { 484 struct dmar_drhd_unit *drhd; 485 struct intel_iommu *iommu; 486 bool found = false; 487 int i; 488 489 domain->iommu_coherency = true; 490 491 for_each_domain_iommu(i, domain) { 492 found = true; 493 if (!iommu_paging_structure_coherency(g_iommus[i])) { 494 domain->iommu_coherency = false; 495 break; 496 } 497 } 498 if (found) 499 return; 500 501 /* No hardware attached; use lowest common denominator */ 502 rcu_read_lock(); 503 for_each_active_iommu(iommu, drhd) { 504 if (!iommu_paging_structure_coherency(iommu)) { 505 domain->iommu_coherency = false; 506 break; 507 } 508 } 509 rcu_read_unlock(); 510 } 511 512 static int domain_update_iommu_superpage(struct dmar_domain *domain, 513 struct intel_iommu *skip) 514 { 515 struct dmar_drhd_unit *drhd; 516 struct intel_iommu *iommu; 517 int mask = 0x3; 518 519 if (!intel_iommu_superpage) 520 return 0; 521 522 /* set iommu_superpage to the smallest common denominator */ 523 rcu_read_lock(); 524 for_each_active_iommu(iommu, drhd) { 525 if (iommu != skip) { 526 if (domain && domain_use_first_level(domain)) { 527 if (!cap_fl1gp_support(iommu->cap)) 528 mask = 0x1; 529 } else { 530 mask &= cap_super_page_val(iommu->cap); 531 } 532 533 if (!mask) 534 break; 535 } 536 } 537 rcu_read_unlock(); 538 539 return fls(mask); 540 } 541 542 static int domain_update_device_node(struct dmar_domain *domain) 543 { 544 struct device_domain_info *info; 545 int nid = NUMA_NO_NODE; 546 547 assert_spin_locked(&device_domain_lock); 548 549 if (list_empty(&domain->devices)) 550 return NUMA_NO_NODE; 551 552 list_for_each_entry(info, &domain->devices, link) { 553 if (!info->dev) 554 continue; 555 556 /* 557 * There could possibly be multiple device numa nodes as devices 558 * within the same domain may sit behind different IOMMUs. There 559 * isn't perfect answer in such situation, so we select first 560 * come first served policy. 561 */ 562 nid = dev_to_node(info->dev); 563 if (nid != NUMA_NO_NODE) 564 break; 565 } 566 567 return nid; 568 } 569 570 static void domain_update_iotlb(struct dmar_domain *domain); 571 572 /* Return the super pagesize bitmap if supported. */ 573 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 574 { 575 unsigned long bitmap = 0; 576 577 /* 578 * 1-level super page supports page size of 2MiB, 2-level super page 579 * supports page size of both 2MiB and 1GiB. 580 */ 581 if (domain->iommu_superpage == 1) 582 bitmap |= SZ_2M; 583 else if (domain->iommu_superpage == 2) 584 bitmap |= SZ_2M | SZ_1G; 585 586 return bitmap; 587 } 588 589 /* Some capabilities may be different across iommus */ 590 static void domain_update_iommu_cap(struct dmar_domain *domain) 591 { 592 domain_update_iommu_coherency(domain); 593 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 594 595 /* 596 * If RHSA is missing, we should default to the device numa domain 597 * as fall back. 598 */ 599 if (domain->nid == NUMA_NO_NODE) 600 domain->nid = domain_update_device_node(domain); 601 602 /* 603 * First-level translation restricts the input-address to a 604 * canonical address (i.e., address bits 63:N have the same 605 * value as address bit [N-1], where N is 48-bits with 4-level 606 * paging and 57-bits with 5-level paging). Hence, skip bit 607 * [N-1]. 
608 */ 609 if (domain_use_first_level(domain)) 610 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 611 else 612 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 613 614 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 615 domain_update_iotlb(domain); 616 } 617 618 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 619 u8 devfn, int alloc) 620 { 621 struct root_entry *root = &iommu->root_entry[bus]; 622 struct context_entry *context; 623 u64 *entry; 624 625 entry = &root->lo; 626 if (sm_supported(iommu)) { 627 if (devfn >= 0x80) { 628 devfn -= 0x80; 629 entry = &root->hi; 630 } 631 devfn *= 2; 632 } 633 if (*entry & 1) 634 context = phys_to_virt(*entry & VTD_PAGE_MASK); 635 else { 636 unsigned long phy_addr; 637 if (!alloc) 638 return NULL; 639 640 context = alloc_pgtable_page(iommu->node); 641 if (!context) 642 return NULL; 643 644 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 645 phy_addr = virt_to_phys((void *)context); 646 *entry = phy_addr | 1; 647 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 648 } 649 return &context[devfn]; 650 } 651 652 /** 653 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 654 * sub-hierarchy of a candidate PCI-PCI bridge 655 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 656 * @bridge: the candidate PCI-PCI bridge 657 * 658 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 659 */ 660 static bool 661 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 662 { 663 struct pci_dev *pdev, *pbridge; 664 665 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 666 return false; 667 668 pdev = to_pci_dev(dev); 669 pbridge = to_pci_dev(bridge); 670 671 if (pbridge->subordinate && 672 pbridge->subordinate->number <= pdev->bus->number && 673 pbridge->subordinate->busn_res.end >= pdev->bus->number) 674 return true; 675 676 return false; 677 } 678 679 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 680 { 681 struct dmar_drhd_unit *drhd; 682 u32 vtbar; 683 int rc; 684 685 /* We know that this device on this chipset has its own IOMMU. 686 * If we find it under a different IOMMU, then the BIOS is lying 687 * to us. Hope that the IOMMU for this device is actually 688 * disabled, and it needs no translation... 
689 */ 690 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 691 if (rc) { 692 /* "can't" happen */ 693 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 694 return false; 695 } 696 vtbar &= 0xffff0000; 697 698 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 699 drhd = dmar_find_matched_drhd_unit(pdev); 700 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 701 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 702 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 703 return true; 704 } 705 706 return false; 707 } 708 709 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 710 { 711 if (!iommu || iommu->drhd->ignored) 712 return true; 713 714 if (dev_is_pci(dev)) { 715 struct pci_dev *pdev = to_pci_dev(dev); 716 717 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 718 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 719 quirk_ioat_snb_local_iommu(pdev)) 720 return true; 721 } 722 723 return false; 724 } 725 726 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 727 { 728 struct dmar_drhd_unit *drhd = NULL; 729 struct pci_dev *pdev = NULL; 730 struct intel_iommu *iommu; 731 struct device *tmp; 732 u16 segment = 0; 733 int i; 734 735 if (!dev) 736 return NULL; 737 738 if (dev_is_pci(dev)) { 739 struct pci_dev *pf_pdev; 740 741 pdev = pci_real_dma_dev(to_pci_dev(dev)); 742 743 /* VFs aren't listed in scope tables; we need to look up 744 * the PF instead to find the IOMMU. */ 745 pf_pdev = pci_physfn(pdev); 746 dev = &pf_pdev->dev; 747 segment = pci_domain_nr(pdev->bus); 748 } else if (has_acpi_companion(dev)) 749 dev = &ACPI_COMPANION(dev)->dev; 750 751 rcu_read_lock(); 752 for_each_iommu(iommu, drhd) { 753 if (pdev && segment != drhd->segment) 754 continue; 755 756 for_each_active_dev_scope(drhd->devices, 757 drhd->devices_cnt, i, tmp) { 758 if (tmp == dev) { 759 /* For a VF use its original BDF# not that of the PF 760 * which we used for the IOMMU lookup. Strictly speaking 761 * we could do this for all PCI devices; we only need to 762 * get the BDF# from the scope table for ACPI matches. 
*/ 763 if (pdev && pdev->is_virtfn) 764 goto got_pdev; 765 766 if (bus && devfn) { 767 *bus = drhd->devices[i].bus; 768 *devfn = drhd->devices[i].devfn; 769 } 770 goto out; 771 } 772 773 if (is_downstream_to_pci_bridge(dev, tmp)) 774 goto got_pdev; 775 } 776 777 if (pdev && drhd->include_all) { 778 got_pdev: 779 if (bus && devfn) { 780 *bus = pdev->bus->number; 781 *devfn = pdev->devfn; 782 } 783 goto out; 784 } 785 } 786 iommu = NULL; 787 out: 788 if (iommu_is_dummy(iommu, dev)) 789 iommu = NULL; 790 791 rcu_read_unlock(); 792 793 return iommu; 794 } 795 796 static void domain_flush_cache(struct dmar_domain *domain, 797 void *addr, int size) 798 { 799 if (!domain->iommu_coherency) 800 clflush_cache_range(addr, size); 801 } 802 803 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 804 { 805 struct context_entry *context; 806 int ret = 0; 807 unsigned long flags; 808 809 spin_lock_irqsave(&iommu->lock, flags); 810 context = iommu_context_addr(iommu, bus, devfn, 0); 811 if (context) 812 ret = context_present(context); 813 spin_unlock_irqrestore(&iommu->lock, flags); 814 return ret; 815 } 816 817 static void free_context_table(struct intel_iommu *iommu) 818 { 819 int i; 820 unsigned long flags; 821 struct context_entry *context; 822 823 spin_lock_irqsave(&iommu->lock, flags); 824 if (!iommu->root_entry) { 825 goto out; 826 } 827 for (i = 0; i < ROOT_ENTRY_NR; i++) { 828 context = iommu_context_addr(iommu, i, 0, 0); 829 if (context) 830 free_pgtable_page(context); 831 832 if (!sm_supported(iommu)) 833 continue; 834 835 context = iommu_context_addr(iommu, i, 0x80, 0); 836 if (context) 837 free_pgtable_page(context); 838 839 } 840 free_pgtable_page(iommu->root_entry); 841 iommu->root_entry = NULL; 842 out: 843 spin_unlock_irqrestore(&iommu->lock, flags); 844 } 845 846 #ifdef CONFIG_DMAR_DEBUG 847 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 848 { 849 struct device_domain_info *info; 850 struct dma_pte *parent, *pte; 851 struct dmar_domain *domain; 852 int offset, level; 853 854 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); 855 if (!info || !info->domain) { 856 pr_info("device [%02x:%02x.%d] not probed\n", 857 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 858 return; 859 } 860 861 domain = info->domain; 862 level = agaw_to_level(domain->agaw); 863 parent = domain->pgd; 864 if (!parent) { 865 pr_info("no page table setup\n"); 866 return; 867 } 868 869 while (1) { 870 offset = pfn_level_offset(pfn, level); 871 pte = &parent[offset]; 872 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 873 pr_info("PTE not present at level %d\n", level); 874 break; 875 } 876 877 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 878 879 if (level == 1) 880 break; 881 882 parent = phys_to_virt(dma_pte_addr(pte)); 883 level--; 884 } 885 } 886 887 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 888 unsigned long long addr, u32 pasid) 889 { 890 struct pasid_dir_entry *dir, *pde; 891 struct pasid_entry *entries, *pte; 892 struct context_entry *ctx_entry; 893 struct root_entry *rt_entry; 894 u8 devfn = source_id & 0xff; 895 u8 bus = source_id >> 8; 896 int i, dir_index, index; 897 898 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 899 900 /* root entry dump */ 901 rt_entry = &iommu->root_entry[bus]; 902 if (!rt_entry) { 903 pr_info("root table entry is not present\n"); 904 return; 905 } 906 907 if (sm_supported(iommu)) 908 pr_info("scalable mode root entry: hi 
0x%016llx, low 0x%016llx\n", 909 rt_entry->hi, rt_entry->lo); 910 else 911 pr_info("root entry: 0x%016llx", rt_entry->lo); 912 913 /* context entry dump */ 914 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 915 if (!ctx_entry) { 916 pr_info("context table entry is not present\n"); 917 return; 918 } 919 920 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 921 ctx_entry->hi, ctx_entry->lo); 922 923 /* legacy mode does not require PASID entries */ 924 if (!sm_supported(iommu)) 925 goto pgtable_walk; 926 927 /* get the pointer to pasid directory entry */ 928 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 929 if (!dir) { 930 pr_info("pasid directory entry is not present\n"); 931 return; 932 } 933 /* For request-without-pasid, get the pasid from context entry */ 934 if (intel_iommu_sm && pasid == INVALID_IOASID) 935 pasid = PASID_RID2PASID; 936 937 dir_index = pasid >> PASID_PDE_SHIFT; 938 pde = &dir[dir_index]; 939 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 940 941 /* get the pointer to the pasid table entry */ 942 entries = get_pasid_table_from_pde(pde); 943 if (!entries) { 944 pr_info("pasid table entry is not present\n"); 945 return; 946 } 947 index = pasid & PASID_PTE_MASK; 948 pte = &entries[index]; 949 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 950 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 951 952 pgtable_walk: 953 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 954 } 955 #endif 956 957 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 958 unsigned long pfn, int *target_level) 959 { 960 struct dma_pte *parent, *pte; 961 int level = agaw_to_level(domain->agaw); 962 int offset; 963 964 BUG_ON(!domain->pgd); 965 966 if (!domain_pfn_supported(domain, pfn)) 967 /* Address beyond IOMMU's addressing capabilities. */ 968 return NULL; 969 970 parent = domain->pgd; 971 972 while (1) { 973 void *tmp_page; 974 975 offset = pfn_level_offset(pfn, level); 976 pte = &parent[offset]; 977 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 978 break; 979 if (level == *target_level) 980 break; 981 982 if (!dma_pte_present(pte)) { 983 uint64_t pteval; 984 985 tmp_page = alloc_pgtable_page(domain->nid); 986 987 if (!tmp_page) 988 return NULL; 989 990 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 991 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 992 if (domain_use_first_level(domain)) { 993 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 994 if (iommu_is_dma_domain(&domain->domain)) 995 pteval |= DMA_FL_PTE_ACCESS; 996 } 997 if (cmpxchg64(&pte->val, 0ULL, pteval)) 998 /* Someone else set it while we were thinking; use theirs. 
*/ 999 free_pgtable_page(tmp_page); 1000 else 1001 domain_flush_cache(domain, pte, sizeof(*pte)); 1002 } 1003 if (level == 1) 1004 break; 1005 1006 parent = phys_to_virt(dma_pte_addr(pte)); 1007 level--; 1008 } 1009 1010 if (!*target_level) 1011 *target_level = level; 1012 1013 return pte; 1014 } 1015 1016 /* return address's pte at specific level */ 1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1018 unsigned long pfn, 1019 int level, int *large_page) 1020 { 1021 struct dma_pte *parent, *pte; 1022 int total = agaw_to_level(domain->agaw); 1023 int offset; 1024 1025 parent = domain->pgd; 1026 while (level <= total) { 1027 offset = pfn_level_offset(pfn, total); 1028 pte = &parent[offset]; 1029 if (level == total) 1030 return pte; 1031 1032 if (!dma_pte_present(pte)) { 1033 *large_page = total; 1034 break; 1035 } 1036 1037 if (dma_pte_superpage(pte)) { 1038 *large_page = total; 1039 return pte; 1040 } 1041 1042 parent = phys_to_virt(dma_pte_addr(pte)); 1043 total--; 1044 } 1045 return NULL; 1046 } 1047 1048 /* clear last level pte, a tlb flush should be followed */ 1049 static void dma_pte_clear_range(struct dmar_domain *domain, 1050 unsigned long start_pfn, 1051 unsigned long last_pfn) 1052 { 1053 unsigned int large_page; 1054 struct dma_pte *first_pte, *pte; 1055 1056 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1057 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1058 BUG_ON(start_pfn > last_pfn); 1059 1060 /* we don't need lock here; nobody else touches the iova range */ 1061 do { 1062 large_page = 1; 1063 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1064 if (!pte) { 1065 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1066 continue; 1067 } 1068 do { 1069 dma_clear_pte(pte); 1070 start_pfn += lvl_to_nr_pages(large_page); 1071 pte++; 1072 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1073 1074 domain_flush_cache(domain, first_pte, 1075 (void *)pte - (void *)first_pte); 1076 1077 } while (start_pfn && start_pfn <= last_pfn); 1078 } 1079 1080 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1081 int retain_level, struct dma_pte *pte, 1082 unsigned long pfn, unsigned long start_pfn, 1083 unsigned long last_pfn) 1084 { 1085 pfn = max(start_pfn, pfn); 1086 pte = &pte[pfn_level_offset(pfn, level)]; 1087 1088 do { 1089 unsigned long level_pfn; 1090 struct dma_pte *level_pte; 1091 1092 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1093 goto next; 1094 1095 level_pfn = pfn & level_mask(level); 1096 level_pte = phys_to_virt(dma_pte_addr(pte)); 1097 1098 if (level > 2) { 1099 dma_pte_free_level(domain, level - 1, retain_level, 1100 level_pte, level_pfn, start_pfn, 1101 last_pfn); 1102 } 1103 1104 /* 1105 * Free the page table if we're below the level we want to 1106 * retain and the range covers the entire table. 1107 */ 1108 if (level < retain_level && !(start_pfn > level_pfn || 1109 last_pfn < level_pfn + level_size(level) - 1)) { 1110 dma_clear_pte(pte); 1111 domain_flush_cache(domain, pte, sizeof(*pte)); 1112 free_pgtable_page(level_pte); 1113 } 1114 next: 1115 pfn += level_size(level); 1116 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1117 } 1118 1119 /* 1120 * clear last level (leaf) ptes and free page table pages below the 1121 * level we wish to keep intact. 
1122 */ 1123 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1124 unsigned long start_pfn, 1125 unsigned long last_pfn, 1126 int retain_level) 1127 { 1128 dma_pte_clear_range(domain, start_pfn, last_pfn); 1129 1130 /* We don't need lock here; nobody else touches the iova range */ 1131 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1132 domain->pgd, 0, start_pfn, last_pfn); 1133 1134 /* free pgd */ 1135 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1136 free_pgtable_page(domain->pgd); 1137 domain->pgd = NULL; 1138 } 1139 } 1140 1141 /* When a page at a given level is being unlinked from its parent, we don't 1142 need to *modify* it at all. All we need to do is make a list of all the 1143 pages which can be freed just as soon as we've flushed the IOTLB and we 1144 know the hardware page-walk will no longer touch them. 1145 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1146 be freed. */ 1147 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1148 int level, struct dma_pte *pte, 1149 struct list_head *freelist) 1150 { 1151 struct page *pg; 1152 1153 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1154 list_add_tail(&pg->lru, freelist); 1155 1156 if (level == 1) 1157 return; 1158 1159 pte = page_address(pg); 1160 do { 1161 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1162 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1163 pte++; 1164 } while (!first_pte_in_page(pte)); 1165 } 1166 1167 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1168 struct dma_pte *pte, unsigned long pfn, 1169 unsigned long start_pfn, unsigned long last_pfn, 1170 struct list_head *freelist) 1171 { 1172 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1173 1174 pfn = max(start_pfn, pfn); 1175 pte = &pte[pfn_level_offset(pfn, level)]; 1176 1177 do { 1178 unsigned long level_pfn = pfn & level_mask(level); 1179 1180 if (!dma_pte_present(pte)) 1181 goto next; 1182 1183 /* If range covers entire pagetable, free it */ 1184 if (start_pfn <= level_pfn && 1185 last_pfn >= level_pfn + level_size(level) - 1) { 1186 /* These suborbinate page tables are going away entirely. Don't 1187 bother to clear them; we're just going to *free* them. */ 1188 if (level > 1 && !dma_pte_superpage(pte)) 1189 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1190 1191 dma_clear_pte(pte); 1192 if (!first_pte) 1193 first_pte = pte; 1194 last_pte = pte; 1195 } else if (level > 1) { 1196 /* Recurse down into a level that isn't *entirely* obsolete */ 1197 dma_pte_clear_level(domain, level - 1, 1198 phys_to_virt(dma_pte_addr(pte)), 1199 level_pfn, start_pfn, last_pfn, 1200 freelist); 1201 } 1202 next: 1203 pfn = level_pfn + level_size(level); 1204 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1205 1206 if (first_pte) 1207 domain_flush_cache(domain, first_pte, 1208 (void *)++last_pte - (void *)first_pte); 1209 } 1210 1211 /* We can't just free the pages because the IOMMU may still be walking 1212 the page tables, and may have cached the intermediate levels. The 1213 pages can only be freed after the IOTLB flush has been done. 
*/ 1214 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1215 unsigned long last_pfn, struct list_head *freelist) 1216 { 1217 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1218 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1219 BUG_ON(start_pfn > last_pfn); 1220 1221 /* we don't need lock here; nobody else touches the iova range */ 1222 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1223 domain->pgd, 0, start_pfn, last_pfn, freelist); 1224 1225 /* free pgd */ 1226 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1227 struct page *pgd_page = virt_to_page(domain->pgd); 1228 list_add_tail(&pgd_page->lru, freelist); 1229 domain->pgd = NULL; 1230 } 1231 } 1232 1233 /* iommu handling */ 1234 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1235 { 1236 struct root_entry *root; 1237 unsigned long flags; 1238 1239 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1240 if (!root) { 1241 pr_err("Allocating root entry for %s failed\n", 1242 iommu->name); 1243 return -ENOMEM; 1244 } 1245 1246 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1247 1248 spin_lock_irqsave(&iommu->lock, flags); 1249 iommu->root_entry = root; 1250 spin_unlock_irqrestore(&iommu->lock, flags); 1251 1252 return 0; 1253 } 1254 1255 static void iommu_set_root_entry(struct intel_iommu *iommu) 1256 { 1257 u64 addr; 1258 u32 sts; 1259 unsigned long flag; 1260 1261 addr = virt_to_phys(iommu->root_entry); 1262 if (sm_supported(iommu)) 1263 addr |= DMA_RTADDR_SMT; 1264 1265 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1266 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1267 1268 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1269 1270 /* Make sure hardware complete it */ 1271 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1272 readl, (sts & DMA_GSTS_RTPS), sts); 1273 1274 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1275 1276 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1277 if (sm_supported(iommu)) 1278 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1279 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1280 } 1281 1282 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1283 { 1284 u32 val; 1285 unsigned long flag; 1286 1287 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1288 return; 1289 1290 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1291 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1292 1293 /* Make sure hardware complete it */ 1294 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1295 readl, (!(val & DMA_GSTS_WBFS)), val); 1296 1297 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1298 } 1299 1300 /* return value determine if we need a write buffer flush */ 1301 static void __iommu_flush_context(struct intel_iommu *iommu, 1302 u16 did, u16 source_id, u8 function_mask, 1303 u64 type) 1304 { 1305 u64 val = 0; 1306 unsigned long flag; 1307 1308 switch (type) { 1309 case DMA_CCMD_GLOBAL_INVL: 1310 val = DMA_CCMD_GLOBAL_INVL; 1311 break; 1312 case DMA_CCMD_DOMAIN_INVL: 1313 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1314 break; 1315 case DMA_CCMD_DEVICE_INVL: 1316 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1317 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1318 break; 1319 default: 1320 BUG(); 1321 } 1322 val |= DMA_CCMD_ICC; 1323 1324 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1325 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1326 1327 /* Make sure hardware complete it */ 1328 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1329 dmar_readq, (!(val & 
DMA_CCMD_ICC)), val); 1330 1331 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1332 } 1333 1334 /* return value determine if we need a write buffer flush */ 1335 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1336 u64 addr, unsigned int size_order, u64 type) 1337 { 1338 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1339 u64 val = 0, val_iva = 0; 1340 unsigned long flag; 1341 1342 switch (type) { 1343 case DMA_TLB_GLOBAL_FLUSH: 1344 /* global flush doesn't need set IVA_REG */ 1345 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1346 break; 1347 case DMA_TLB_DSI_FLUSH: 1348 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1349 break; 1350 case DMA_TLB_PSI_FLUSH: 1351 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1352 /* IH bit is passed in as part of address */ 1353 val_iva = size_order | addr; 1354 break; 1355 default: 1356 BUG(); 1357 } 1358 /* Note: set drain read/write */ 1359 #if 0 1360 /* 1361 * This is probably to be super secure.. Looks like we can 1362 * ignore it without any impact. 1363 */ 1364 if (cap_read_drain(iommu->cap)) 1365 val |= DMA_TLB_READ_DRAIN; 1366 #endif 1367 if (cap_write_drain(iommu->cap)) 1368 val |= DMA_TLB_WRITE_DRAIN; 1369 1370 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1371 /* Note: Only uses first TLB reg currently */ 1372 if (val_iva) 1373 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1374 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1375 1376 /* Make sure hardware complete it */ 1377 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1378 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1379 1380 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1381 1382 /* check IOTLB invalidation granularity */ 1383 if (DMA_TLB_IAIG(val) == 0) 1384 pr_err("Flush IOTLB failed\n"); 1385 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1386 pr_debug("TLB flush request %Lx, actual %Lx\n", 1387 (unsigned long long)DMA_TLB_IIRG(type), 1388 (unsigned long long)DMA_TLB_IAIG(val)); 1389 } 1390 1391 static struct device_domain_info * 1392 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1393 u8 bus, u8 devfn) 1394 { 1395 struct device_domain_info *info; 1396 1397 assert_spin_locked(&device_domain_lock); 1398 1399 if (!iommu->qi) 1400 return NULL; 1401 1402 list_for_each_entry(info, &domain->devices, link) 1403 if (info->iommu == iommu && info->bus == bus && 1404 info->devfn == devfn) { 1405 if (info->ats_supported && info->dev) 1406 return info; 1407 break; 1408 } 1409 1410 return NULL; 1411 } 1412 1413 static void domain_update_iotlb(struct dmar_domain *domain) 1414 { 1415 struct device_domain_info *info; 1416 bool has_iotlb_device = false; 1417 1418 assert_spin_locked(&device_domain_lock); 1419 1420 list_for_each_entry(info, &domain->devices, link) 1421 if (info->ats_enabled) { 1422 has_iotlb_device = true; 1423 break; 1424 } 1425 1426 domain->has_iotlb_device = has_iotlb_device; 1427 } 1428 1429 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1430 { 1431 struct pci_dev *pdev; 1432 1433 assert_spin_locked(&device_domain_lock); 1434 1435 if (!info || !dev_is_pci(info->dev)) 1436 return; 1437 1438 pdev = to_pci_dev(info->dev); 1439 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1440 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1441 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1442 * reserved, which should be set to 0. 
1443 */ 1444 if (!ecap_dit(info->iommu->ecap)) 1445 info->pfsid = 0; 1446 else { 1447 struct pci_dev *pf_pdev; 1448 1449 /* pdev will be returned if device is not a vf */ 1450 pf_pdev = pci_physfn(pdev); 1451 info->pfsid = pci_dev_id(pf_pdev); 1452 } 1453 1454 #ifdef CONFIG_INTEL_IOMMU_SVM 1455 /* The PCIe spec, in its wisdom, declares that the behaviour of 1456 the device if you enable PASID support after ATS support is 1457 undefined. So always enable PASID support on devices which 1458 have it, even if we can't yet know if we're ever going to 1459 use it. */ 1460 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1461 info->pasid_enabled = 1; 1462 1463 if (info->pri_supported && 1464 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1465 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1466 info->pri_enabled = 1; 1467 #endif 1468 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1469 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1470 info->ats_enabled = 1; 1471 domain_update_iotlb(info->domain); 1472 info->ats_qdep = pci_ats_queue_depth(pdev); 1473 } 1474 } 1475 1476 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1477 { 1478 struct pci_dev *pdev; 1479 1480 assert_spin_locked(&device_domain_lock); 1481 1482 if (!dev_is_pci(info->dev)) 1483 return; 1484 1485 pdev = to_pci_dev(info->dev); 1486 1487 if (info->ats_enabled) { 1488 pci_disable_ats(pdev); 1489 info->ats_enabled = 0; 1490 domain_update_iotlb(info->domain); 1491 } 1492 #ifdef CONFIG_INTEL_IOMMU_SVM 1493 if (info->pri_enabled) { 1494 pci_disable_pri(pdev); 1495 info->pri_enabled = 0; 1496 } 1497 if (info->pasid_enabled) { 1498 pci_disable_pasid(pdev); 1499 info->pasid_enabled = 0; 1500 } 1501 #endif 1502 } 1503 1504 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1505 u64 addr, unsigned int mask) 1506 { 1507 u16 sid, qdep; 1508 1509 if (!info || !info->ats_enabled) 1510 return; 1511 1512 sid = info->bus << 8 | info->devfn; 1513 qdep = info->ats_qdep; 1514 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1515 qdep, addr, mask); 1516 } 1517 1518 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1519 u64 addr, unsigned mask) 1520 { 1521 unsigned long flags; 1522 struct device_domain_info *info; 1523 1524 if (!domain->has_iotlb_device) 1525 return; 1526 1527 spin_lock_irqsave(&device_domain_lock, flags); 1528 list_for_each_entry(info, &domain->devices, link) 1529 __iommu_flush_dev_iotlb(info, addr, mask); 1530 1531 spin_unlock_irqrestore(&device_domain_lock, flags); 1532 } 1533 1534 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1535 struct dmar_domain *domain, 1536 unsigned long pfn, unsigned int pages, 1537 int ih, int map) 1538 { 1539 unsigned int aligned_pages = __roundup_pow_of_two(pages); 1540 unsigned int mask = ilog2(aligned_pages); 1541 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1542 u16 did = domain->iommu_did[iommu->seq_id]; 1543 1544 BUG_ON(pages == 0); 1545 1546 if (ih) 1547 ih = 1 << 6; 1548 1549 if (domain_use_first_level(domain)) { 1550 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1551 } else { 1552 unsigned long bitmask = aligned_pages - 1; 1553 1554 /* 1555 * PSI masks the low order bits of the base address. If the 1556 * address isn't aligned to the mask, then compute a mask value 1557 * needed to ensure the target range is flushed. 
1558 */ 1559 if (unlikely(bitmask & pfn)) { 1560 unsigned long end_pfn = pfn + pages - 1, shared_bits; 1561 1562 /* 1563 * Since end_pfn <= pfn + bitmask, the only way bits 1564 * higher than bitmask can differ in pfn and end_pfn is 1565 * by carrying. This means after masking out bitmask, 1566 * high bits starting with the first set bit in 1567 * shared_bits are all equal in both pfn and end_pfn. 1568 */ 1569 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 1570 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; 1571 } 1572 1573 /* 1574 * Fallback to domain selective flush if no PSI support or 1575 * the size is too big. 1576 */ 1577 if (!cap_pgsel_inv(iommu->cap) || 1578 mask > cap_max_amask_val(iommu->cap)) 1579 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1580 DMA_TLB_DSI_FLUSH); 1581 else 1582 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1583 DMA_TLB_PSI_FLUSH); 1584 } 1585 1586 /* 1587 * In caching mode, changes of pages from non-present to present require 1588 * flush. However, device IOTLB doesn't need to be flushed in this case. 1589 */ 1590 if (!cap_caching_mode(iommu->cap) || !map) 1591 iommu_flush_dev_iotlb(domain, addr, mask); 1592 } 1593 1594 /* Notification for newly created mappings */ 1595 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1596 struct dmar_domain *domain, 1597 unsigned long pfn, unsigned int pages) 1598 { 1599 /* 1600 * It's a non-present to present mapping. Only flush if caching mode 1601 * and second level. 1602 */ 1603 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1604 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1605 else 1606 iommu_flush_write_buffer(iommu); 1607 } 1608 1609 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1610 { 1611 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1612 int idx; 1613 1614 for_each_domain_iommu(idx, dmar_domain) { 1615 struct intel_iommu *iommu = g_iommus[idx]; 1616 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1617 1618 if (domain_use_first_level(dmar_domain)) 1619 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1620 else 1621 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1622 DMA_TLB_DSI_FLUSH); 1623 1624 if (!cap_caching_mode(iommu->cap)) 1625 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1626 } 1627 } 1628 1629 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1630 { 1631 u32 pmen; 1632 unsigned long flags; 1633 1634 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1635 return; 1636 1637 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1638 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1639 pmen &= ~DMA_PMEN_EPM; 1640 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1641 1642 /* wait for the protected region status bit to clear */ 1643 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1644 readl, !(pmen & DMA_PMEN_PRS), pmen); 1645 1646 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1647 } 1648 1649 static void iommu_enable_translation(struct intel_iommu *iommu) 1650 { 1651 u32 sts; 1652 unsigned long flags; 1653 1654 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1655 iommu->gcmd |= DMA_GCMD_TE; 1656 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1657 1658 /* Make sure hardware complete it */ 1659 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1660 readl, (sts & DMA_GSTS_TES), sts); 1661 1662 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1663 } 1664 1665 static void iommu_disable_translation(struct intel_iommu *iommu) 1666 { 1667 u32 sts; 1668 unsigned long flag; 1669 1670 if 
(iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1671 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1672 return; 1673 1674 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1675 iommu->gcmd &= ~DMA_GCMD_TE; 1676 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1677 1678 /* Make sure hardware complete it */ 1679 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1680 readl, (!(sts & DMA_GSTS_TES)), sts); 1681 1682 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1683 } 1684 1685 static int iommu_init_domains(struct intel_iommu *iommu) 1686 { 1687 u32 ndomains; 1688 1689 ndomains = cap_ndoms(iommu->cap); 1690 pr_debug("%s: Number of Domains supported <%d>\n", 1691 iommu->name, ndomains); 1692 1693 spin_lock_init(&iommu->lock); 1694 1695 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1696 if (!iommu->domain_ids) 1697 return -ENOMEM; 1698 1699 /* 1700 * If Caching mode is set, then invalid translations are tagged 1701 * with domain-id 0, hence we need to pre-allocate it. We also 1702 * use domain-id 0 as a marker for non-allocated domain-id, so 1703 * make sure it is not used for a real domain. 1704 */ 1705 set_bit(0, iommu->domain_ids); 1706 1707 /* 1708 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1709 * entry for first-level or pass-through translation modes should 1710 * be programmed with a domain id different from those used for 1711 * second-level or nested translation. We reserve a domain id for 1712 * this purpose. 1713 */ 1714 if (sm_supported(iommu)) 1715 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1716 1717 return 0; 1718 } 1719 1720 static void disable_dmar_iommu(struct intel_iommu *iommu) 1721 { 1722 struct device_domain_info *info, *tmp; 1723 unsigned long flags; 1724 1725 if (!iommu->domain_ids) 1726 return; 1727 1728 spin_lock_irqsave(&device_domain_lock, flags); 1729 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1730 if (info->iommu != iommu) 1731 continue; 1732 1733 if (!info->dev || !info->domain) 1734 continue; 1735 1736 __dmar_remove_one_dev_info(info); 1737 } 1738 spin_unlock_irqrestore(&device_domain_lock, flags); 1739 1740 if (iommu->gcmd & DMA_GCMD_TE) 1741 iommu_disable_translation(iommu); 1742 } 1743 1744 static void free_dmar_iommu(struct intel_iommu *iommu) 1745 { 1746 if (iommu->domain_ids) { 1747 bitmap_free(iommu->domain_ids); 1748 iommu->domain_ids = NULL; 1749 } 1750 1751 g_iommus[iommu->seq_id] = NULL; 1752 1753 /* free context mapping */ 1754 free_context_table(iommu); 1755 1756 #ifdef CONFIG_INTEL_IOMMU_SVM 1757 if (pasid_supported(iommu)) { 1758 if (ecap_prs(iommu->ecap)) 1759 intel_svm_finish_prq(iommu); 1760 } 1761 if (vccap_pasid(iommu->vccap)) 1762 ioasid_unregister_allocator(&iommu->pasid_allocator); 1763 1764 #endif 1765 } 1766 1767 /* 1768 * Check and return whether first level is used by default for 1769 * DMA translation. 
1770 */ 1771 static bool first_level_by_default(unsigned int type) 1772 { 1773 /* Only SL is available in legacy mode */ 1774 if (!scalable_mode_support()) 1775 return false; 1776 1777 /* Only level (either FL or SL) is available, just use it */ 1778 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1779 return intel_cap_flts_sanity(); 1780 1781 /* Both levels are available, decide it based on domain type */ 1782 return type != IOMMU_DOMAIN_UNMANAGED; 1783 } 1784 1785 static struct dmar_domain *alloc_domain(unsigned int type) 1786 { 1787 struct dmar_domain *domain; 1788 1789 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1790 if (!domain) 1791 return NULL; 1792 1793 domain->nid = NUMA_NO_NODE; 1794 if (first_level_by_default(type)) 1795 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1796 domain->has_iotlb_device = false; 1797 INIT_LIST_HEAD(&domain->devices); 1798 1799 return domain; 1800 } 1801 1802 /* Must be called with iommu->lock */ 1803 static int domain_attach_iommu(struct dmar_domain *domain, 1804 struct intel_iommu *iommu) 1805 { 1806 unsigned long ndomains; 1807 int num; 1808 1809 assert_spin_locked(&device_domain_lock); 1810 assert_spin_locked(&iommu->lock); 1811 1812 domain->iommu_refcnt[iommu->seq_id] += 1; 1813 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1814 ndomains = cap_ndoms(iommu->cap); 1815 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1816 1817 if (num >= ndomains) { 1818 pr_err("%s: No free domain ids\n", iommu->name); 1819 domain->iommu_refcnt[iommu->seq_id] -= 1; 1820 return -ENOSPC; 1821 } 1822 1823 set_bit(num, iommu->domain_ids); 1824 domain->iommu_did[iommu->seq_id] = num; 1825 domain->nid = iommu->node; 1826 domain_update_iommu_cap(domain); 1827 } 1828 1829 return 0; 1830 } 1831 1832 static void domain_detach_iommu(struct dmar_domain *domain, 1833 struct intel_iommu *iommu) 1834 { 1835 int num; 1836 1837 assert_spin_locked(&device_domain_lock); 1838 assert_spin_locked(&iommu->lock); 1839 1840 domain->iommu_refcnt[iommu->seq_id] -= 1; 1841 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1842 num = domain->iommu_did[iommu->seq_id]; 1843 clear_bit(num, iommu->domain_ids); 1844 domain_update_iommu_cap(domain); 1845 domain->iommu_did[iommu->seq_id] = 0; 1846 } 1847 } 1848 1849 static inline int guestwidth_to_adjustwidth(int gaw) 1850 { 1851 int agaw; 1852 int r = (gaw - 12) % 9; 1853 1854 if (r == 0) 1855 agaw = gaw; 1856 else 1857 agaw = gaw + 9 - r; 1858 if (agaw > 64) 1859 agaw = 64; 1860 return agaw; 1861 } 1862 1863 static void domain_exit(struct dmar_domain *domain) 1864 { 1865 1866 /* Remove associated devices and clear attached or cached domains */ 1867 domain_remove_dev_info(domain); 1868 1869 if (domain->pgd) { 1870 LIST_HEAD(freelist); 1871 1872 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1873 put_pages_list(&freelist); 1874 } 1875 1876 kfree(domain); 1877 } 1878 1879 /* 1880 * Get the PASID directory size for scalable mode context entry. 1881 * Value of X in the PDTS field of a scalable mode context entry 1882 * indicates PASID directory with 2^(X + 7) entries. 1883 */ 1884 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1885 { 1886 unsigned long pds, max_pde; 1887 1888 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1889 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1890 if (pds < 7) 1891 return 0; 1892 1893 return pds - 7; 1894 } 1895 1896 /* 1897 * Set the RID_PASID field of a scalable mode context entry. 
The 1898 * IOMMU hardware will use the PASID value set in this field for 1899 * DMA translations of DMA requests without PASID. 1900 */ 1901 static inline void 1902 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1903 { 1904 context->hi |= pasid & ((1 << 20) - 1); 1905 } 1906 1907 /* 1908 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1909 * entry. 1910 */ 1911 static inline void context_set_sm_dte(struct context_entry *context) 1912 { 1913 context->lo |= (1 << 2); 1914 } 1915 1916 /* 1917 * Set the PRE(Page Request Enable) field of a scalable mode context 1918 * entry. 1919 */ 1920 static inline void context_set_sm_pre(struct context_entry *context) 1921 { 1922 context->lo |= (1 << 4); 1923 } 1924 1925 /* Convert value to context PASID directory size field coding. */ 1926 #define context_pdts(pds) (((pds) & 0x7) << 9) 1927 1928 static int domain_context_mapping_one(struct dmar_domain *domain, 1929 struct intel_iommu *iommu, 1930 struct pasid_table *table, 1931 u8 bus, u8 devfn) 1932 { 1933 u16 did = domain->iommu_did[iommu->seq_id]; 1934 int translation = CONTEXT_TT_MULTI_LEVEL; 1935 struct device_domain_info *info = NULL; 1936 struct context_entry *context; 1937 unsigned long flags; 1938 int ret; 1939 1940 WARN_ON(did == 0); 1941 1942 if (hw_pass_through && domain_type_is_si(domain)) 1943 translation = CONTEXT_TT_PASS_THROUGH; 1944 1945 pr_debug("Set context mapping for %02x:%02x.%d\n", 1946 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1947 1948 BUG_ON(!domain->pgd); 1949 1950 spin_lock_irqsave(&device_domain_lock, flags); 1951 spin_lock(&iommu->lock); 1952 1953 ret = -ENOMEM; 1954 context = iommu_context_addr(iommu, bus, devfn, 1); 1955 if (!context) 1956 goto out_unlock; 1957 1958 ret = 0; 1959 if (context_present(context)) 1960 goto out_unlock; 1961 1962 /* 1963 * For kdump cases, old valid entries may be cached due to the 1964 * in-flight DMA and copied pgtable, but there is no unmapping 1965 * behaviour for them, thus we need an explicit cache flush for 1966 * the newly-mapped device. For kdump, at this point, the device 1967 * is supposed to finish reset at its driver probe stage, so no 1968 * in-flight DMA will exist, and we don't need to worry anymore 1969 * hereafter. 
1970 */ 1971 if (context_copied(context)) { 1972 u16 did_old = context_domain_id(context); 1973 1974 if (did_old < cap_ndoms(iommu->cap)) { 1975 iommu->flush.flush_context(iommu, did_old, 1976 (((u16)bus) << 8) | devfn, 1977 DMA_CCMD_MASK_NOBIT, 1978 DMA_CCMD_DEVICE_INVL); 1979 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 1980 DMA_TLB_DSI_FLUSH); 1981 } 1982 } 1983 1984 context_clear_entry(context); 1985 1986 if (sm_supported(iommu)) { 1987 unsigned long pds; 1988 1989 WARN_ON(!table); 1990 1991 /* Setup the PASID DIR pointer: */ 1992 pds = context_get_sm_pds(table); 1993 context->lo = (u64)virt_to_phys(table->table) | 1994 context_pdts(pds); 1995 1996 /* Setup the RID_PASID field: */ 1997 context_set_sm_rid2pasid(context, PASID_RID2PASID); 1998 1999 /* 2000 * Setup the Device-TLB enable bit and Page request 2001 * Enable bit: 2002 */ 2003 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2004 if (info && info->ats_supported) 2005 context_set_sm_dte(context); 2006 if (info && info->pri_supported) 2007 context_set_sm_pre(context); 2008 } else { 2009 struct dma_pte *pgd = domain->pgd; 2010 int agaw; 2011 2012 context_set_domain_id(context, did); 2013 2014 if (translation != CONTEXT_TT_PASS_THROUGH) { 2015 /* 2016 * Skip top levels of page tables for iommu which has 2017 * less agaw than default. Unnecessary for PT mode. 2018 */ 2019 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2020 ret = -ENOMEM; 2021 pgd = phys_to_virt(dma_pte_addr(pgd)); 2022 if (!dma_pte_present(pgd)) 2023 goto out_unlock; 2024 } 2025 2026 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2027 if (info && info->ats_supported) 2028 translation = CONTEXT_TT_DEV_IOTLB; 2029 else 2030 translation = CONTEXT_TT_MULTI_LEVEL; 2031 2032 context_set_address_root(context, virt_to_phys(pgd)); 2033 context_set_address_width(context, agaw); 2034 } else { 2035 /* 2036 * In pass through mode, AW must be programmed to 2037 * indicate the largest AGAW value supported by 2038 * hardware. And ASR is ignored by hardware. 2039 */ 2040 context_set_address_width(context, iommu->msagaw); 2041 } 2042 2043 context_set_translation_type(context, translation); 2044 } 2045 2046 context_set_fault_enable(context); 2047 context_set_present(context); 2048 if (!ecap_coherent(iommu->ecap)) 2049 clflush_cache_range(context, sizeof(*context)); 2050 2051 /* 2052 * It's a non-present to present mapping. If hardware doesn't cache 2053 * non-present entry we only need to flush the write-buffer. 
If the 2054 * _does_ cache non-present entries, then it does so in the special 2055 * domain #0, which we have to flush: 2056 */ 2057 if (cap_caching_mode(iommu->cap)) { 2058 iommu->flush.flush_context(iommu, 0, 2059 (((u16)bus) << 8) | devfn, 2060 DMA_CCMD_MASK_NOBIT, 2061 DMA_CCMD_DEVICE_INVL); 2062 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2063 } else { 2064 iommu_flush_write_buffer(iommu); 2065 } 2066 iommu_enable_dev_iotlb(info); 2067 2068 ret = 0; 2069 2070 out_unlock: 2071 spin_unlock(&iommu->lock); 2072 spin_unlock_irqrestore(&device_domain_lock, flags); 2073 2074 return ret; 2075 } 2076 2077 struct domain_context_mapping_data { 2078 struct dmar_domain *domain; 2079 struct intel_iommu *iommu; 2080 struct pasid_table *table; 2081 }; 2082 2083 static int domain_context_mapping_cb(struct pci_dev *pdev, 2084 u16 alias, void *opaque) 2085 { 2086 struct domain_context_mapping_data *data = opaque; 2087 2088 return domain_context_mapping_one(data->domain, data->iommu, 2089 data->table, PCI_BUS_NUM(alias), 2090 alias & 0xff); 2091 } 2092 2093 static int 2094 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2095 { 2096 struct domain_context_mapping_data data; 2097 struct pasid_table *table; 2098 struct intel_iommu *iommu; 2099 u8 bus, devfn; 2100 2101 iommu = device_to_iommu(dev, &bus, &devfn); 2102 if (!iommu) 2103 return -ENODEV; 2104 2105 table = intel_pasid_get_table(dev); 2106 2107 if (!dev_is_pci(dev)) 2108 return domain_context_mapping_one(domain, iommu, table, 2109 bus, devfn); 2110 2111 data.domain = domain; 2112 data.iommu = iommu; 2113 data.table = table; 2114 2115 return pci_for_each_dma_alias(to_pci_dev(dev), 2116 &domain_context_mapping_cb, &data); 2117 } 2118 2119 static int domain_context_mapped_cb(struct pci_dev *pdev, 2120 u16 alias, void *opaque) 2121 { 2122 struct intel_iommu *iommu = opaque; 2123 2124 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2125 } 2126 2127 static int domain_context_mapped(struct device *dev) 2128 { 2129 struct intel_iommu *iommu; 2130 u8 bus, devfn; 2131 2132 iommu = device_to_iommu(dev, &bus, &devfn); 2133 if (!iommu) 2134 return -ENODEV; 2135 2136 if (!dev_is_pci(dev)) 2137 return device_context_mapped(iommu, bus, devfn); 2138 2139 return !pci_for_each_dma_alias(to_pci_dev(dev), 2140 domain_context_mapped_cb, iommu); 2141 } 2142 2143 /* Returns a number of VTD pages, but aligned to MM page size */ 2144 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2145 size_t size) 2146 { 2147 host_addr &= ~PAGE_MASK; 2148 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2149 } 2150 2151 /* Return largest possible superpage level for a given mapping */ 2152 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2153 unsigned long iov_pfn, 2154 unsigned long phy_pfn, 2155 unsigned long pages) 2156 { 2157 int support, level = 1; 2158 unsigned long pfnmerge; 2159 2160 support = domain->iommu_superpage; 2161 2162 /* To use a large page, the virtual *and* physical addresses 2163 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2164 of them will mean we have to use smaller pages. So just 2165 merge them and check both at once. 
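   A hedged worked example (illustrative numbers only): with
   iov_pfn == 0x1200, phy_pfn == 0x3200 and pages == 0x300, the merged
   value 0x3200 has its low nine bits clear, so - assuming the domain
   reports at least one level of superpage support - the loop settles
   on level 2. The first 512 pages can then be covered by one 2MiB
   entry and the trailing 256 pages fall back to 4KiB entries on a
   later pass of __domain_mapping().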
*/ 2166 pfnmerge = iov_pfn | phy_pfn; 2167 2168 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2169 pages >>= VTD_STRIDE_SHIFT; 2170 if (!pages) 2171 break; 2172 pfnmerge >>= VTD_STRIDE_SHIFT; 2173 level++; 2174 support--; 2175 } 2176 return level; 2177 } 2178 2179 /* 2180 * Ensure that old small page tables are removed to make room for superpage(s). 2181 * We're going to add new large pages, so make sure we don't remove their parent 2182 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2183 */ 2184 static void switch_to_super_page(struct dmar_domain *domain, 2185 unsigned long start_pfn, 2186 unsigned long end_pfn, int level) 2187 { 2188 unsigned long lvl_pages = lvl_to_nr_pages(level); 2189 struct dma_pte *pte = NULL; 2190 int i; 2191 2192 while (start_pfn <= end_pfn) { 2193 if (!pte) 2194 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2195 2196 if (dma_pte_present(pte)) { 2197 dma_pte_free_pagetable(domain, start_pfn, 2198 start_pfn + lvl_pages - 1, 2199 level + 1); 2200 2201 for_each_domain_iommu(i, domain) 2202 iommu_flush_iotlb_psi(g_iommus[i], domain, 2203 start_pfn, lvl_pages, 2204 0, 0); 2205 } 2206 2207 pte++; 2208 start_pfn += lvl_pages; 2209 if (first_pte_in_page(pte)) 2210 pte = NULL; 2211 } 2212 } 2213 2214 static int 2215 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2216 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2217 { 2218 struct dma_pte *first_pte = NULL, *pte = NULL; 2219 unsigned int largepage_lvl = 0; 2220 unsigned long lvl_pages = 0; 2221 phys_addr_t pteval; 2222 u64 attr; 2223 2224 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2225 2226 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2227 return -EINVAL; 2228 2229 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2230 attr |= DMA_FL_PTE_PRESENT; 2231 if (domain_use_first_level(domain)) { 2232 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2233 if (prot & DMA_PTE_WRITE) 2234 attr |= DMA_FL_PTE_DIRTY; 2235 } 2236 2237 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2238 2239 while (nr_pages > 0) { 2240 uint64_t tmp; 2241 2242 if (!pte) { 2243 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2244 phys_pfn, nr_pages); 2245 2246 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2247 if (!pte) 2248 return -ENOMEM; 2249 first_pte = pte; 2250 2251 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2252 2253 /* It is large page*/ 2254 if (largepage_lvl > 1) { 2255 unsigned long end_pfn; 2256 unsigned long pages_to_remove; 2257 2258 pteval |= DMA_PTE_LARGE_PAGE; 2259 pages_to_remove = min_t(unsigned long, nr_pages, 2260 nr_pte_to_next_page(pte) * lvl_pages); 2261 end_pfn = iov_pfn + pages_to_remove - 1; 2262 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2263 } else { 2264 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2265 } 2266 2267 } 2268 /* We don't need lock here, nobody else 2269 * touches the iova range 2270 */ 2271 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2272 if (tmp) { 2273 static int dumps = 5; 2274 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2275 iov_pfn, tmp, (unsigned long long)pteval); 2276 if (dumps) { 2277 dumps--; 2278 debug_dma_dump_mappings(NULL); 2279 } 2280 WARN_ON(1); 2281 } 2282 2283 nr_pages -= lvl_pages; 2284 iov_pfn += lvl_pages; 2285 phys_pfn += lvl_pages; 2286 pteval += lvl_pages * VTD_PAGE_SIZE; 2287 2288 /* If the next PTE would be the first in a new page, then we 2289 * need to flush the cache on the entries we've just 
written. 2290 * And then we'll need to recalculate 'pte', so clear it and 2291 * let it get set again in the if (!pte) block above. 2292 * 2293 * If we're done (!nr_pages) we need to flush the cache too. 2294 * 2295 * Also if we've been setting superpages, we may need to 2296 * recalculate 'pte' and switch back to smaller pages for the 2297 * end of the mapping, if the trailing size is not enough to 2298 * use another superpage (i.e. nr_pages < lvl_pages). 2299 */ 2300 pte++; 2301 if (!nr_pages || first_pte_in_page(pte) || 2302 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2303 domain_flush_cache(domain, first_pte, 2304 (void *)pte - (void *)first_pte); 2305 pte = NULL; 2306 } 2307 } 2308 2309 return 0; 2310 } 2311 2312 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2313 { 2314 struct intel_iommu *iommu = info->iommu; 2315 struct context_entry *context; 2316 unsigned long flags; 2317 u16 did_old; 2318 2319 if (!iommu) 2320 return; 2321 2322 spin_lock_irqsave(&iommu->lock, flags); 2323 context = iommu_context_addr(iommu, bus, devfn, 0); 2324 if (!context) { 2325 spin_unlock_irqrestore(&iommu->lock, flags); 2326 return; 2327 } 2328 2329 if (sm_supported(iommu)) { 2330 if (hw_pass_through && domain_type_is_si(info->domain)) 2331 did_old = FLPT_DEFAULT_DID; 2332 else 2333 did_old = info->domain->iommu_did[iommu->seq_id]; 2334 } else { 2335 did_old = context_domain_id(context); 2336 } 2337 2338 context_clear_entry(context); 2339 __iommu_flush_cache(iommu, context, sizeof(*context)); 2340 spin_unlock_irqrestore(&iommu->lock, flags); 2341 iommu->flush.flush_context(iommu, 2342 did_old, 2343 (((u16)bus) << 8) | devfn, 2344 DMA_CCMD_MASK_NOBIT, 2345 DMA_CCMD_DEVICE_INVL); 2346 2347 if (sm_supported(iommu)) 2348 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2349 2350 iommu->flush.flush_iotlb(iommu, 2351 did_old, 2352 0, 2353 0, 2354 DMA_TLB_DSI_FLUSH); 2355 2356 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2357 } 2358 2359 static void domain_remove_dev_info(struct dmar_domain *domain) 2360 { 2361 struct device_domain_info *info, *tmp; 2362 unsigned long flags; 2363 2364 spin_lock_irqsave(&device_domain_lock, flags); 2365 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2366 __dmar_remove_one_dev_info(info); 2367 spin_unlock_irqrestore(&device_domain_lock, flags); 2368 } 2369 2370 static inline struct device_domain_info * 2371 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2372 { 2373 struct device_domain_info *info; 2374 2375 list_for_each_entry(info, &device_domain_list, global) 2376 if (info->segment == segment && info->bus == bus && 2377 info->devfn == devfn) 2378 return info; 2379 2380 return NULL; 2381 } 2382 2383 static int domain_setup_first_level(struct intel_iommu *iommu, 2384 struct dmar_domain *domain, 2385 struct device *dev, 2386 u32 pasid) 2387 { 2388 struct dma_pte *pgd = domain->pgd; 2389 int agaw, level; 2390 int flags = 0; 2391 2392 /* 2393 * Skip top levels of page tables for iommu which has 2394 * less agaw than default. Unnecessary for PT mode. 
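 *
 * A hedged example: a domain built for a 57-bit guest address width
 * has agaw == 3 (a five-level table), while an IOMMU whose hardware
 * AGAW tops out at 48 bits has iommu->agaw == 2; the loop below then
 * steps down one level from the domain's top PGD so that a four-level
 * sub-table is what gets handed to intel_pasid_setup_first_level().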
2395 */ 2396 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2397 pgd = phys_to_virt(dma_pte_addr(pgd)); 2398 if (!dma_pte_present(pgd)) 2399 return -ENOMEM; 2400 } 2401 2402 level = agaw_to_level(agaw); 2403 if (level != 4 && level != 5) 2404 return -EINVAL; 2405 2406 if (pasid != PASID_RID2PASID) 2407 flags |= PASID_FLAG_SUPERVISOR_MODE; 2408 if (level == 5) 2409 flags |= PASID_FLAG_FL5LP; 2410 2411 if (domain->force_snooping) 2412 flags |= PASID_FLAG_PAGE_SNOOP; 2413 2414 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2415 domain->iommu_did[iommu->seq_id], 2416 flags); 2417 } 2418 2419 static bool dev_is_real_dma_subdevice(struct device *dev) 2420 { 2421 return dev && dev_is_pci(dev) && 2422 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2423 } 2424 2425 static int iommu_domain_identity_map(struct dmar_domain *domain, 2426 unsigned long first_vpfn, 2427 unsigned long last_vpfn) 2428 { 2429 /* 2430 * RMRR range might have overlap with physical memory range, 2431 * clear it first 2432 */ 2433 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2434 2435 return __domain_mapping(domain, first_vpfn, 2436 first_vpfn, last_vpfn - first_vpfn + 1, 2437 DMA_PTE_READ|DMA_PTE_WRITE); 2438 } 2439 2440 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2441 2442 static int __init si_domain_init(int hw) 2443 { 2444 struct dmar_rmrr_unit *rmrr; 2445 struct device *dev; 2446 int i, nid, ret; 2447 2448 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2449 if (!si_domain) 2450 return -EFAULT; 2451 2452 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2453 domain_exit(si_domain); 2454 return -EFAULT; 2455 } 2456 2457 if (hw) 2458 return 0; 2459 2460 for_each_online_node(nid) { 2461 unsigned long start_pfn, end_pfn; 2462 int i; 2463 2464 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2465 ret = iommu_domain_identity_map(si_domain, 2466 mm_to_dma_pfn(start_pfn), 2467 mm_to_dma_pfn(end_pfn)); 2468 if (ret) 2469 return ret; 2470 } 2471 } 2472 2473 /* 2474 * Identity map the RMRRs so that devices with RMRRs could also use 2475 * the si_domain. 2476 */ 2477 for_each_rmrr_units(rmrr) { 2478 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2479 i, dev) { 2480 unsigned long long start = rmrr->base_address; 2481 unsigned long long end = rmrr->end_address; 2482 2483 if (WARN_ON(end < start || 2484 end >> agaw_to_width(si_domain->agaw))) 2485 continue; 2486 2487 ret = iommu_domain_identity_map(si_domain, 2488 mm_to_dma_pfn(start >> PAGE_SHIFT), 2489 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2490 if (ret) 2491 return ret; 2492 } 2493 } 2494 2495 return 0; 2496 } 2497 2498 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2499 { 2500 struct device_domain_info *info = dev_iommu_priv_get(dev); 2501 struct intel_iommu *iommu; 2502 unsigned long flags; 2503 u8 bus, devfn; 2504 int ret; 2505 2506 iommu = device_to_iommu(dev, &bus, &devfn); 2507 if (!iommu) 2508 return -ENODEV; 2509 2510 spin_lock_irqsave(&device_domain_lock, flags); 2511 info->domain = domain; 2512 spin_lock(&iommu->lock); 2513 ret = domain_attach_iommu(domain, iommu); 2514 spin_unlock(&iommu->lock); 2515 if (ret) { 2516 spin_unlock_irqrestore(&device_domain_lock, flags); 2517 return ret; 2518 } 2519 list_add(&info->link, &domain->devices); 2520 spin_unlock_irqrestore(&device_domain_lock, flags); 2521 2522 /* PASID table is mandatory for a PCI device in scalable mode. 
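 In scalable mode even requests without a PASID are translated through a PASID-table entry (PASID_RID2PASID), which is why the table is allocated here and the RID2PASID entry is programmed just below for pass-through, first-level or second-level translation depending on the domain type.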
*/ 2523 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2524 ret = intel_pasid_alloc_table(dev); 2525 if (ret) { 2526 dev_err(dev, "PASID table allocation failed\n"); 2527 dmar_remove_one_dev_info(dev); 2528 return ret; 2529 } 2530 2531 /* Setup the PASID entry for requests without PASID: */ 2532 spin_lock_irqsave(&iommu->lock, flags); 2533 if (hw_pass_through && domain_type_is_si(domain)) 2534 ret = intel_pasid_setup_pass_through(iommu, domain, 2535 dev, PASID_RID2PASID); 2536 else if (domain_use_first_level(domain)) 2537 ret = domain_setup_first_level(iommu, domain, dev, 2538 PASID_RID2PASID); 2539 else 2540 ret = intel_pasid_setup_second_level(iommu, domain, 2541 dev, PASID_RID2PASID); 2542 spin_unlock_irqrestore(&iommu->lock, flags); 2543 if (ret) { 2544 dev_err(dev, "Setup RID2PASID failed\n"); 2545 dmar_remove_one_dev_info(dev); 2546 return ret; 2547 } 2548 } 2549 2550 ret = domain_context_mapping(domain, dev); 2551 if (ret) { 2552 dev_err(dev, "Domain context map failed\n"); 2553 dmar_remove_one_dev_info(dev); 2554 return ret; 2555 } 2556 2557 return 0; 2558 } 2559 2560 static bool device_has_rmrr(struct device *dev) 2561 { 2562 struct dmar_rmrr_unit *rmrr; 2563 struct device *tmp; 2564 int i; 2565 2566 rcu_read_lock(); 2567 for_each_rmrr_units(rmrr) { 2568 /* 2569 * Return TRUE if this RMRR contains the device that 2570 * is passed in. 2571 */ 2572 for_each_active_dev_scope(rmrr->devices, 2573 rmrr->devices_cnt, i, tmp) 2574 if (tmp == dev || 2575 is_downstream_to_pci_bridge(dev, tmp)) { 2576 rcu_read_unlock(); 2577 return true; 2578 } 2579 } 2580 rcu_read_unlock(); 2581 return false; 2582 } 2583 2584 /** 2585 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2586 * is relaxable (ie. is allowed to be not enforced under some conditions) 2587 * @dev: device handle 2588 * 2589 * We assume that PCI USB devices with RMRRs have them largely 2590 * for historical reasons and that the RMRR space is not actively used post 2591 * boot. This exclusion may change if vendors begin to abuse it. 2592 * 2593 * The same exception is made for graphics devices, with the requirement that 2594 * any use of the RMRR regions will be torn down before assigning the device 2595 * to a guest. 2596 * 2597 * Return: true if the RMRR is relaxable, false otherwise 2598 */ 2599 static bool device_rmrr_is_relaxable(struct device *dev) 2600 { 2601 struct pci_dev *pdev; 2602 2603 if (!dev_is_pci(dev)) 2604 return false; 2605 2606 pdev = to_pci_dev(dev); 2607 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2608 return true; 2609 else 2610 return false; 2611 } 2612 2613 /* 2614 * There are a couple cases where we need to restrict the functionality of 2615 * devices associated with RMRRs. The first is when evaluating a device for 2616 * identity mapping because problems exist when devices are moved in and out 2617 * of domains and their respective RMRR information is lost. This means that 2618 * a device with associated RMRRs will never be in a "passthrough" domain. 2619 * The second is use of the device through the IOMMU API. This interface 2620 * expects to have full control of the IOVA space for the device. We cannot 2621 * satisfy both the requirement that RMRR access is maintained and have an 2622 * unencumbered IOVA space. We also have no ability to quiesce the device's 2623 * use of the RMRR space or even inform the IOMMU API user of the restriction. 
2624 * We therefore prevent devices associated with an RMRR from participating in 2625 * the IOMMU API, which eliminates them from device assignment. 2626 * 2627 * In both cases, devices which have relaxable RMRRs are not concerned by this 2628 * restriction. See device_rmrr_is_relaxable comment. 2629 */ 2630 static bool device_is_rmrr_locked(struct device *dev) 2631 { 2632 if (!device_has_rmrr(dev)) 2633 return false; 2634 2635 if (device_rmrr_is_relaxable(dev)) 2636 return false; 2637 2638 return true; 2639 } 2640 2641 /* 2642 * Return the required default domain type for a specific device. 2643 * 2644 * @dev: the device being queried 2646 * 2647 * Returns: 2648 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2649 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain 2650 * - 0: both identity and dynamic domains work for this device 2651 */ 2652 static int device_def_domain_type(struct device *dev) 2653 { 2654 if (dev_is_pci(dev)) { 2655 struct pci_dev *pdev = to_pci_dev(dev); 2656 2657 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2658 return IOMMU_DOMAIN_IDENTITY; 2659 2660 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2661 return IOMMU_DOMAIN_IDENTITY; 2662 } 2663 2664 return 0; 2665 } 2666 2667 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2668 { 2669 /* 2670 * Start from a sane IOMMU hardware state. 2671 * If the queued invalidation is already initialized by us 2672 * (for example, while enabling interrupt-remapping) then 2673 * things are already rolling from a sane state. 2674 */ 2675 if (!iommu->qi) { 2676 /* 2677 * Clear any previous faults. 2678 */ 2679 dmar_fault(-1, iommu); 2680 /* 2681 * Disable queued invalidation if supported and already enabled 2682 * before OS handover. 2683 */ 2684 dmar_disable_qi(iommu); 2685 } 2686 2687 if (dmar_enable_qi(iommu)) { 2688 /* 2689 * Queued invalidation could not be enabled; fall back to register-based invalidation. 2690 */ 2691 iommu->flush.flush_context = __iommu_flush_context; 2692 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2693 pr_info("%s: Using Register based invalidation\n", 2694 iommu->name); 2695 } else { 2696 iommu->flush.flush_context = qi_flush_context; 2697 iommu->flush.flush_iotlb = qi_flush_iotlb; 2698 pr_info("%s: Using Queued invalidation\n", iommu->name); 2699 } 2700 } 2701 2702 static int copy_context_table(struct intel_iommu *iommu, 2703 struct root_entry *old_re, 2704 struct context_entry **tbl, 2705 int bus, bool ext) 2706 { 2707 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2708 struct context_entry *new_ce = NULL, ce; 2709 struct context_entry *old_ce = NULL; 2710 struct root_entry re; 2711 phys_addr_t old_ce_phys; 2712 2713 tbl_idx = ext ? bus * 2 : bus; 2714 memcpy(&re, old_re, sizeof(re)); 2715 2716 for (devfn = 0; devfn < 256; devfn++) { 2717 /* First calculate the correct index */ 2718 idx = (ext ?
devfn * 2 : devfn) % 256; 2719 2720 if (idx == 0) { 2721 /* First save what we may have and clean up */ 2722 if (new_ce) { 2723 tbl[tbl_idx] = new_ce; 2724 __iommu_flush_cache(iommu, new_ce, 2725 VTD_PAGE_SIZE); 2726 pos = 1; 2727 } 2728 2729 if (old_ce) 2730 memunmap(old_ce); 2731 2732 ret = 0; 2733 if (devfn < 0x80) 2734 old_ce_phys = root_entry_lctp(&re); 2735 else 2736 old_ce_phys = root_entry_uctp(&re); 2737 2738 if (!old_ce_phys) { 2739 if (ext && devfn == 0) { 2740 /* No LCTP, try UCTP */ 2741 devfn = 0x7f; 2742 continue; 2743 } else { 2744 goto out; 2745 } 2746 } 2747 2748 ret = -ENOMEM; 2749 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2750 MEMREMAP_WB); 2751 if (!old_ce) 2752 goto out; 2753 2754 new_ce = alloc_pgtable_page(iommu->node); 2755 if (!new_ce) 2756 goto out_unmap; 2757 2758 ret = 0; 2759 } 2760 2761 /* Now copy the context entry */ 2762 memcpy(&ce, old_ce + idx, sizeof(ce)); 2763 2764 if (!__context_present(&ce)) 2765 continue; 2766 2767 did = context_domain_id(&ce); 2768 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2769 set_bit(did, iommu->domain_ids); 2770 2771 /* 2772 * We need a marker for copied context entries. This 2773 * marker needs to work for the old format as well as 2774 * for extended context entries. 2775 * 2776 * Bit 67 of the context entry is used. In the old 2777 * format this bit is available to software, in the 2778 * extended format it is the PGE bit, but PGE is ignored 2779 * by HW if PASIDs are disabled (and thus still 2780 * available). 2781 * 2782 * So disable PASIDs first and then mark the entry 2783 * copied. This means that we don't copy PASID 2784 * translations from the old kernel, but this is fine as 2785 * faults there are not fatal. 2786 */ 2787 context_clear_pasid_enable(&ce); 2788 context_set_copied(&ce); 2789 2790 new_ce[idx] = ce; 2791 } 2792 2793 tbl[tbl_idx + pos] = new_ce; 2794 2795 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2796 2797 out_unmap: 2798 memunmap(old_ce); 2799 2800 out: 2801 return ret; 2802 } 2803 2804 static int copy_translation_tables(struct intel_iommu *iommu) 2805 { 2806 struct context_entry **ctxt_tbls; 2807 struct root_entry *old_rt; 2808 phys_addr_t old_rt_phys; 2809 int ctxt_table_entries; 2810 unsigned long flags; 2811 u64 rtaddr_reg; 2812 int bus, ret; 2813 bool new_ext, ext; 2814 2815 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2816 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2817 new_ext = !!ecap_ecs(iommu->ecap); 2818 2819 /* 2820 * The RTT bit can only be changed when translation is disabled, 2821 * but disabling translation means to open a window for data 2822 * corruption. So bail out and don't copy anything if we would 2823 * have to change the bit. 2824 */ 2825 if (new_ext != ext) 2826 return -EINVAL; 2827 2828 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2829 if (!old_rt_phys) 2830 return -EINVAL; 2831 2832 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2833 if (!old_rt) 2834 return -ENOMEM; 2835 2836 /* This is too big for the stack - allocate it from slab */ 2837 ctxt_table_entries = ext ? 
512 : 256; 2838 ret = -ENOMEM; 2839 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2840 if (!ctxt_tbls) 2841 goto out_unmap; 2842 2843 for (bus = 0; bus < 256; bus++) { 2844 ret = copy_context_table(iommu, &old_rt[bus], 2845 ctxt_tbls, bus, ext); 2846 if (ret) { 2847 pr_err("%s: Failed to copy context table for bus %d\n", 2848 iommu->name, bus); 2849 continue; 2850 } 2851 } 2852 2853 spin_lock_irqsave(&iommu->lock, flags); 2854 2855 /* Context tables are copied, now write them to the root_entry table */ 2856 for (bus = 0; bus < 256; bus++) { 2857 int idx = ext ? bus * 2 : bus; 2858 u64 val; 2859 2860 if (ctxt_tbls[idx]) { 2861 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2862 iommu->root_entry[bus].lo = val; 2863 } 2864 2865 if (!ext || !ctxt_tbls[idx + 1]) 2866 continue; 2867 2868 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2869 iommu->root_entry[bus].hi = val; 2870 } 2871 2872 spin_unlock_irqrestore(&iommu->lock, flags); 2873 2874 kfree(ctxt_tbls); 2875 2876 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2877 2878 ret = 0; 2879 2880 out_unmap: 2881 memunmap(old_rt); 2882 2883 return ret; 2884 } 2885 2886 #ifdef CONFIG_INTEL_IOMMU_SVM 2887 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 2888 { 2889 struct intel_iommu *iommu = data; 2890 ioasid_t ioasid; 2891 2892 if (!iommu) 2893 return INVALID_IOASID; 2894 /* 2895 * VT-d virtual command interface always uses the full 20 bit 2896 * PASID range. Host can partition guest PASID range based on 2897 * policies but it is out of guest's control. 2898 */ 2899 if (min < PASID_MIN || max > intel_pasid_max_id) 2900 return INVALID_IOASID; 2901 2902 if (vcmd_alloc_pasid(iommu, &ioasid)) 2903 return INVALID_IOASID; 2904 2905 return ioasid; 2906 } 2907 2908 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 2909 { 2910 struct intel_iommu *iommu = data; 2911 2912 if (!iommu) 2913 return; 2914 /* 2915 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 2916 * We can only free the PASID when all the devices are unbound. 2917 */ 2918 if (ioasid_find(NULL, ioasid, NULL)) { 2919 pr_alert("Cannot free active IOASID %d\n", ioasid); 2920 return; 2921 } 2922 vcmd_free_pasid(iommu, ioasid); 2923 } 2924 2925 static void register_pasid_allocator(struct intel_iommu *iommu) 2926 { 2927 /* 2928 * If we are running in the host, no need for custom allocator 2929 * in that PASIDs are allocated from the host system-wide. 2930 */ 2931 if (!cap_caching_mode(iommu->cap)) 2932 return; 2933 2934 if (!sm_supported(iommu)) { 2935 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 2936 return; 2937 } 2938 2939 /* 2940 * Register a custom PASID allocator if we are running in a guest, 2941 * guest PASID must be obtained via virtual command interface. 2942 * There can be multiple vIOMMUs in each guest but only one allocator 2943 * is active. All vIOMMU allocators will eventually be calling the same 2944 * host allocator. 2945 */ 2946 if (!vccap_pasid(iommu->vccap)) 2947 return; 2948 2949 pr_info("Register custom PASID allocator\n"); 2950 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 2951 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 2952 iommu->pasid_allocator.pdata = (void *)iommu; 2953 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 2954 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 2955 /* 2956 * Disable scalable mode on this IOMMU if there 2957 * is no custom allocator. 
Mixing SM capable vIOMMU 2958 * and non-SM vIOMMU are not supported. 2959 */ 2960 intel_iommu_sm = 0; 2961 } 2962 } 2963 #endif 2964 2965 static int __init init_dmars(void) 2966 { 2967 struct dmar_drhd_unit *drhd; 2968 struct intel_iommu *iommu; 2969 int ret; 2970 2971 /* 2972 * for each drhd 2973 * allocate root 2974 * initialize and program root entry to not present 2975 * endfor 2976 */ 2977 for_each_drhd_unit(drhd) { 2978 /* 2979 * lock not needed as this is only incremented in the single 2980 * threaded kernel __init code path all other access are read 2981 * only 2982 */ 2983 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 2984 g_num_of_iommus++; 2985 continue; 2986 } 2987 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 2988 } 2989 2990 /* Preallocate enough resources for IOMMU hot-addition */ 2991 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 2992 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 2993 2994 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 2995 GFP_KERNEL); 2996 if (!g_iommus) { 2997 ret = -ENOMEM; 2998 goto error; 2999 } 3000 3001 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3002 if (ret) 3003 goto free_iommu; 3004 3005 for_each_iommu(iommu, drhd) { 3006 if (drhd->ignored) { 3007 iommu_disable_translation(iommu); 3008 continue; 3009 } 3010 3011 /* 3012 * Find the max pasid size of all IOMMU's in the system. 3013 * We need to ensure the system pasid table is no bigger 3014 * than the smallest supported. 3015 */ 3016 if (pasid_supported(iommu)) { 3017 u32 temp = 2 << ecap_pss(iommu->ecap); 3018 3019 intel_pasid_max_id = min_t(u32, temp, 3020 intel_pasid_max_id); 3021 } 3022 3023 g_iommus[iommu->seq_id] = iommu; 3024 3025 intel_iommu_init_qi(iommu); 3026 3027 ret = iommu_init_domains(iommu); 3028 if (ret) 3029 goto free_iommu; 3030 3031 init_translation_status(iommu); 3032 3033 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3034 iommu_disable_translation(iommu); 3035 clear_translation_pre_enabled(iommu); 3036 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3037 iommu->name); 3038 } 3039 3040 /* 3041 * TBD: 3042 * we could share the same root & context tables 3043 * among all IOMMU's. Need to Split it later. 3044 */ 3045 ret = iommu_alloc_root_entry(iommu); 3046 if (ret) 3047 goto free_iommu; 3048 3049 if (translation_pre_enabled(iommu)) { 3050 pr_info("Translation already enabled - trying to copy translation structures\n"); 3051 3052 ret = copy_translation_tables(iommu); 3053 if (ret) { 3054 /* 3055 * We found the IOMMU with translation 3056 * enabled - but failed to copy over the 3057 * old root-entry table. Try to proceed 3058 * by disabling translation now and 3059 * allocating a clean root-entry table. 3060 * This might cause DMAR faults, but 3061 * probably the dump will still succeed. 3062 */ 3063 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3064 iommu->name); 3065 iommu_disable_translation(iommu); 3066 clear_translation_pre_enabled(iommu); 3067 } else { 3068 pr_info("Copied translation tables from previous kernel for %s\n", 3069 iommu->name); 3070 } 3071 } 3072 3073 if (!ecap_pass_through(iommu->ecap)) 3074 hw_pass_through = 0; 3075 intel_svm_check(iommu); 3076 } 3077 3078 /* 3079 * Now that qi is enabled on all iommus, set the root entry and flush 3080 * caches. This is required on some Intel X58 chipsets, otherwise the 3081 * flush_context function will loop forever and the boot hangs. 
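 *
 * The loop below therefore flushes each active IOMMU's write buffer,
 * registers the per-IOMMU PASID allocator when CONFIG_INTEL_IOMMU_SVM
 * is enabled, and only then publishes the new root table via
 * iommu_set_root_entry().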
3082 */ 3083 for_each_active_iommu(iommu, drhd) { 3084 iommu_flush_write_buffer(iommu); 3085 #ifdef CONFIG_INTEL_IOMMU_SVM 3086 register_pasid_allocator(iommu); 3087 #endif 3088 iommu_set_root_entry(iommu); 3089 } 3090 3091 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3092 dmar_map_gfx = 0; 3093 #endif 3094 3095 if (!dmar_map_gfx) 3096 iommu_identity_mapping |= IDENTMAP_GFX; 3097 3098 check_tylersburg_isoch(); 3099 3100 ret = si_domain_init(hw_pass_through); 3101 if (ret) 3102 goto free_iommu; 3103 3104 /* 3105 * for each drhd 3106 * enable fault log 3107 * global invalidate context cache 3108 * global invalidate iotlb 3109 * enable translation 3110 */ 3111 for_each_iommu(iommu, drhd) { 3112 if (drhd->ignored) { 3113 /* 3114 * we always have to disable PMRs or DMA may fail on 3115 * this device 3116 */ 3117 if (force_on) 3118 iommu_disable_protect_mem_regions(iommu); 3119 continue; 3120 } 3121 3122 iommu_flush_write_buffer(iommu); 3123 3124 #ifdef CONFIG_INTEL_IOMMU_SVM 3125 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3126 /* 3127 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3128 * could cause possible lock race condition. 3129 */ 3130 up_write(&dmar_global_lock); 3131 ret = intel_svm_enable_prq(iommu); 3132 down_write(&dmar_global_lock); 3133 if (ret) 3134 goto free_iommu; 3135 } 3136 #endif 3137 ret = dmar_set_interrupt(iommu); 3138 if (ret) 3139 goto free_iommu; 3140 } 3141 3142 return 0; 3143 3144 free_iommu: 3145 for_each_active_iommu(iommu, drhd) { 3146 disable_dmar_iommu(iommu); 3147 free_dmar_iommu(iommu); 3148 } 3149 3150 kfree(g_iommus); 3151 3152 error: 3153 return ret; 3154 } 3155 3156 static void __init init_no_remapping_devices(void) 3157 { 3158 struct dmar_drhd_unit *drhd; 3159 struct device *dev; 3160 int i; 3161 3162 for_each_drhd_unit(drhd) { 3163 if (!drhd->include_all) { 3164 for_each_active_dev_scope(drhd->devices, 3165 drhd->devices_cnt, i, dev) 3166 break; 3167 /* ignore DMAR unit if no devices exist */ 3168 if (i == drhd->devices_cnt) 3169 drhd->ignored = 1; 3170 } 3171 } 3172 3173 for_each_active_drhd_unit(drhd) { 3174 if (drhd->include_all) 3175 continue; 3176 3177 for_each_active_dev_scope(drhd->devices, 3178 drhd->devices_cnt, i, dev) 3179 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3180 break; 3181 if (i < drhd->devices_cnt) 3182 continue; 3183 3184 /* This IOMMU has *only* gfx devices. 
Either bypass it or 3185 set the gfx_mapped flag, as appropriate */ 3186 drhd->gfx_dedicated = 1; 3187 if (!dmar_map_gfx) 3188 drhd->ignored = 1; 3189 } 3190 } 3191 3192 #ifdef CONFIG_SUSPEND 3193 static int init_iommu_hw(void) 3194 { 3195 struct dmar_drhd_unit *drhd; 3196 struct intel_iommu *iommu = NULL; 3197 3198 for_each_active_iommu(iommu, drhd) 3199 if (iommu->qi) 3200 dmar_reenable_qi(iommu); 3201 3202 for_each_iommu(iommu, drhd) { 3203 if (drhd->ignored) { 3204 /* 3205 * we always have to disable PMRs or DMA may fail on 3206 * this device 3207 */ 3208 if (force_on) 3209 iommu_disable_protect_mem_regions(iommu); 3210 continue; 3211 } 3212 3213 iommu_flush_write_buffer(iommu); 3214 iommu_set_root_entry(iommu); 3215 iommu_enable_translation(iommu); 3216 iommu_disable_protect_mem_regions(iommu); 3217 } 3218 3219 return 0; 3220 } 3221 3222 static void iommu_flush_all(void) 3223 { 3224 struct dmar_drhd_unit *drhd; 3225 struct intel_iommu *iommu; 3226 3227 for_each_active_iommu(iommu, drhd) { 3228 iommu->flush.flush_context(iommu, 0, 0, 0, 3229 DMA_CCMD_GLOBAL_INVL); 3230 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3231 DMA_TLB_GLOBAL_FLUSH); 3232 } 3233 } 3234 3235 static int iommu_suspend(void) 3236 { 3237 struct dmar_drhd_unit *drhd; 3238 struct intel_iommu *iommu = NULL; 3239 unsigned long flag; 3240 3241 for_each_active_iommu(iommu, drhd) { 3242 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3243 GFP_KERNEL); 3244 if (!iommu->iommu_state) 3245 goto nomem; 3246 } 3247 3248 iommu_flush_all(); 3249 3250 for_each_active_iommu(iommu, drhd) { 3251 iommu_disable_translation(iommu); 3252 3253 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3254 3255 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3256 readl(iommu->reg + DMAR_FECTL_REG); 3257 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3258 readl(iommu->reg + DMAR_FEDATA_REG); 3259 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3260 readl(iommu->reg + DMAR_FEADDR_REG); 3261 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3262 readl(iommu->reg + DMAR_FEUADDR_REG); 3263 3264 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3265 } 3266 return 0; 3267 3268 nomem: 3269 for_each_active_iommu(iommu, drhd) 3270 kfree(iommu->iommu_state); 3271 3272 return -ENOMEM; 3273 } 3274 3275 static void iommu_resume(void) 3276 { 3277 struct dmar_drhd_unit *drhd; 3278 struct intel_iommu *iommu = NULL; 3279 unsigned long flag; 3280 3281 if (init_iommu_hw()) { 3282 if (force_on) 3283 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3284 else 3285 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3286 return; 3287 } 3288 3289 for_each_active_iommu(iommu, drhd) { 3290 3291 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3292 3293 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3294 iommu->reg + DMAR_FECTL_REG); 3295 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3296 iommu->reg + DMAR_FEDATA_REG); 3297 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3298 iommu->reg + DMAR_FEADDR_REG); 3299 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3300 iommu->reg + DMAR_FEUADDR_REG); 3301 3302 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3303 } 3304 3305 for_each_active_iommu(iommu, drhd) 3306 kfree(iommu->iommu_state); 3307 } 3308 3309 static struct syscore_ops iommu_syscore_ops = { 3310 .resume = iommu_resume, 3311 .suspend = iommu_suspend, 3312 }; 3313 3314 static void __init init_iommu_pm_ops(void) 3315 { 3316 register_syscore_ops(&iommu_syscore_ops); 3317 } 3318 3319 #else 3320 static inline void init_iommu_pm_ops(void) {} 3321 #endif /* 
CONFIG_PM */ 3322 3323 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3324 { 3325 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3326 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3327 rmrr->end_address <= rmrr->base_address || 3328 arch_rmrr_sanity_check(rmrr)) 3329 return -EINVAL; 3330 3331 return 0; 3332 } 3333 3334 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3335 { 3336 struct acpi_dmar_reserved_memory *rmrr; 3337 struct dmar_rmrr_unit *rmrru; 3338 3339 rmrr = (struct acpi_dmar_reserved_memory *)header; 3340 if (rmrr_sanity_check(rmrr)) { 3341 pr_warn(FW_BUG 3342 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3343 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3344 rmrr->base_address, rmrr->end_address, 3345 dmi_get_system_info(DMI_BIOS_VENDOR), 3346 dmi_get_system_info(DMI_BIOS_VERSION), 3347 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3348 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3349 } 3350 3351 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3352 if (!rmrru) 3353 goto out; 3354 3355 rmrru->hdr = header; 3356 3357 rmrru->base_address = rmrr->base_address; 3358 rmrru->end_address = rmrr->end_address; 3359 3360 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3361 ((void *)rmrr) + rmrr->header.length, 3362 &rmrru->devices_cnt); 3363 if (rmrru->devices_cnt && rmrru->devices == NULL) 3364 goto free_rmrru; 3365 3366 list_add(&rmrru->list, &dmar_rmrr_units); 3367 3368 return 0; 3369 free_rmrru: 3370 kfree(rmrru); 3371 out: 3372 return -ENOMEM; 3373 } 3374 3375 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3376 { 3377 struct dmar_atsr_unit *atsru; 3378 struct acpi_dmar_atsr *tmp; 3379 3380 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3381 dmar_rcu_check()) { 3382 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3383 if (atsr->segment != tmp->segment) 3384 continue; 3385 if (atsr->header.length != tmp->header.length) 3386 continue; 3387 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3388 return atsru; 3389 } 3390 3391 return NULL; 3392 } 3393 3394 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3395 { 3396 struct acpi_dmar_atsr *atsr; 3397 struct dmar_atsr_unit *atsru; 3398 3399 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3400 return 0; 3401 3402 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3403 atsru = dmar_find_atsr(atsr); 3404 if (atsru) 3405 return 0; 3406 3407 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3408 if (!atsru) 3409 return -ENOMEM; 3410 3411 /* 3412 * If memory is allocated from slab by ACPI _DSM method, we need to 3413 * copy the memory content because the memory buffer will be freed 3414 * on return. 
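 *
 * The copy lives in the same allocation as the tracking structure:
 * the kzalloc() above reserved sizeof(*atsru) + hdr->length bytes, so
 * atsru->hdr can simply point just past the dmar_atsr_unit itself
 * (atsru + 1), as done below.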
3415 */ 3416 atsru->hdr = (void *)(atsru + 1); 3417 memcpy(atsru->hdr, hdr, hdr->length); 3418 atsru->include_all = atsr->flags & 0x1; 3419 if (!atsru->include_all) { 3420 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3421 (void *)atsr + atsr->header.length, 3422 &atsru->devices_cnt); 3423 if (atsru->devices_cnt && atsru->devices == NULL) { 3424 kfree(atsru); 3425 return -ENOMEM; 3426 } 3427 } 3428 3429 list_add_rcu(&atsru->list, &dmar_atsr_units); 3430 3431 return 0; 3432 } 3433 3434 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3435 { 3436 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3437 kfree(atsru); 3438 } 3439 3440 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3441 { 3442 struct acpi_dmar_atsr *atsr; 3443 struct dmar_atsr_unit *atsru; 3444 3445 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3446 atsru = dmar_find_atsr(atsr); 3447 if (atsru) { 3448 list_del_rcu(&atsru->list); 3449 synchronize_rcu(); 3450 intel_iommu_free_atsr(atsru); 3451 } 3452 3453 return 0; 3454 } 3455 3456 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3457 { 3458 int i; 3459 struct device *dev; 3460 struct acpi_dmar_atsr *atsr; 3461 struct dmar_atsr_unit *atsru; 3462 3463 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3464 atsru = dmar_find_atsr(atsr); 3465 if (!atsru) 3466 return 0; 3467 3468 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3469 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3470 i, dev) 3471 return -EBUSY; 3472 } 3473 3474 return 0; 3475 } 3476 3477 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3478 { 3479 struct dmar_satc_unit *satcu; 3480 struct acpi_dmar_satc *tmp; 3481 3482 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3483 dmar_rcu_check()) { 3484 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3485 if (satc->segment != tmp->segment) 3486 continue; 3487 if (satc->header.length != tmp->header.length) 3488 continue; 3489 if (memcmp(satc, tmp, satc->header.length) == 0) 3490 return satcu; 3491 } 3492 3493 return NULL; 3494 } 3495 3496 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3497 { 3498 struct acpi_dmar_satc *satc; 3499 struct dmar_satc_unit *satcu; 3500 3501 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3502 return 0; 3503 3504 satc = container_of(hdr, struct acpi_dmar_satc, header); 3505 satcu = dmar_find_satc(satc); 3506 if (satcu) 3507 return 0; 3508 3509 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3510 if (!satcu) 3511 return -ENOMEM; 3512 3513 satcu->hdr = (void *)(satcu + 1); 3514 memcpy(satcu->hdr, hdr, hdr->length); 3515 satcu->atc_required = satc->flags & 0x1; 3516 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3517 (void *)satc + satc->header.length, 3518 &satcu->devices_cnt); 3519 if (satcu->devices_cnt && !satcu->devices) { 3520 kfree(satcu); 3521 return -ENOMEM; 3522 } 3523 list_add_rcu(&satcu->list, &dmar_satc_units); 3524 3525 return 0; 3526 } 3527 3528 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3529 { 3530 int sp, ret; 3531 struct intel_iommu *iommu = dmaru->iommu; 3532 3533 if (g_iommus[iommu->seq_id]) 3534 return 0; 3535 3536 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3537 if (ret) 3538 goto out; 3539 3540 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3541 pr_warn("%s: Doesn't support hardware pass through.\n", 3542 iommu->name); 3543 return -ENXIO; 3544 } 3545 3546 sp = domain_update_iommu_superpage(NULL, 
iommu) - 1; 3547 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3548 pr_warn("%s: Doesn't support large page.\n", 3549 iommu->name); 3550 return -ENXIO; 3551 } 3552 3553 /* 3554 * Disable translation if already enabled prior to OS handover. 3555 */ 3556 if (iommu->gcmd & DMA_GCMD_TE) 3557 iommu_disable_translation(iommu); 3558 3559 g_iommus[iommu->seq_id] = iommu; 3560 ret = iommu_init_domains(iommu); 3561 if (ret == 0) 3562 ret = iommu_alloc_root_entry(iommu); 3563 if (ret) 3564 goto out; 3565 3566 intel_svm_check(iommu); 3567 3568 if (dmaru->ignored) { 3569 /* 3570 * we always have to disable PMRs or DMA may fail on this device 3571 */ 3572 if (force_on) 3573 iommu_disable_protect_mem_regions(iommu); 3574 return 0; 3575 } 3576 3577 intel_iommu_init_qi(iommu); 3578 iommu_flush_write_buffer(iommu); 3579 3580 #ifdef CONFIG_INTEL_IOMMU_SVM 3581 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3582 ret = intel_svm_enable_prq(iommu); 3583 if (ret) 3584 goto disable_iommu; 3585 } 3586 #endif 3587 ret = dmar_set_interrupt(iommu); 3588 if (ret) 3589 goto disable_iommu; 3590 3591 iommu_set_root_entry(iommu); 3592 iommu_enable_translation(iommu); 3593 3594 iommu_disable_protect_mem_regions(iommu); 3595 return 0; 3596 3597 disable_iommu: 3598 disable_dmar_iommu(iommu); 3599 out: 3600 free_dmar_iommu(iommu); 3601 return ret; 3602 } 3603 3604 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3605 { 3606 int ret = 0; 3607 struct intel_iommu *iommu = dmaru->iommu; 3608 3609 if (!intel_iommu_enabled) 3610 return 0; 3611 if (iommu == NULL) 3612 return -EINVAL; 3613 3614 if (insert) { 3615 ret = intel_iommu_add(dmaru); 3616 } else { 3617 disable_dmar_iommu(iommu); 3618 free_dmar_iommu(iommu); 3619 } 3620 3621 return ret; 3622 } 3623 3624 static void intel_iommu_free_dmars(void) 3625 { 3626 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3627 struct dmar_atsr_unit *atsru, *atsr_n; 3628 struct dmar_satc_unit *satcu, *satc_n; 3629 3630 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3631 list_del(&rmrru->list); 3632 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3633 kfree(rmrru); 3634 } 3635 3636 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3637 list_del(&atsru->list); 3638 intel_iommu_free_atsr(atsru); 3639 } 3640 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3641 list_del(&satcu->list); 3642 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3643 kfree(satcu); 3644 } 3645 } 3646 3647 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3648 { 3649 struct dmar_satc_unit *satcu; 3650 struct acpi_dmar_satc *satc; 3651 struct device *tmp; 3652 int i; 3653 3654 dev = pci_physfn(dev); 3655 rcu_read_lock(); 3656 3657 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3658 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3659 if (satc->segment != pci_domain_nr(dev->bus)) 3660 continue; 3661 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3662 if (to_pci_dev(tmp) == dev) 3663 goto out; 3664 } 3665 satcu = NULL; 3666 out: 3667 rcu_read_unlock(); 3668 return satcu; 3669 } 3670 3671 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3672 { 3673 int i, ret = 1; 3674 struct pci_bus *bus; 3675 struct pci_dev *bridge = NULL; 3676 struct device *tmp; 3677 struct acpi_dmar_atsr *atsr; 3678 struct dmar_atsr_unit *atsru; 3679 struct dmar_satc_unit *satcu; 3680 3681 dev = pci_physfn(dev); 3682 satcu = 
dmar_find_matched_satc_unit(dev); 3683 if (satcu) 3684 /* 3685 * This device supports ATS as it is in SATC table. 3686 * When IOMMU is in legacy mode, enabling ATS is done 3687 * automatically by HW for the device that requires 3688 * ATS, hence OS should not enable this device ATS 3689 * to avoid duplicated TLB invalidation. 3690 */ 3691 return !(satcu->atc_required && !sm_supported(iommu)); 3692 3693 for (bus = dev->bus; bus; bus = bus->parent) { 3694 bridge = bus->self; 3695 /* If it's an integrated device, allow ATS */ 3696 if (!bridge) 3697 return 1; 3698 /* Connected via non-PCIe: no ATS */ 3699 if (!pci_is_pcie(bridge) || 3700 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3701 return 0; 3702 /* If we found the root port, look it up in the ATSR */ 3703 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3704 break; 3705 } 3706 3707 rcu_read_lock(); 3708 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3709 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3710 if (atsr->segment != pci_domain_nr(dev->bus)) 3711 continue; 3712 3713 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3714 if (tmp == &bridge->dev) 3715 goto out; 3716 3717 if (atsru->include_all) 3718 goto out; 3719 } 3720 ret = 0; 3721 out: 3722 rcu_read_unlock(); 3723 3724 return ret; 3725 } 3726 3727 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3728 { 3729 int ret; 3730 struct dmar_rmrr_unit *rmrru; 3731 struct dmar_atsr_unit *atsru; 3732 struct dmar_satc_unit *satcu; 3733 struct acpi_dmar_atsr *atsr; 3734 struct acpi_dmar_reserved_memory *rmrr; 3735 struct acpi_dmar_satc *satc; 3736 3737 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3738 return 0; 3739 3740 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3741 rmrr = container_of(rmrru->hdr, 3742 struct acpi_dmar_reserved_memory, header); 3743 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3744 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3745 ((void *)rmrr) + rmrr->header.length, 3746 rmrr->segment, rmrru->devices, 3747 rmrru->devices_cnt); 3748 if (ret < 0) 3749 return ret; 3750 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3751 dmar_remove_dev_scope(info, rmrr->segment, 3752 rmrru->devices, rmrru->devices_cnt); 3753 } 3754 } 3755 3756 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3757 if (atsru->include_all) 3758 continue; 3759 3760 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3761 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3762 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3763 (void *)atsr + atsr->header.length, 3764 atsr->segment, atsru->devices, 3765 atsru->devices_cnt); 3766 if (ret > 0) 3767 break; 3768 else if (ret < 0) 3769 return ret; 3770 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3771 if (dmar_remove_dev_scope(info, atsr->segment, 3772 atsru->devices, atsru->devices_cnt)) 3773 break; 3774 } 3775 } 3776 list_for_each_entry(satcu, &dmar_satc_units, list) { 3777 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3778 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3779 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3780 (void *)satc + satc->header.length, 3781 satc->segment, satcu->devices, 3782 satcu->devices_cnt); 3783 if (ret > 0) 3784 break; 3785 else if (ret < 0) 3786 return ret; 3787 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3788 if (dmar_remove_dev_scope(info, satc->segment, 3789 satcu->devices, satcu->devices_cnt)) 3790 break; 3791 } 3792 } 3793 3794 return 0; 3795 } 3796 3797 static 
int intel_iommu_memory_notifier(struct notifier_block *nb, 3798 unsigned long val, void *v) 3799 { 3800 struct memory_notify *mhp = v; 3801 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3802 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3803 mhp->nr_pages - 1); 3804 3805 switch (val) { 3806 case MEM_GOING_ONLINE: 3807 if (iommu_domain_identity_map(si_domain, 3808 start_vpfn, last_vpfn)) { 3809 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3810 start_vpfn, last_vpfn); 3811 return NOTIFY_BAD; 3812 } 3813 break; 3814 3815 case MEM_OFFLINE: 3816 case MEM_CANCEL_ONLINE: 3817 { 3818 struct dmar_drhd_unit *drhd; 3819 struct intel_iommu *iommu; 3820 LIST_HEAD(freelist); 3821 3822 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3823 3824 rcu_read_lock(); 3825 for_each_active_iommu(iommu, drhd) 3826 iommu_flush_iotlb_psi(iommu, si_domain, 3827 start_vpfn, mhp->nr_pages, 3828 list_empty(&freelist), 0); 3829 rcu_read_unlock(); 3830 put_pages_list(&freelist); 3831 } 3832 break; 3833 } 3834 3835 return NOTIFY_OK; 3836 } 3837 3838 static struct notifier_block intel_iommu_memory_nb = { 3839 .notifier_call = intel_iommu_memory_notifier, 3840 .priority = 0 3841 }; 3842 3843 static void intel_disable_iommus(void) 3844 { 3845 struct intel_iommu *iommu = NULL; 3846 struct dmar_drhd_unit *drhd; 3847 3848 for_each_iommu(iommu, drhd) 3849 iommu_disable_translation(iommu); 3850 } 3851 3852 void intel_iommu_shutdown(void) 3853 { 3854 struct dmar_drhd_unit *drhd; 3855 struct intel_iommu *iommu = NULL; 3856 3857 if (no_iommu || dmar_disabled) 3858 return; 3859 3860 down_write(&dmar_global_lock); 3861 3862 /* Disable PMRs explicitly here. */ 3863 for_each_iommu(iommu, drhd) 3864 iommu_disable_protect_mem_regions(iommu); 3865 3866 /* Make sure the IOMMUs are switched off */ 3867 intel_disable_iommus(); 3868 3869 up_write(&dmar_global_lock); 3870 } 3871 3872 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3873 { 3874 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3875 3876 return container_of(iommu_dev, struct intel_iommu, iommu); 3877 } 3878 3879 static ssize_t version_show(struct device *dev, 3880 struct device_attribute *attr, char *buf) 3881 { 3882 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3883 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3884 return sprintf(buf, "%d:%d\n", 3885 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3886 } 3887 static DEVICE_ATTR_RO(version); 3888 3889 static ssize_t address_show(struct device *dev, 3890 struct device_attribute *attr, char *buf) 3891 { 3892 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3893 return sprintf(buf, "%llx\n", iommu->reg_phys); 3894 } 3895 static DEVICE_ATTR_RO(address); 3896 3897 static ssize_t cap_show(struct device *dev, 3898 struct device_attribute *attr, char *buf) 3899 { 3900 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3901 return sprintf(buf, "%llx\n", iommu->cap); 3902 } 3903 static DEVICE_ATTR_RO(cap); 3904 3905 static ssize_t ecap_show(struct device *dev, 3906 struct device_attribute *attr, char *buf) 3907 { 3908 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3909 return sprintf(buf, "%llx\n", iommu->ecap); 3910 } 3911 static DEVICE_ATTR_RO(ecap); 3912 3913 static ssize_t domains_supported_show(struct device *dev, 3914 struct device_attribute *attr, char *buf) 3915 { 3916 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3917 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 3918 } 3919 static DEVICE_ATTR_RO(domains_supported); 3920 3921 
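/*
 * Hedged usage note: together with the "intel-iommu" attribute group
 * defined further down and the iommu_device_sysfs_add() call in
 * intel_iommu_init(), these read-only attributes are expected to show
 * up under the registered unit's sysfs directory, e.g. roughly
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}; the exact path depends on the name
 * the IOMMU is registered with.
 */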
static ssize_t domains_used_show(struct device *dev, 3922 struct device_attribute *attr, char *buf) 3923 { 3924 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3925 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 3926 cap_ndoms(iommu->cap))); 3927 } 3928 static DEVICE_ATTR_RO(domains_used); 3929 3930 static struct attribute *intel_iommu_attrs[] = { 3931 &dev_attr_version.attr, 3932 &dev_attr_address.attr, 3933 &dev_attr_cap.attr, 3934 &dev_attr_ecap.attr, 3935 &dev_attr_domains_supported.attr, 3936 &dev_attr_domains_used.attr, 3937 NULL, 3938 }; 3939 3940 static struct attribute_group intel_iommu_group = { 3941 .name = "intel-iommu", 3942 .attrs = intel_iommu_attrs, 3943 }; 3944 3945 const struct attribute_group *intel_iommu_groups[] = { 3946 &intel_iommu_group, 3947 NULL, 3948 }; 3949 3950 static inline bool has_external_pci(void) 3951 { 3952 struct pci_dev *pdev = NULL; 3953 3954 for_each_pci_dev(pdev) 3955 if (pdev->external_facing) 3956 return true; 3957 3958 return false; 3959 } 3960 3961 static int __init platform_optin_force_iommu(void) 3962 { 3963 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 3964 return 0; 3965 3966 if (no_iommu || dmar_disabled) 3967 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 3968 3969 /* 3970 * If Intel-IOMMU is disabled by default, we will apply identity 3971 * map for all devices except those marked as being untrusted. 3972 */ 3973 if (dmar_disabled) 3974 iommu_set_default_passthrough(false); 3975 3976 dmar_disabled = 0; 3977 no_iommu = 0; 3978 3979 return 1; 3980 } 3981 3982 static int __init probe_acpi_namespace_devices(void) 3983 { 3984 struct dmar_drhd_unit *drhd; 3985 /* To avoid a -Wunused-but-set-variable warning. */ 3986 struct intel_iommu *iommu __maybe_unused; 3987 struct device *dev; 3988 int i, ret = 0; 3989 3990 for_each_active_iommu(iommu, drhd) { 3991 for_each_active_dev_scope(drhd->devices, 3992 drhd->devices_cnt, i, dev) { 3993 struct acpi_device_physical_node *pn; 3994 struct iommu_group *group; 3995 struct acpi_device *adev; 3996 3997 if (dev->bus != &acpi_bus_type) 3998 continue; 3999 4000 adev = to_acpi_device(dev); 4001 mutex_lock(&adev->physical_node_lock); 4002 list_for_each_entry(pn, 4003 &adev->physical_node_list, node) { 4004 group = iommu_group_get(pn->dev); 4005 if (group) { 4006 iommu_group_put(group); 4007 continue; 4008 } 4009 4010 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4011 ret = iommu_probe_device(pn->dev); 4012 if (ret) 4013 break; 4014 } 4015 mutex_unlock(&adev->physical_node_lock); 4016 4017 if (ret) 4018 return ret; 4019 } 4020 } 4021 4022 return 0; 4023 } 4024 4025 int __init intel_iommu_init(void) 4026 { 4027 int ret = -ENODEV; 4028 struct dmar_drhd_unit *drhd; 4029 struct intel_iommu *iommu; 4030 4031 /* 4032 * Intel IOMMU is required for a TXT/tboot launch or platform 4033 * opt in, so enforce that. 4034 */ 4035 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4036 platform_optin_force_iommu(); 4037 4038 down_write(&dmar_global_lock); 4039 if (dmar_table_init()) { 4040 if (force_on) 4041 panic("tboot: Failed to initialize DMAR table\n"); 4042 goto out_free_dmar; 4043 } 4044 4045 if (dmar_dev_scope_init() < 0) { 4046 if (force_on) 4047 panic("tboot: Failed to initialize DMAR device scope\n"); 4048 goto out_free_dmar; 4049 } 4050 4051 up_write(&dmar_global_lock); 4052 4053 /* 4054 * The bus notifier takes the dmar_global_lock, so lockdep will 4055 * complain later when we register it under the lock. 
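 *
 * That is why dmar_global_lock is released just above and taken again
 * immediately after dmar_register_bus_notifier() below.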
4056 */ 4057 dmar_register_bus_notifier(); 4058 4059 down_write(&dmar_global_lock); 4060 4061 if (!no_iommu) 4062 intel_iommu_debugfs_init(); 4063 4064 if (no_iommu || dmar_disabled) { 4065 /* 4066 * We exit the function here to ensure IOMMU's remapping and 4067 * mempool aren't setup, which means that the IOMMU's PMRs 4068 * won't be disabled via the call to init_dmars(). So disable 4069 * it explicitly here. The PMRs were setup by tboot prior to 4070 * calling SENTER, but the kernel is expected to reset/tear 4071 * down the PMRs. 4072 */ 4073 if (intel_iommu_tboot_noforce) { 4074 for_each_iommu(iommu, drhd) 4075 iommu_disable_protect_mem_regions(iommu); 4076 } 4077 4078 /* 4079 * Make sure the IOMMUs are switched off, even when we 4080 * boot into a kexec kernel and the previous kernel left 4081 * them enabled 4082 */ 4083 intel_disable_iommus(); 4084 goto out_free_dmar; 4085 } 4086 4087 if (list_empty(&dmar_rmrr_units)) 4088 pr_info("No RMRR found\n"); 4089 4090 if (list_empty(&dmar_atsr_units)) 4091 pr_info("No ATSR found\n"); 4092 4093 if (list_empty(&dmar_satc_units)) 4094 pr_info("No SATC found\n"); 4095 4096 if (dmar_map_gfx) 4097 intel_iommu_gfx_mapped = 1; 4098 4099 init_no_remapping_devices(); 4100 4101 ret = init_dmars(); 4102 if (ret) { 4103 if (force_on) 4104 panic("tboot: Failed to initialize DMARs\n"); 4105 pr_err("Initialization failed\n"); 4106 goto out_free_dmar; 4107 } 4108 up_write(&dmar_global_lock); 4109 4110 init_iommu_pm_ops(); 4111 4112 down_read(&dmar_global_lock); 4113 for_each_active_iommu(iommu, drhd) { 4114 /* 4115 * The flush queue implementation does not perform 4116 * page-selective invalidations that are required for efficient 4117 * TLB flushes in virtual environments. The benefit of batching 4118 * is likely to be much lower than the overhead of synchronizing 4119 * the virtual and physical IOMMU page-tables. 4120 */ 4121 if (cap_caching_mode(iommu->cap)) { 4122 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4123 iommu_set_dma_strict(); 4124 } 4125 iommu_device_sysfs_add(&iommu->iommu, NULL, 4126 intel_iommu_groups, 4127 "%s", iommu->name); 4128 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4129 } 4130 up_read(&dmar_global_lock); 4131 4132 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4133 if (si_domain && !hw_pass_through) 4134 register_memory_notifier(&intel_iommu_memory_nb); 4135 4136 down_read(&dmar_global_lock); 4137 if (probe_acpi_namespace_devices()) 4138 pr_warn("ACPI name space devices didn't probe correctly\n"); 4139 4140 /* Finally, we enable the DMA remapping hardware. */ 4141 for_each_iommu(iommu, drhd) { 4142 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4143 iommu_enable_translation(iommu); 4144 4145 iommu_disable_protect_mem_regions(iommu); 4146 } 4147 up_read(&dmar_global_lock); 4148 4149 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4150 4151 intel_iommu_enabled = 1; 4152 4153 return 0; 4154 4155 out_free_dmar: 4156 intel_iommu_free_dmars(); 4157 up_write(&dmar_global_lock); 4158 return ret; 4159 } 4160 4161 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4162 { 4163 struct device_domain_info *info = opaque; 4164 4165 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4166 return 0; 4167 } 4168 4169 /* 4170 * NB - intel-iommu lacks any sort of reference counting for the users of 4171 * dependent devices. 
If multiple endpoints have intersecting dependent 4172 * devices, unbinding the driver from any one of them will possibly leave 4173 * the others unable to operate. 4174 */ 4175 static void domain_context_clear(struct device_domain_info *info) 4176 { 4177 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4178 return; 4179 4180 pci_for_each_dma_alias(to_pci_dev(info->dev), 4181 &domain_context_clear_one_cb, info); 4182 } 4183 4184 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4185 { 4186 struct dmar_domain *domain; 4187 struct intel_iommu *iommu; 4188 unsigned long flags; 4189 4190 assert_spin_locked(&device_domain_lock); 4191 4192 if (WARN_ON(!info)) 4193 return; 4194 4195 iommu = info->iommu; 4196 domain = info->domain; 4197 4198 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4199 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4200 intel_pasid_tear_down_entry(iommu, info->dev, 4201 PASID_RID2PASID, false); 4202 4203 iommu_disable_dev_iotlb(info); 4204 domain_context_clear(info); 4205 intel_pasid_free_table(info->dev); 4206 } 4207 4208 list_del(&info->link); 4209 4210 spin_lock_irqsave(&iommu->lock, flags); 4211 domain_detach_iommu(domain, iommu); 4212 spin_unlock_irqrestore(&iommu->lock, flags); 4213 } 4214 4215 static void dmar_remove_one_dev_info(struct device *dev) 4216 { 4217 struct device_domain_info *info; 4218 unsigned long flags; 4219 4220 spin_lock_irqsave(&device_domain_lock, flags); 4221 info = dev_iommu_priv_get(dev); 4222 if (info) 4223 __dmar_remove_one_dev_info(info); 4224 spin_unlock_irqrestore(&device_domain_lock, flags); 4225 } 4226 4227 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4228 { 4229 int adjust_width; 4230 4231 /* calculate AGAW */ 4232 domain->gaw = guest_width; 4233 adjust_width = guestwidth_to_adjustwidth(guest_width); 4234 domain->agaw = width_to_agaw(adjust_width); 4235 4236 domain->iommu_coherency = false; 4237 domain->iommu_superpage = 0; 4238 domain->max_addr = 0; 4239 4240 /* always allocate the top pgd */ 4241 domain->pgd = alloc_pgtable_page(domain->nid); 4242 if (!domain->pgd) 4243 return -ENOMEM; 4244 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4245 return 0; 4246 } 4247 4248 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4249 { 4250 struct dmar_domain *dmar_domain; 4251 struct iommu_domain *domain; 4252 4253 switch (type) { 4254 case IOMMU_DOMAIN_DMA: 4255 case IOMMU_DOMAIN_DMA_FQ: 4256 case IOMMU_DOMAIN_UNMANAGED: 4257 dmar_domain = alloc_domain(type); 4258 if (!dmar_domain) { 4259 pr_err("Can't allocate dmar_domain\n"); 4260 return NULL; 4261 } 4262 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4263 pr_err("Domain initialization failed\n"); 4264 domain_exit(dmar_domain); 4265 return NULL; 4266 } 4267 4268 domain = &dmar_domain->domain; 4269 domain->geometry.aperture_start = 0; 4270 domain->geometry.aperture_end = 4271 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4272 domain->geometry.force_aperture = true; 4273 4274 return domain; 4275 case IOMMU_DOMAIN_IDENTITY: 4276 return &si_domain->domain; 4277 default: 4278 return NULL; 4279 } 4280 4281 return NULL; 4282 } 4283 4284 static void intel_iommu_domain_free(struct iommu_domain *domain) 4285 { 4286 if (domain != &si_domain->domain) 4287 domain_exit(to_dmar_domain(domain)); 4288 } 4289 4290 static int prepare_domain_attach_device(struct iommu_domain *domain, 4291 struct device *dev) 4292 { 4293 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4294 struct intel_iommu *iommu; 
4295 int addr_width; 4296 4297 iommu = device_to_iommu(dev, NULL, NULL); 4298 if (!iommu) 4299 return -ENODEV; 4300 4301 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) 4302 return -EOPNOTSUPP; 4303 4304 /* check if this iommu agaw is sufficient for max mapped address */ 4305 addr_width = agaw_to_width(iommu->agaw); 4306 if (addr_width > cap_mgaw(iommu->cap)) 4307 addr_width = cap_mgaw(iommu->cap); 4308 4309 if (dmar_domain->max_addr > (1LL << addr_width)) { 4310 dev_err(dev, "%s: iommu width (%d) is not " 4311 "sufficient for the mapped address (%llx)\n", 4312 __func__, addr_width, dmar_domain->max_addr); 4313 return -EFAULT; 4314 } 4315 dmar_domain->gaw = addr_width; 4316 4317 /* 4318 * Knock out extra levels of page tables if necessary 4319 */ 4320 while (iommu->agaw < dmar_domain->agaw) { 4321 struct dma_pte *pte; 4322 4323 pte = dmar_domain->pgd; 4324 if (dma_pte_present(pte)) { 4325 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4326 free_pgtable_page(pte); 4327 } 4328 dmar_domain->agaw--; 4329 } 4330 4331 return 0; 4332 } 4333 4334 static int intel_iommu_attach_device(struct iommu_domain *domain, 4335 struct device *dev) 4336 { 4337 int ret; 4338 4339 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4340 device_is_rmrr_locked(dev)) { 4341 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4342 return -EPERM; 4343 } 4344 4345 /* normally dev is not mapped */ 4346 if (unlikely(domain_context_mapped(dev))) { 4347 struct device_domain_info *info = dev_iommu_priv_get(dev); 4348 4349 if (info->domain) 4350 dmar_remove_one_dev_info(dev); 4351 } 4352 4353 ret = prepare_domain_attach_device(domain, dev); 4354 if (ret) 4355 return ret; 4356 4357 return domain_add_dev_info(to_dmar_domain(domain), dev); 4358 } 4359 4360 static void intel_iommu_detach_device(struct iommu_domain *domain, 4361 struct device *dev) 4362 { 4363 dmar_remove_one_dev_info(dev); 4364 } 4365 4366 static int intel_iommu_map(struct iommu_domain *domain, 4367 unsigned long iova, phys_addr_t hpa, 4368 size_t size, int iommu_prot, gfp_t gfp) 4369 { 4370 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4371 u64 max_addr; 4372 int prot = 0; 4373 4374 if (iommu_prot & IOMMU_READ) 4375 prot |= DMA_PTE_READ; 4376 if (iommu_prot & IOMMU_WRITE) 4377 prot |= DMA_PTE_WRITE; 4378 if (dmar_domain->set_pte_snp) 4379 prot |= DMA_PTE_SNP; 4380 4381 max_addr = iova + size; 4382 if (dmar_domain->max_addr < max_addr) { 4383 u64 end; 4384 4385 /* check if minimum agaw is sufficient for mapped address */ 4386 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4387 if (end < max_addr) { 4388 pr_err("%s: iommu width (%d) is not " 4389 "sufficient for the mapped address (%llx)\n", 4390 __func__, dmar_domain->gaw, max_addr); 4391 return -EFAULT; 4392 } 4393 dmar_domain->max_addr = max_addr; 4394 } 4395 /* Round up size to next multiple of PAGE_SIZE, if it and 4396 the low bits of hpa would take us onto the next page */ 4397 size = aligned_nrpages(hpa, size); 4398 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4399 hpa >> VTD_PAGE_SHIFT, size, prot); 4400 } 4401 4402 static int intel_iommu_map_pages(struct iommu_domain *domain, 4403 unsigned long iova, phys_addr_t paddr, 4404 size_t pgsize, size_t pgcount, 4405 int prot, gfp_t gfp, size_t *mapped) 4406 { 4407 unsigned long pgshift = __ffs(pgsize); 4408 size_t size = pgcount << pgshift; 4409 int ret; 4410 4411 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 4412 return 
-EINVAL; 4413 4414 if (!IS_ALIGNED(iova | paddr, pgsize)) 4415 return -EINVAL; 4416 4417 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 4418 if (!ret && mapped) 4419 *mapped = size; 4420 4421 return ret; 4422 } 4423 4424 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4425 unsigned long iova, size_t size, 4426 struct iommu_iotlb_gather *gather) 4427 { 4428 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4429 unsigned long start_pfn, last_pfn; 4430 int level = 0; 4431 4432 /* Cope with horrid API which requires us to unmap more than the 4433 size argument if it happens to be a large-page mapping. */ 4434 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4435 4436 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4437 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4438 4439 start_pfn = iova >> VTD_PAGE_SHIFT; 4440 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4441 4442 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 4443 4444 if (dmar_domain->max_addr == iova + size) 4445 dmar_domain->max_addr = iova; 4446 4447 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4448 4449 return size; 4450 } 4451 4452 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 4453 unsigned long iova, 4454 size_t pgsize, size_t pgcount, 4455 struct iommu_iotlb_gather *gather) 4456 { 4457 unsigned long pgshift = __ffs(pgsize); 4458 size_t size = pgcount << pgshift; 4459 4460 return intel_iommu_unmap(domain, iova, size, gather); 4461 } 4462 4463 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4464 struct iommu_iotlb_gather *gather) 4465 { 4466 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4467 unsigned long iova_pfn = IOVA_PFN(gather->start); 4468 size_t size = gather->end - gather->start; 4469 unsigned long start_pfn; 4470 unsigned long nrpages; 4471 int iommu_id; 4472 4473 nrpages = aligned_nrpages(gather->start, size); 4474 start_pfn = mm_to_dma_pfn(iova_pfn); 4475 4476 for_each_domain_iommu(iommu_id, dmar_domain) 4477 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 4478 start_pfn, nrpages, 4479 list_empty(&gather->freelist), 0); 4480 4481 put_pages_list(&gather->freelist); 4482 } 4483 4484 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4485 dma_addr_t iova) 4486 { 4487 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4488 struct dma_pte *pte; 4489 int level = 0; 4490 u64 phys = 0; 4491 4492 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 4493 if (pte && dma_pte_present(pte)) 4494 phys = dma_pte_addr(pte) + 4495 (iova & (BIT_MASK(level_to_offset_bits(level) + 4496 VTD_PAGE_SHIFT) - 1)); 4497 4498 return phys; 4499 } 4500 4501 static bool domain_support_force_snooping(struct dmar_domain *domain) 4502 { 4503 struct device_domain_info *info; 4504 bool support = true; 4505 4506 assert_spin_locked(&device_domain_lock); 4507 list_for_each_entry(info, &domain->devices, link) { 4508 if (!ecap_sc_support(info->iommu->ecap)) { 4509 support = false; 4510 break; 4511 } 4512 } 4513 4514 return support; 4515 } 4516 4517 static void domain_set_force_snooping(struct dmar_domain *domain) 4518 { 4519 struct device_domain_info *info; 4520 4521 assert_spin_locked(&device_domain_lock); 4522 4523 /* 4524 * Second level page table supports per-PTE snoop control. The 4525 * iommu_map() interface will handle this by setting SNP bit. 
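	 * First-level page tables have no SNP bit, so for first-level domains
	 * snoop control is instead programmed into the PASID entry of each
	 * attached device (the loop below).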
4526 */ 4527 if (!domain_use_first_level(domain)) { 4528 domain->set_pte_snp = true; 4529 return; 4530 } 4531 4532 list_for_each_entry(info, &domain->devices, link) 4533 intel_pasid_setup_page_snoop_control(info->iommu, info->dev, 4534 PASID_RID2PASID); 4535 } 4536 4537 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) 4538 { 4539 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4540 unsigned long flags; 4541 4542 if (dmar_domain->force_snooping) 4543 return true; 4544 4545 spin_lock_irqsave(&device_domain_lock, flags); 4546 if (!domain_support_force_snooping(dmar_domain)) { 4547 spin_unlock_irqrestore(&device_domain_lock, flags); 4548 return false; 4549 } 4550 4551 domain_set_force_snooping(dmar_domain); 4552 dmar_domain->force_snooping = true; 4553 spin_unlock_irqrestore(&device_domain_lock, flags); 4554 4555 return true; 4556 } 4557 4558 static bool intel_iommu_capable(enum iommu_cap cap) 4559 { 4560 if (cap == IOMMU_CAP_CACHE_COHERENCY) 4561 return true; 4562 if (cap == IOMMU_CAP_INTR_REMAP) 4563 return irq_remapping_enabled == 1; 4564 if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) 4565 return dmar_platform_optin(); 4566 4567 return false; 4568 } 4569 4570 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 4571 { 4572 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; 4573 struct device_domain_info *info; 4574 struct intel_iommu *iommu; 4575 unsigned long flags; 4576 u8 bus, devfn; 4577 4578 iommu = device_to_iommu(dev, &bus, &devfn); 4579 if (!iommu) 4580 return ERR_PTR(-ENODEV); 4581 4582 info = kzalloc(sizeof(*info), GFP_KERNEL); 4583 if (!info) 4584 return ERR_PTR(-ENOMEM); 4585 4586 if (dev_is_real_dma_subdevice(dev)) { 4587 info->bus = pdev->bus->number; 4588 info->devfn = pdev->devfn; 4589 info->segment = pci_domain_nr(pdev->bus); 4590 } else { 4591 info->bus = bus; 4592 info->devfn = devfn; 4593 info->segment = iommu->segment; 4594 } 4595 4596 info->dev = dev; 4597 info->iommu = iommu; 4598 if (dev_is_pci(dev)) { 4599 if (ecap_dev_iotlb_support(iommu->ecap) && 4600 pci_ats_supported(pdev) && 4601 dmar_ats_supported(pdev, iommu)) 4602 info->ats_supported = 1; 4603 4604 if (sm_supported(iommu)) { 4605 if (pasid_supported(iommu)) { 4606 int features = pci_pasid_features(pdev); 4607 4608 if (features >= 0) 4609 info->pasid_supported = features | 1; 4610 } 4611 4612 if (info->ats_supported && ecap_prs(iommu->ecap) && 4613 pci_pri_supported(pdev)) 4614 info->pri_supported = 1; 4615 } 4616 } 4617 4618 spin_lock_irqsave(&device_domain_lock, flags); 4619 list_add(&info->global, &device_domain_list); 4620 dev_iommu_priv_set(dev, info); 4621 spin_unlock_irqrestore(&device_domain_lock, flags); 4622 4623 return &iommu->iommu; 4624 } 4625 4626 static void intel_iommu_release_device(struct device *dev) 4627 { 4628 struct device_domain_info *info = dev_iommu_priv_get(dev); 4629 unsigned long flags; 4630 4631 dmar_remove_one_dev_info(dev); 4632 4633 spin_lock_irqsave(&device_domain_lock, flags); 4634 dev_iommu_priv_set(dev, NULL); 4635 list_del(&info->global); 4636 spin_unlock_irqrestore(&device_domain_lock, flags); 4637 4638 kfree(info); 4639 set_dma_ops(dev, NULL); 4640 } 4641 4642 static void intel_iommu_probe_finalize(struct device *dev) 4643 { 4644 set_dma_ops(dev, NULL); 4645 iommu_setup_dma_ops(dev, 0, U64_MAX); 4646 } 4647 4648 static void intel_iommu_get_resv_regions(struct device *device, 4649 struct list_head *head) 4650 { 4651 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 4652 struct iommu_resv_region *reg; 4653 struct 
dmar_rmrr_unit *rmrr; 4654 struct device *i_dev; 4655 int i; 4656 4657 down_read(&dmar_global_lock); 4658 for_each_rmrr_units(rmrr) { 4659 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 4660 i, i_dev) { 4661 struct iommu_resv_region *resv; 4662 enum iommu_resv_type type; 4663 size_t length; 4664 4665 if (i_dev != device && 4666 !is_downstream_to_pci_bridge(device, i_dev)) 4667 continue; 4668 4669 length = rmrr->end_address - rmrr->base_address + 1; 4670 4671 type = device_rmrr_is_relaxable(device) ? 4672 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 4673 4674 resv = iommu_alloc_resv_region(rmrr->base_address, 4675 length, prot, type); 4676 if (!resv) 4677 break; 4678 4679 list_add_tail(&resv->list, head); 4680 } 4681 } 4682 up_read(&dmar_global_lock); 4683 4684 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 4685 if (dev_is_pci(device)) { 4686 struct pci_dev *pdev = to_pci_dev(device); 4687 4688 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 4689 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 4690 IOMMU_RESV_DIRECT_RELAXABLE); 4691 if (reg) 4692 list_add_tail(&reg->list, head); 4693 } 4694 } 4695 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 4696 4697 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 4698 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 4699 0, IOMMU_RESV_MSI); 4700 if (!reg) 4701 return; 4702 list_add_tail(&reg->list, head); 4703 } 4704 4705 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 4706 { 4707 struct device_domain_info *info = dev_iommu_priv_get(dev); 4708 struct context_entry *context; 4709 struct dmar_domain *domain; 4710 unsigned long flags; 4711 u64 ctx_lo; 4712 int ret; 4713 4714 domain = info->domain; 4715 if (!domain) 4716 return -EINVAL; 4717 4718 spin_lock_irqsave(&device_domain_lock, flags); 4719 spin_lock(&iommu->lock); 4720 4721 ret = -EINVAL; 4722 if (!info->pasid_supported) 4723 goto out; 4724 4725 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 4726 if (WARN_ON(!context)) 4727 goto out; 4728 4729 ctx_lo = context[0].lo; 4730 4731 if (!(ctx_lo & CONTEXT_PASIDE)) { 4732 ctx_lo |= CONTEXT_PASIDE; 4733 context[0].lo = ctx_lo; 4734 wmb(); 4735 iommu->flush.flush_context(iommu, 4736 domain->iommu_did[iommu->seq_id], 4737 PCI_DEVID(info->bus, info->devfn), 4738 DMA_CCMD_MASK_NOBIT, 4739 DMA_CCMD_DEVICE_INVL); 4740 } 4741 4742 /* Enable PASID support in the device, if it wasn't already */ 4743 if (!info->pasid_enabled) 4744 iommu_enable_dev_iotlb(info); 4745 4746 ret = 0; 4747 4748 out: 4749 spin_unlock(&iommu->lock); 4750 spin_unlock_irqrestore(&device_domain_lock, flags); 4751 4752 return ret; 4753 } 4754 4755 static struct iommu_group *intel_iommu_device_group(struct device *dev) 4756 { 4757 if (dev_is_pci(dev)) 4758 return pci_device_group(dev); 4759 return generic_device_group(dev); 4760 } 4761 4762 static int intel_iommu_enable_sva(struct device *dev) 4763 { 4764 struct device_domain_info *info = dev_iommu_priv_get(dev); 4765 struct intel_iommu *iommu; 4766 int ret; 4767 4768 if (!info || dmar_disabled) 4769 return -EINVAL; 4770 4771 iommu = info->iommu; 4772 if (!iommu) 4773 return -EINVAL; 4774 4775 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 4776 return -ENODEV; 4777 4778 if (intel_iommu_enable_pasid(iommu, dev)) 4779 return -ENODEV; 4780 4781 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 4782 return -EINVAL; 4783 4784 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 4785 if (!ret) 4786 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 4787 4788 return ret; 4789 }
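/*
 * Reverse of intel_iommu_enable_sva(): unregister the device's fault
 * handler first, then remove the device from the IOMMU's I/O page fault
 * queue.
 */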
4790 4791 static int intel_iommu_disable_sva(struct device *dev) 4792 { 4793 struct device_domain_info *info = dev_iommu_priv_get(dev); 4794 struct intel_iommu *iommu = info->iommu; 4795 int ret; 4796 4797 ret = iommu_unregister_device_fault_handler(dev); 4798 if (!ret) 4799 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 4800 4801 return ret; 4802 } 4803 4804 static int intel_iommu_enable_iopf(struct device *dev) 4805 { 4806 struct device_domain_info *info = dev_iommu_priv_get(dev); 4807 4808 if (info && info->pri_supported) 4809 return 0; 4810 4811 return -ENODEV; 4812 } 4813 4814 static int 4815 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 4816 { 4817 switch (feat) { 4818 case IOMMU_DEV_FEAT_IOPF: 4819 return intel_iommu_enable_iopf(dev); 4820 4821 case IOMMU_DEV_FEAT_SVA: 4822 return intel_iommu_enable_sva(dev); 4823 4824 default: 4825 return -ENODEV; 4826 } 4827 } 4828 4829 static int 4830 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 4831 { 4832 switch (feat) { 4833 case IOMMU_DEV_FEAT_IOPF: 4834 return 0; 4835 4836 case IOMMU_DEV_FEAT_SVA: 4837 return intel_iommu_disable_sva(dev); 4838 4839 default: 4840 return -ENODEV; 4841 } 4842 } 4843 4844 static bool intel_iommu_is_attach_deferred(struct device *dev) 4845 { 4846 struct device_domain_info *info = dev_iommu_priv_get(dev); 4847 4848 return translation_pre_enabled(info->iommu) && !info->domain; 4849 } 4850 4851 /* 4852 * Check that the device does not live on an external facing PCI port that is 4853 * marked as untrusted. Such devices should not be able to apply quirks and 4854 * thus not be able to bypass the IOMMU restrictions. 4855 */ 4856 static bool risky_device(struct pci_dev *pdev) 4857 { 4858 if (pdev->untrusted) { 4859 pci_info(pdev, 4860 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 4861 pdev->vendor, pdev->device); 4862 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 4863 return true; 4864 } 4865 return false; 4866 } 4867 4868 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 4869 unsigned long iova, size_t size) 4870 { 4871 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4872 unsigned long pages = aligned_nrpages(iova, size); 4873 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 4874 struct intel_iommu *iommu; 4875 int iommu_id; 4876 4877 for_each_domain_iommu(iommu_id, dmar_domain) { 4878 iommu = g_iommus[iommu_id]; 4879 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 4880 } 4881 } 4882 4883 const struct iommu_ops intel_iommu_ops = { 4884 .capable = intel_iommu_capable, 4885 .domain_alloc = intel_iommu_domain_alloc, 4886 .probe_device = intel_iommu_probe_device, 4887 .probe_finalize = intel_iommu_probe_finalize, 4888 .release_device = intel_iommu_release_device, 4889 .get_resv_regions = intel_iommu_get_resv_regions, 4890 .put_resv_regions = generic_iommu_put_resv_regions, 4891 .device_group = intel_iommu_device_group, 4892 .dev_enable_feat = intel_iommu_dev_enable_feat, 4893 .dev_disable_feat = intel_iommu_dev_disable_feat, 4894 .is_attach_deferred = intel_iommu_is_attach_deferred, 4895 .def_domain_type = device_def_domain_type, 4896 .pgsize_bitmap = SZ_4K, 4897 #ifdef CONFIG_INTEL_IOMMU_SVM 4898 .sva_bind = intel_svm_bind, 4899 .sva_unbind = intel_svm_unbind, 4900 .sva_get_pasid = intel_svm_get_pasid, 4901 .page_response = intel_svm_page_response, 4902 #endif 4903 .default_domain_ops = &(const struct iommu_domain_ops) { 4904 .attach_dev = intel_iommu_attach_device, 
4905 .detach_dev = intel_iommu_detach_device, 4906 .map_pages = intel_iommu_map_pages, 4907 .unmap_pages = intel_iommu_unmap_pages, 4908 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 4909 .flush_iotlb_all = intel_flush_iotlb_all, 4910 .iotlb_sync = intel_iommu_tlb_sync, 4911 .iova_to_phys = intel_iommu_iova_to_phys, 4912 .free = intel_iommu_domain_free, 4913 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, 4914 } 4915 }; 4916 4917 static void quirk_iommu_igfx(struct pci_dev *dev) 4918 { 4919 if (risky_device(dev)) 4920 return; 4921 4922 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 4923 dmar_map_gfx = 0; 4924 } 4925 4926 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 4934 4935 /* Broadwell igfx malfunctions with dmar */ 4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 4960 4961 static void quirk_iommu_rwbf(struct pci_dev *dev) 4962 { 4963 if (risky_device(dev)) 4964 return; 4965 4966 /* 4967 * Mobile 4 Series Chipset neglects to set RWBF capability, 4968 * but needs it. Same seems to hold for the desktop versions. 
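 * Setting rwbf_quirk below forces write-buffer flushes in the driver even
 * though the capability register does not advertise RWBF.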
4969 */ 4970 pci_info(dev, "Forcing write-buffer flush capability\n"); 4971 rwbf_quirk = 1; 4972 } 4973 4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 4981 4982 #define GGC 0x52 4983 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 4984 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 4985 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 4986 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 4987 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 4988 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 4989 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 4990 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 4991 4992 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 4993 { 4994 unsigned short ggc; 4995 4996 if (risky_device(dev)) 4997 return; 4998 4999 if (pci_read_config_word(dev, GGC, &ggc)) 5000 return; 5001 5002 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5003 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5004 dmar_map_gfx = 0; 5005 } else if (dmar_map_gfx) { 5006 /* we have to ensure the gfx device is idle before we flush */ 5007 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5008 iommu_set_dma_strict(); 5009 } 5010 } 5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5015 5016 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5017 { 5018 unsigned short ver; 5019 5020 if (!IS_GFX_DEVICE(dev)) 5021 return; 5022 5023 ver = (dev->device >> 8) & 0xff; 5024 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5025 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5026 ver != 0x9a && ver != 0xa7) 5027 return; 5028 5029 if (risky_device(dev)) 5030 return; 5031 5032 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5033 iommu_skip_te_disable = 1; 5034 } 5035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5036 5037 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5038 ISOCH DMAR unit for the Azalia sound device, but not give it any 5039 TLB entries, which causes it to deadlock. Check for that. We do 5040 this in a function called from init_dmars(), instead of in a PCI 5041 quirk, because we don't want to print the obnoxious "BIOS broken" 5042 message if VT-d is actually disabled. 5043 */ 5044 static void __init check_tylersburg_isoch(void) 5045 { 5046 struct pci_dev *pdev; 5047 uint32_t vtisochctrl; 5048 5049 /* If there's no Azalia in the system anyway, forget it. */ 5050 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5051 if (!pdev) 5052 return; 5053 5054 if (risky_device(pdev)) { 5055 pci_dev_put(pdev); 5056 return; 5057 } 5058 5059 pci_dev_put(pdev); 5060 5061 /* System Management Registers. Might be hidden, in which case 5062 we can't do the sanity check. But that's OK, because the 5063 known-broken BIOSes _don't_ actually hide it, so far. 
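 * (If they are hidden, the device lookup below simply fails and the
 * check is skipped.)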
*/ 5064 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5065 if (!pdev) 5066 return; 5067 5068 if (risky_device(pdev)) { 5069 pci_dev_put(pdev); 5070 return; 5071 } 5072 5073 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5074 pci_dev_put(pdev); 5075 return; 5076 } 5077 5078 pci_dev_put(pdev); 5079 5080 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5081 if (vtisochctrl & 1) 5082 return; 5083 5084 /* Drop all bits other than the number of TLB entries */ 5085 vtisochctrl &= 0x1c; 5086 5087 /* If we have the recommended number of TLB entries (16), fine. */ 5088 if (vtisochctrl == 0x10) 5089 return; 5090 5091 /* Zero TLB entries? You get to ride the short bus to school. */ 5092 if (!vtisochctrl) { 5093 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5094 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5095 dmi_get_system_info(DMI_BIOS_VENDOR), 5096 dmi_get_system_info(DMI_BIOS_VERSION), 5097 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5098 iommu_identity_mapping |= IDENTMAP_AZALIA; 5099 return; 5100 } 5101 5102 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5103 vtisochctrl); 5104 } 5105