1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/crash_dump.h> 17 #include <linux/dma-direct.h> 18 #include <linux/dma-iommu.h> 19 #include <linux/dmi.h> 20 #include <linux/intel-iommu.h> 21 #include <linux/intel-svm.h> 22 #include <linux/memory.h> 23 #include <linux/pci.h> 24 #include <linux/pci-ats.h> 25 #include <linux/spinlock.h> 26 #include <linux/syscore_ops.h> 27 #include <linux/tboot.h> 28 29 #include "../irq_remapping.h" 30 #include "../iommu-sva-lib.h" 31 #include "pasid.h" 32 #include "cap_audit.h" 33 34 #define ROOT_SIZE VTD_PAGE_SIZE 35 #define CONTEXT_SIZE VTD_PAGE_SIZE 36 37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 41 42 #define IOAPIC_RANGE_START (0xfee00000) 43 #define IOAPIC_RANGE_END (0xfeefffff) 44 #define IOVA_START_ADDR (0x1000) 45 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 48 #define MAX_AGAW_WIDTH 64 49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 50 51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 53 54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 59 60 /* IO virtual address start page frame number */ 61 #define IOVA_START_PFN (1) 62 63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 64 65 /* page table handling */ 66 #define LEVEL_STRIDE (9) 67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 68 69 static inline int agaw_to_level(int agaw) 70 { 71 return agaw + 2; 72 } 73 74 static inline int agaw_to_width(int agaw) 75 { 76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 77 } 78 79 static inline int width_to_agaw(int width) 80 { 81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 82 } 83 84 static inline unsigned int level_to_offset_bits(int level) 85 { 86 return (level - 1) * LEVEL_STRIDE; 87 } 88 89 static inline int pfn_level_offset(u64 pfn, int level) 90 { 91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 92 } 93 94 static inline u64 level_mask(int level) 95 { 96 return -1ULL << level_to_offset_bits(level); 97 } 98 99 static inline u64 level_size(int level) 100 { 101 return 1ULL << level_to_offset_bits(level); 102 } 103 104 static inline u64 align_to_level(u64 pfn, int level) 105 { 106 return (pfn + level_size(level) - 1) & level_mask(level); 107 } 108 109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 110 { 111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 112 } 113 114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 115 are never going to work. 
*/ 116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 117 { 118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 119 } 120 static inline unsigned long page_to_dma_pfn(struct page *pg) 121 { 122 return mm_to_dma_pfn(page_to_pfn(pg)); 123 } 124 static inline unsigned long virt_to_dma_pfn(void *p) 125 { 126 return page_to_dma_pfn(virt_to_page(p)); 127 } 128 129 /* global iommu list, set NULL for ignored DMAR units */ 130 static struct intel_iommu **g_iommus; 131 132 static void __init check_tylersburg_isoch(void); 133 static int rwbf_quirk; 134 static inline struct device_domain_info * 135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn); 136 137 /* 138 * set to 1 to panic kernel if can't successfully enable VT-d 139 * (used when kernel is launched w/ TXT) 140 */ 141 static int force_on = 0; 142 static int intel_iommu_tboot_noforce; 143 static int no_platform_optin; 144 145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 146 147 /* 148 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 149 * if marked present. 150 */ 151 static phys_addr_t root_entry_lctp(struct root_entry *re) 152 { 153 if (!(re->lo & 1)) 154 return 0; 155 156 return re->lo & VTD_PAGE_MASK; 157 } 158 159 /* 160 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 161 * if marked present. 162 */ 163 static phys_addr_t root_entry_uctp(struct root_entry *re) 164 { 165 if (!(re->hi & 1)) 166 return 0; 167 168 return re->hi & VTD_PAGE_MASK; 169 } 170 171 static inline void context_clear_pasid_enable(struct context_entry *context) 172 { 173 context->lo &= ~(1ULL << 11); 174 } 175 176 static inline bool context_pasid_enabled(struct context_entry *context) 177 { 178 return !!(context->lo & (1ULL << 11)); 179 } 180 181 static inline void context_set_copied(struct context_entry *context) 182 { 183 context->hi |= (1ull << 3); 184 } 185 186 static inline bool context_copied(struct context_entry *context) 187 { 188 return !!(context->hi & (1ULL << 3)); 189 } 190 191 static inline bool __context_present(struct context_entry *context) 192 { 193 return (context->lo & 1); 194 } 195 196 bool context_present(struct context_entry *context) 197 { 198 return context_pasid_enabled(context) ? 
199 				__context_present(context) :
200 				__context_present(context) && !context_copied(context);
201 }
202
203 static inline void context_set_present(struct context_entry *context)
204 {
205 	context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210 	context->lo &= (((u64)-1) << 2) | 1;
211 }
212
213 static inline void context_set_translation_type(struct context_entry *context,
214 						unsigned long value)
215 {
216 	context->lo &= (((u64)-1) << 4) | 3;
217 	context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221 					    unsigned long value)
222 {
223 	context->lo &= ~VTD_PAGE_MASK;
224 	context->lo |= value & VTD_PAGE_MASK;
225 }
226
227 static inline void context_set_address_width(struct context_entry *context,
228 					     unsigned long value)
229 {
230 	context->hi |= value & 7;
231 }
232
233 static inline void context_set_domain_id(struct context_entry *context,
234 					 unsigned long value)
235 {
236 	context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238
239 static inline int context_domain_id(struct context_entry *c)
240 {
241 	return((c->hi >> 8) & 0xffff);
242 }
243
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246 	context->lo = 0;
247 	context->hi = 0;
248 }
249
250 /*
251  * This domain is a static identity-mapping domain.
252  * 1. This domain creates a static 1:1 mapping to all usable memory.
253  * 2. It maps to each iommu if successful.
254  * 3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258
259 #define for_each_domain_iommu(idx, domain)		\
260 	for (idx = 0; idx < g_num_of_iommus; idx++)	\
261 		if (domain->iommu_refcnt[idx])
262
263 struct dmar_rmrr_unit {
264 	struct list_head list;		/* list of rmrr units */
265 	struct acpi_dmar_header *hdr;	/* ACPI header */
266 	u64 base_address;		/* reserved base address */
267 	u64 end_address;		/* reserved end address */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	int devices_cnt;		/* target device count */
270 };
271
272 struct dmar_atsr_unit {
273 	struct list_head list;		/* list of ATSR units */
274 	struct acpi_dmar_header *hdr;	/* ACPI header */
275 	struct dmar_dev_scope *devices;	/* target devices */
276 	int devices_cnt;		/* target device count */
277 	u8 include_all:1;		/* include all ports */
278 };
279
280 struct dmar_satc_unit {
281 	struct list_head list;		/* list of SATC units */
282 	struct acpi_dmar_header *hdr;	/* ACPI header */
283 	struct dmar_dev_scope *devices;	/* target devices */
284 	struct intel_iommu *iommu;	/* the corresponding iommu */
285 	int devices_cnt;		/* target device count */
286 	u8 atc_required:1;		/* ATS is required */
287 };
288
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292
293 #define for_each_rmrr_units(rmrr) \
294 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295
296 /* number of IOMMUs; used to size g_iommus and bound per-domain iteration */
297 static int g_num_of_iommus;
298
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308
309 static int dmar_map_gfx = 1;
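/*
 * Quick reference (summarized from intel_iommu_setup() below, not part of
 * the upstream comments): the comma-separated intel_iommu= boot options
 * map onto the tunables declared around here, e.g. "intel_iommu=on,sm_on":
 *
 *   on             dmar_disabled = 0
 *   off            dmar_disabled = 1, no_platform_optin = 1
 *   igfx_off       dmar_map_gfx = 0
 *   forcedac       iommu_dma_forcedac = true (deprecated, use iommu.forcedac)
 *   strict         iommu_set_dma_strict()   (deprecated, use iommu.strict=1)
 *   sp_off         intel_iommu_superpage = 0
 *   sm_on / sm_off intel_iommu_sm = 1 / 0
 *   tboot_noforce  intel_iommu_tboot_noforce = 1
 */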
static int intel_iommu_superpage = 1; 311 static int iommu_identity_mapping; 312 static int iommu_skip_te_disable; 313 314 #define IDENTMAP_GFX 2 315 #define IDENTMAP_AZALIA 4 316 317 int intel_iommu_gfx_mapped; 318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 319 320 DEFINE_SPINLOCK(device_domain_lock); 321 static LIST_HEAD(device_domain_list); 322 323 /* 324 * Iterate over elements in device_domain_list and call the specified 325 * callback @fn against each element. 326 */ 327 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 328 void *data), void *data) 329 { 330 int ret = 0; 331 unsigned long flags; 332 struct device_domain_info *info; 333 334 spin_lock_irqsave(&device_domain_lock, flags); 335 list_for_each_entry(info, &device_domain_list, global) { 336 ret = fn(info, data); 337 if (ret) { 338 spin_unlock_irqrestore(&device_domain_lock, flags); 339 return ret; 340 } 341 } 342 spin_unlock_irqrestore(&device_domain_lock, flags); 343 344 return 0; 345 } 346 347 const struct iommu_ops intel_iommu_ops; 348 349 static bool translation_pre_enabled(struct intel_iommu *iommu) 350 { 351 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 352 } 353 354 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 355 { 356 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 357 } 358 359 static void init_translation_status(struct intel_iommu *iommu) 360 { 361 u32 gsts; 362 363 gsts = readl(iommu->reg + DMAR_GSTS_REG); 364 if (gsts & DMA_GSTS_TES) 365 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 366 } 367 368 static int __init intel_iommu_setup(char *str) 369 { 370 if (!str) 371 return -EINVAL; 372 373 while (*str) { 374 if (!strncmp(str, "on", 2)) { 375 dmar_disabled = 0; 376 pr_info("IOMMU enabled\n"); 377 } else if (!strncmp(str, "off", 3)) { 378 dmar_disabled = 1; 379 no_platform_optin = 1; 380 pr_info("IOMMU disabled\n"); 381 } else if (!strncmp(str, "igfx_off", 8)) { 382 dmar_map_gfx = 0; 383 pr_info("Disable GFX device mapping\n"); 384 } else if (!strncmp(str, "forcedac", 8)) { 385 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 386 iommu_dma_forcedac = true; 387 } else if (!strncmp(str, "strict", 6)) { 388 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 389 iommu_set_dma_strict(); 390 } else if (!strncmp(str, "sp_off", 6)) { 391 pr_info("Disable supported super page\n"); 392 intel_iommu_superpage = 0; 393 } else if (!strncmp(str, "sm_on", 5)) { 394 pr_info("Enable scalable mode if hardware supports\n"); 395 intel_iommu_sm = 1; 396 } else if (!strncmp(str, "sm_off", 6)) { 397 pr_info("Scalable mode is disallowed\n"); 398 intel_iommu_sm = 0; 399 } else if (!strncmp(str, "tboot_noforce", 13)) { 400 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose a security risk for tboot\n");
401 			intel_iommu_tboot_noforce = 1;
402 		} else {
403 			pr_notice("Unknown option - '%s'\n", str);
404 		}
405
406 		str += strcspn(str, ",");
407 		while (*str == ',')
408 			str++;
409 	}
410
411 	return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
414
415 void *alloc_pgtable_page(int node)
416 {
417 	struct page *page;
418 	void *vaddr = NULL;
419
420 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421 	if (page)
422 		vaddr = page_address(page);
423 	return vaddr;
424 }
425
426 void free_pgtable_page(void *vaddr)
427 {
428 	free_page((unsigned long)vaddr);
429 }
430
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442 				       unsigned long pfn)
443 {
444 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445
446 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451 	unsigned long sagaw;
452 	int agaw;
453
454 	sagaw = cap_sagaw(iommu->cap);
455 	for (agaw = width_to_agaw(max_gaw);
456 	     agaw >= 0; agaw--) {
457 		if (test_bit(agaw, &sagaw))
458 			break;
459 	}
460
461 	return agaw;
462 }
463
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471
472 /*
473  * Calculate the AGAW for each iommu.
474  * "SAGAW" may differ across iommus, so use a default AGAW and fall back
475  * to a smaller supported AGAW for iommus that don't support the default.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
481
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485 	int iommu_id;
486
487 	/* si_domain and vm domain should not get here. */
488 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489 		return NULL;
490
491 	for_each_domain_iommu(iommu_id, domain)
492 		break;
493
494 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495 		return NULL;
496
497 	return g_iommus[iommu_id];
498 }
499
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502 	return sm_supported(iommu) ?
503 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 504 } 505 506 static void domain_update_iommu_coherency(struct dmar_domain *domain) 507 { 508 struct dmar_drhd_unit *drhd; 509 struct intel_iommu *iommu; 510 bool found = false; 511 int i; 512 513 domain->iommu_coherency = true; 514 515 for_each_domain_iommu(i, domain) { 516 found = true; 517 if (!iommu_paging_structure_coherency(g_iommus[i])) { 518 domain->iommu_coherency = false; 519 break; 520 } 521 } 522 if (found) 523 return; 524 525 /* No hardware attached; use lowest common denominator */ 526 rcu_read_lock(); 527 for_each_active_iommu(iommu, drhd) { 528 if (!iommu_paging_structure_coherency(iommu)) { 529 domain->iommu_coherency = false; 530 break; 531 } 532 } 533 rcu_read_unlock(); 534 } 535 536 static bool domain_update_iommu_snooping(struct intel_iommu *skip) 537 { 538 struct dmar_drhd_unit *drhd; 539 struct intel_iommu *iommu; 540 bool ret = true; 541 542 rcu_read_lock(); 543 for_each_active_iommu(iommu, drhd) { 544 if (iommu != skip) { 545 /* 546 * If the hardware is operating in the scalable mode, 547 * the snooping control is always supported since we 548 * always set PASID-table-entry.PGSNP bit if the domain 549 * is managed outside (UNMANAGED). 550 */ 551 if (!sm_supported(iommu) && 552 !ecap_sc_support(iommu->ecap)) { 553 ret = false; 554 break; 555 } 556 } 557 } 558 rcu_read_unlock(); 559 560 return ret; 561 } 562 563 static int domain_update_iommu_superpage(struct dmar_domain *domain, 564 struct intel_iommu *skip) 565 { 566 struct dmar_drhd_unit *drhd; 567 struct intel_iommu *iommu; 568 int mask = 0x3; 569 570 if (!intel_iommu_superpage) 571 return 0; 572 573 /* set iommu_superpage to the smallest common denominator */ 574 rcu_read_lock(); 575 for_each_active_iommu(iommu, drhd) { 576 if (iommu != skip) { 577 if (domain && domain_use_first_level(domain)) { 578 if (!cap_fl1gp_support(iommu->cap)) 579 mask = 0x1; 580 } else { 581 mask &= cap_super_page_val(iommu->cap); 582 } 583 584 if (!mask) 585 break; 586 } 587 } 588 rcu_read_unlock(); 589 590 return fls(mask); 591 } 592 593 static int domain_update_device_node(struct dmar_domain *domain) 594 { 595 struct device_domain_info *info; 596 int nid = NUMA_NO_NODE; 597 598 assert_spin_locked(&device_domain_lock); 599 600 if (list_empty(&domain->devices)) 601 return NUMA_NO_NODE; 602 603 list_for_each_entry(info, &domain->devices, link) { 604 if (!info->dev) 605 continue; 606 607 /* 608 * There could possibly be multiple device numa nodes as devices 609 * within the same domain may sit behind different IOMMUs. There 610 * isn't perfect answer in such situation, so we select first 611 * come first served policy. 612 */ 613 nid = dev_to_node(info->dev); 614 if (nid != NUMA_NO_NODE) 615 break; 616 } 617 618 return nid; 619 } 620 621 static void domain_update_iotlb(struct dmar_domain *domain); 622 623 /* Return the super pagesize bitmap if supported. */ 624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 625 { 626 unsigned long bitmap = 0; 627 628 /* 629 * 1-level super page supports page size of 2MiB, 2-level super page 630 * supports page size of both 2MiB and 1GiB. 
631 */ 632 if (domain->iommu_superpage == 1) 633 bitmap |= SZ_2M; 634 else if (domain->iommu_superpage == 2) 635 bitmap |= SZ_2M | SZ_1G; 636 637 return bitmap; 638 } 639 640 /* Some capabilities may be different across iommus */ 641 static void domain_update_iommu_cap(struct dmar_domain *domain) 642 { 643 domain_update_iommu_coherency(domain); 644 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 645 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 646 647 /* 648 * If RHSA is missing, we should default to the device numa domain 649 * as fall back. 650 */ 651 if (domain->nid == NUMA_NO_NODE) 652 domain->nid = domain_update_device_node(domain); 653 654 /* 655 * First-level translation restricts the input-address to a 656 * canonical address (i.e., address bits 63:N have the same 657 * value as address bit [N-1], where N is 48-bits with 4-level 658 * paging and 57-bits with 5-level paging). Hence, skip bit 659 * [N-1]. 660 */ 661 if (domain_use_first_level(domain)) 662 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 663 else 664 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 665 666 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 667 domain_update_iotlb(domain); 668 } 669 670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 671 u8 devfn, int alloc) 672 { 673 struct root_entry *root = &iommu->root_entry[bus]; 674 struct context_entry *context; 675 u64 *entry; 676 677 entry = &root->lo; 678 if (sm_supported(iommu)) { 679 if (devfn >= 0x80) { 680 devfn -= 0x80; 681 entry = &root->hi; 682 } 683 devfn *= 2; 684 } 685 if (*entry & 1) 686 context = phys_to_virt(*entry & VTD_PAGE_MASK); 687 else { 688 unsigned long phy_addr; 689 if (!alloc) 690 return NULL; 691 692 context = alloc_pgtable_page(iommu->node); 693 if (!context) 694 return NULL; 695 696 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 697 phy_addr = virt_to_phys((void *)context); 698 *entry = phy_addr | 1; 699 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 700 } 701 return &context[devfn]; 702 } 703 704 /** 705 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 706 * sub-hierarchy of a candidate PCI-PCI bridge 707 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 708 * @bridge: the candidate PCI-PCI bridge 709 * 710 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 711 */ 712 static bool 713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 714 { 715 struct pci_dev *pdev, *pbridge; 716 717 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 718 return false; 719 720 pdev = to_pci_dev(dev); 721 pbridge = to_pci_dev(bridge); 722 723 if (pbridge->subordinate && 724 pbridge->subordinate->number <= pdev->bus->number && 725 pbridge->subordinate->busn_res.end >= pdev->bus->number) 726 return true; 727 728 return false; 729 } 730 731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 732 { 733 struct dmar_drhd_unit *drhd; 734 u32 vtbar; 735 int rc; 736 737 /* We know that this device on this chipset has its own IOMMU. 738 * If we find it under a different IOMMU, then the BIOS is lying 739 * to us. Hope that the IOMMU for this device is actually 740 * disabled, and it needs no translation... 
741 */ 742 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 743 if (rc) { 744 /* "can't" happen */ 745 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 746 return false; 747 } 748 vtbar &= 0xffff0000; 749 750 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 751 drhd = dmar_find_matched_drhd_unit(pdev); 752 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 753 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 754 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 755 return true; 756 } 757 758 return false; 759 } 760 761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 762 { 763 if (!iommu || iommu->drhd->ignored) 764 return true; 765 766 if (dev_is_pci(dev)) { 767 struct pci_dev *pdev = to_pci_dev(dev); 768 769 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 770 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 771 quirk_ioat_snb_local_iommu(pdev)) 772 return true; 773 } 774 775 return false; 776 } 777 778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 779 { 780 struct dmar_drhd_unit *drhd = NULL; 781 struct pci_dev *pdev = NULL; 782 struct intel_iommu *iommu; 783 struct device *tmp; 784 u16 segment = 0; 785 int i; 786 787 if (!dev) 788 return NULL; 789 790 if (dev_is_pci(dev)) { 791 struct pci_dev *pf_pdev; 792 793 pdev = pci_real_dma_dev(to_pci_dev(dev)); 794 795 /* VFs aren't listed in scope tables; we need to look up 796 * the PF instead to find the IOMMU. */ 797 pf_pdev = pci_physfn(pdev); 798 dev = &pf_pdev->dev; 799 segment = pci_domain_nr(pdev->bus); 800 } else if (has_acpi_companion(dev)) 801 dev = &ACPI_COMPANION(dev)->dev; 802 803 rcu_read_lock(); 804 for_each_iommu(iommu, drhd) { 805 if (pdev && segment != drhd->segment) 806 continue; 807 808 for_each_active_dev_scope(drhd->devices, 809 drhd->devices_cnt, i, tmp) { 810 if (tmp == dev) { 811 /* For a VF use its original BDF# not that of the PF 812 * which we used for the IOMMU lookup. Strictly speaking 813 * we could do this for all PCI devices; we only need to 814 * get the BDF# from the scope table for ACPI matches. 
*/ 815 if (pdev && pdev->is_virtfn) 816 goto got_pdev; 817 818 if (bus && devfn) { 819 *bus = drhd->devices[i].bus; 820 *devfn = drhd->devices[i].devfn; 821 } 822 goto out; 823 } 824 825 if (is_downstream_to_pci_bridge(dev, tmp)) 826 goto got_pdev; 827 } 828 829 if (pdev && drhd->include_all) { 830 got_pdev: 831 if (bus && devfn) { 832 *bus = pdev->bus->number; 833 *devfn = pdev->devfn; 834 } 835 goto out; 836 } 837 } 838 iommu = NULL; 839 out: 840 if (iommu_is_dummy(iommu, dev)) 841 iommu = NULL; 842 843 rcu_read_unlock(); 844 845 return iommu; 846 } 847 848 static void domain_flush_cache(struct dmar_domain *domain, 849 void *addr, int size) 850 { 851 if (!domain->iommu_coherency) 852 clflush_cache_range(addr, size); 853 } 854 855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 856 { 857 struct context_entry *context; 858 int ret = 0; 859 unsigned long flags; 860 861 spin_lock_irqsave(&iommu->lock, flags); 862 context = iommu_context_addr(iommu, bus, devfn, 0); 863 if (context) 864 ret = context_present(context); 865 spin_unlock_irqrestore(&iommu->lock, flags); 866 return ret; 867 } 868 869 static void free_context_table(struct intel_iommu *iommu) 870 { 871 int i; 872 unsigned long flags; 873 struct context_entry *context; 874 875 spin_lock_irqsave(&iommu->lock, flags); 876 if (!iommu->root_entry) { 877 goto out; 878 } 879 for (i = 0; i < ROOT_ENTRY_NR; i++) { 880 context = iommu_context_addr(iommu, i, 0, 0); 881 if (context) 882 free_pgtable_page(context); 883 884 if (!sm_supported(iommu)) 885 continue; 886 887 context = iommu_context_addr(iommu, i, 0x80, 0); 888 if (context) 889 free_pgtable_page(context); 890 891 } 892 free_pgtable_page(iommu->root_entry); 893 iommu->root_entry = NULL; 894 out: 895 spin_unlock_irqrestore(&iommu->lock, flags); 896 } 897 898 #ifdef CONFIG_DMAR_DEBUG 899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 900 { 901 struct device_domain_info *info; 902 struct dma_pte *parent, *pte; 903 struct dmar_domain *domain; 904 int offset, level; 905 906 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); 907 if (!info || !info->domain) { 908 pr_info("device [%02x:%02x.%d] not probed\n", 909 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 910 return; 911 } 912 913 domain = info->domain; 914 level = agaw_to_level(domain->agaw); 915 parent = domain->pgd; 916 if (!parent) { 917 pr_info("no page table setup\n"); 918 return; 919 } 920 921 while (1) { 922 offset = pfn_level_offset(pfn, level); 923 pte = &parent[offset]; 924 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 925 pr_info("PTE not present at level %d\n", level); 926 break; 927 } 928 929 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 930 931 if (level == 1) 932 break; 933 934 parent = phys_to_virt(dma_pte_addr(pte)); 935 level--; 936 } 937 } 938 939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 940 unsigned long long addr, u32 pasid) 941 { 942 struct pasid_dir_entry *dir, *pde; 943 struct pasid_entry *entries, *pte; 944 struct context_entry *ctx_entry; 945 struct root_entry *rt_entry; 946 u8 devfn = source_id & 0xff; 947 u8 bus = source_id >> 8; 948 int i, dir_index, index; 949 950 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 951 952 /* root entry dump */ 953 rt_entry = &iommu->root_entry[bus]; 954 if (!rt_entry) { 955 pr_info("root table entry is not present\n"); 956 return; 957 } 958 959 if (sm_supported(iommu)) 960 pr_info("scalable mode root entry: hi 
0x%016llx, low 0x%016llx\n", 961 rt_entry->hi, rt_entry->lo); 962 else 963 pr_info("root entry: 0x%016llx", rt_entry->lo); 964 965 /* context entry dump */ 966 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 967 if (!ctx_entry) { 968 pr_info("context table entry is not present\n"); 969 return; 970 } 971 972 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 973 ctx_entry->hi, ctx_entry->lo); 974 975 /* legacy mode does not require PASID entries */ 976 if (!sm_supported(iommu)) 977 goto pgtable_walk; 978 979 /* get the pointer to pasid directory entry */ 980 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 981 if (!dir) { 982 pr_info("pasid directory entry is not present\n"); 983 return; 984 } 985 /* For request-without-pasid, get the pasid from context entry */ 986 if (intel_iommu_sm && pasid == INVALID_IOASID) 987 pasid = PASID_RID2PASID; 988 989 dir_index = pasid >> PASID_PDE_SHIFT; 990 pde = &dir[dir_index]; 991 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 992 993 /* get the pointer to the pasid table entry */ 994 entries = get_pasid_table_from_pde(pde); 995 if (!entries) { 996 pr_info("pasid table entry is not present\n"); 997 return; 998 } 999 index = pasid & PASID_PTE_MASK; 1000 pte = &entries[index]; 1001 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 1002 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 1003 1004 pgtable_walk: 1005 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 1006 } 1007 #endif 1008 1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 1010 unsigned long pfn, int *target_level) 1011 { 1012 struct dma_pte *parent, *pte; 1013 int level = agaw_to_level(domain->agaw); 1014 int offset; 1015 1016 BUG_ON(!domain->pgd); 1017 1018 if (!domain_pfn_supported(domain, pfn)) 1019 /* Address beyond IOMMU's addressing capabilities. */ 1020 return NULL; 1021 1022 parent = domain->pgd; 1023 1024 while (1) { 1025 void *tmp_page; 1026 1027 offset = pfn_level_offset(pfn, level); 1028 pte = &parent[offset]; 1029 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1030 break; 1031 if (level == *target_level) 1032 break; 1033 1034 if (!dma_pte_present(pte)) { 1035 uint64_t pteval; 1036 1037 tmp_page = alloc_pgtable_page(domain->nid); 1038 1039 if (!tmp_page) 1040 return NULL; 1041 1042 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1043 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1044 if (domain_use_first_level(domain)) { 1045 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1046 if (iommu_is_dma_domain(&domain->domain)) 1047 pteval |= DMA_FL_PTE_ACCESS; 1048 } 1049 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1050 /* Someone else set it while we were thinking; use theirs. 
*/ 1051 free_pgtable_page(tmp_page); 1052 else 1053 domain_flush_cache(domain, pte, sizeof(*pte)); 1054 } 1055 if (level == 1) 1056 break; 1057 1058 parent = phys_to_virt(dma_pte_addr(pte)); 1059 level--; 1060 } 1061 1062 if (!*target_level) 1063 *target_level = level; 1064 1065 return pte; 1066 } 1067 1068 /* return address's pte at specific level */ 1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1070 unsigned long pfn, 1071 int level, int *large_page) 1072 { 1073 struct dma_pte *parent, *pte; 1074 int total = agaw_to_level(domain->agaw); 1075 int offset; 1076 1077 parent = domain->pgd; 1078 while (level <= total) { 1079 offset = pfn_level_offset(pfn, total); 1080 pte = &parent[offset]; 1081 if (level == total) 1082 return pte; 1083 1084 if (!dma_pte_present(pte)) { 1085 *large_page = total; 1086 break; 1087 } 1088 1089 if (dma_pte_superpage(pte)) { 1090 *large_page = total; 1091 return pte; 1092 } 1093 1094 parent = phys_to_virt(dma_pte_addr(pte)); 1095 total--; 1096 } 1097 return NULL; 1098 } 1099 1100 /* clear last level pte, a tlb flush should be followed */ 1101 static void dma_pte_clear_range(struct dmar_domain *domain, 1102 unsigned long start_pfn, 1103 unsigned long last_pfn) 1104 { 1105 unsigned int large_page; 1106 struct dma_pte *first_pte, *pte; 1107 1108 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1109 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1110 BUG_ON(start_pfn > last_pfn); 1111 1112 /* we don't need lock here; nobody else touches the iova range */ 1113 do { 1114 large_page = 1; 1115 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1116 if (!pte) { 1117 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1118 continue; 1119 } 1120 do { 1121 dma_clear_pte(pte); 1122 start_pfn += lvl_to_nr_pages(large_page); 1123 pte++; 1124 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1125 1126 domain_flush_cache(domain, first_pte, 1127 (void *)pte - (void *)first_pte); 1128 1129 } while (start_pfn && start_pfn <= last_pfn); 1130 } 1131 1132 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1133 int retain_level, struct dma_pte *pte, 1134 unsigned long pfn, unsigned long start_pfn, 1135 unsigned long last_pfn) 1136 { 1137 pfn = max(start_pfn, pfn); 1138 pte = &pte[pfn_level_offset(pfn, level)]; 1139 1140 do { 1141 unsigned long level_pfn; 1142 struct dma_pte *level_pte; 1143 1144 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1145 goto next; 1146 1147 level_pfn = pfn & level_mask(level); 1148 level_pte = phys_to_virt(dma_pte_addr(pte)); 1149 1150 if (level > 2) { 1151 dma_pte_free_level(domain, level - 1, retain_level, 1152 level_pte, level_pfn, start_pfn, 1153 last_pfn); 1154 } 1155 1156 /* 1157 * Free the page table if we're below the level we want to 1158 * retain and the range covers the entire table. 1159 */ 1160 if (level < retain_level && !(start_pfn > level_pfn || 1161 last_pfn < level_pfn + level_size(level) - 1)) { 1162 dma_clear_pte(pte); 1163 domain_flush_cache(domain, pte, sizeof(*pte)); 1164 free_pgtable_page(level_pte); 1165 } 1166 next: 1167 pfn += level_size(level); 1168 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1169 } 1170 1171 /* 1172 * clear last level (leaf) ptes and free page table pages below the 1173 * level we wish to keep intact. 
1174 */ 1175 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1176 unsigned long start_pfn, 1177 unsigned long last_pfn, 1178 int retain_level) 1179 { 1180 dma_pte_clear_range(domain, start_pfn, last_pfn); 1181 1182 /* We don't need lock here; nobody else touches the iova range */ 1183 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1184 domain->pgd, 0, start_pfn, last_pfn); 1185 1186 /* free pgd */ 1187 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1188 free_pgtable_page(domain->pgd); 1189 domain->pgd = NULL; 1190 } 1191 } 1192 1193 /* When a page at a given level is being unlinked from its parent, we don't 1194 need to *modify* it at all. All we need to do is make a list of all the 1195 pages which can be freed just as soon as we've flushed the IOTLB and we 1196 know the hardware page-walk will no longer touch them. 1197 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1198 be freed. */ 1199 static void dma_pte_list_pagetables(struct dmar_domain *domain, 1200 int level, struct dma_pte *pte, 1201 struct list_head *freelist) 1202 { 1203 struct page *pg; 1204 1205 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1206 list_add_tail(&pg->lru, freelist); 1207 1208 if (level == 1) 1209 return; 1210 1211 pte = page_address(pg); 1212 do { 1213 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1214 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1215 pte++; 1216 } while (!first_pte_in_page(pte)); 1217 } 1218 1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level, 1220 struct dma_pte *pte, unsigned long pfn, 1221 unsigned long start_pfn, unsigned long last_pfn, 1222 struct list_head *freelist) 1223 { 1224 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1225 1226 pfn = max(start_pfn, pfn); 1227 pte = &pte[pfn_level_offset(pfn, level)]; 1228 1229 do { 1230 unsigned long level_pfn = pfn & level_mask(level); 1231 1232 if (!dma_pte_present(pte)) 1233 goto next; 1234 1235 /* If range covers entire pagetable, free it */ 1236 if (start_pfn <= level_pfn && 1237 last_pfn >= level_pfn + level_size(level) - 1) { 1238 /* These suborbinate page tables are going away entirely. Don't 1239 bother to clear them; we're just going to *free* them. */ 1240 if (level > 1 && !dma_pte_superpage(pte)) 1241 dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1242 1243 dma_clear_pte(pte); 1244 if (!first_pte) 1245 first_pte = pte; 1246 last_pte = pte; 1247 } else if (level > 1) { 1248 /* Recurse down into a level that isn't *entirely* obsolete */ 1249 dma_pte_clear_level(domain, level - 1, 1250 phys_to_virt(dma_pte_addr(pte)), 1251 level_pfn, start_pfn, last_pfn, 1252 freelist); 1253 } 1254 next: 1255 pfn = level_pfn + level_size(level); 1256 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1257 1258 if (first_pte) 1259 domain_flush_cache(domain, first_pte, 1260 (void *)++last_pte - (void *)first_pte); 1261 } 1262 1263 /* We can't just free the pages because the IOMMU may still be walking 1264 the page tables, and may have cached the intermediate levels. The 1265 pages can only be freed after the IOTLB flush has been done. 
*/ 1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, 1267 unsigned long last_pfn, struct list_head *freelist) 1268 { 1269 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1270 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1271 BUG_ON(start_pfn > last_pfn); 1272 1273 /* we don't need lock here; nobody else touches the iova range */ 1274 dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1275 domain->pgd, 0, start_pfn, last_pfn, freelist); 1276 1277 /* free pgd */ 1278 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1279 struct page *pgd_page = virt_to_page(domain->pgd); 1280 list_add_tail(&pgd_page->lru, freelist); 1281 domain->pgd = NULL; 1282 } 1283 } 1284 1285 /* iommu handling */ 1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1287 { 1288 struct root_entry *root; 1289 unsigned long flags; 1290 1291 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1292 if (!root) { 1293 pr_err("Allocating root entry for %s failed\n", 1294 iommu->name); 1295 return -ENOMEM; 1296 } 1297 1298 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1299 1300 spin_lock_irqsave(&iommu->lock, flags); 1301 iommu->root_entry = root; 1302 spin_unlock_irqrestore(&iommu->lock, flags); 1303 1304 return 0; 1305 } 1306 1307 static void iommu_set_root_entry(struct intel_iommu *iommu) 1308 { 1309 u64 addr; 1310 u32 sts; 1311 unsigned long flag; 1312 1313 addr = virt_to_phys(iommu->root_entry); 1314 if (sm_supported(iommu)) 1315 addr |= DMA_RTADDR_SMT; 1316 1317 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1318 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1319 1320 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1321 1322 /* Make sure hardware complete it */ 1323 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1324 readl, (sts & DMA_GSTS_RTPS), sts); 1325 1326 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1327 1328 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1329 if (sm_supported(iommu)) 1330 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1331 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1332 } 1333 1334 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1335 { 1336 u32 val; 1337 unsigned long flag; 1338 1339 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1340 return; 1341 1342 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1343 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1344 1345 /* Make sure hardware complete it */ 1346 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1347 readl, (!(val & DMA_GSTS_WBFS)), val); 1348 1349 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1350 } 1351 1352 /* return value determine if we need a write buffer flush */ 1353 static void __iommu_flush_context(struct intel_iommu *iommu, 1354 u16 did, u16 source_id, u8 function_mask, 1355 u64 type) 1356 { 1357 u64 val = 0; 1358 unsigned long flag; 1359 1360 switch (type) { 1361 case DMA_CCMD_GLOBAL_INVL: 1362 val = DMA_CCMD_GLOBAL_INVL; 1363 break; 1364 case DMA_CCMD_DOMAIN_INVL: 1365 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1366 break; 1367 case DMA_CCMD_DEVICE_INVL: 1368 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1369 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1370 break; 1371 default: 1372 BUG(); 1373 } 1374 val |= DMA_CCMD_ICC; 1375 1376 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1377 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1378 1379 /* Make sure hardware complete it */ 1380 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1381 dmar_readq, (!(val & 
DMA_CCMD_ICC)), val); 1382 1383 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1384 } 1385 1386 /* return value determine if we need a write buffer flush */ 1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1388 u64 addr, unsigned int size_order, u64 type) 1389 { 1390 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1391 u64 val = 0, val_iva = 0; 1392 unsigned long flag; 1393 1394 switch (type) { 1395 case DMA_TLB_GLOBAL_FLUSH: 1396 /* global flush doesn't need set IVA_REG */ 1397 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1398 break; 1399 case DMA_TLB_DSI_FLUSH: 1400 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1401 break; 1402 case DMA_TLB_PSI_FLUSH: 1403 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1404 /* IH bit is passed in as part of address */ 1405 val_iva = size_order | addr; 1406 break; 1407 default: 1408 BUG(); 1409 } 1410 /* Note: set drain read/write */ 1411 #if 0 1412 /* 1413 * This is probably to be super secure.. Looks like we can 1414 * ignore it without any impact. 1415 */ 1416 if (cap_read_drain(iommu->cap)) 1417 val |= DMA_TLB_READ_DRAIN; 1418 #endif 1419 if (cap_write_drain(iommu->cap)) 1420 val |= DMA_TLB_WRITE_DRAIN; 1421 1422 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1423 /* Note: Only uses first TLB reg currently */ 1424 if (val_iva) 1425 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1426 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1427 1428 /* Make sure hardware complete it */ 1429 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1430 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1431 1432 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1433 1434 /* check IOTLB invalidation granularity */ 1435 if (DMA_TLB_IAIG(val) == 0) 1436 pr_err("Flush IOTLB failed\n"); 1437 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1438 pr_debug("TLB flush request %Lx, actual %Lx\n", 1439 (unsigned long long)DMA_TLB_IIRG(type), 1440 (unsigned long long)DMA_TLB_IAIG(val)); 1441 } 1442 1443 static struct device_domain_info * 1444 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1445 u8 bus, u8 devfn) 1446 { 1447 struct device_domain_info *info; 1448 1449 assert_spin_locked(&device_domain_lock); 1450 1451 if (!iommu->qi) 1452 return NULL; 1453 1454 list_for_each_entry(info, &domain->devices, link) 1455 if (info->iommu == iommu && info->bus == bus && 1456 info->devfn == devfn) { 1457 if (info->ats_supported && info->dev) 1458 return info; 1459 break; 1460 } 1461 1462 return NULL; 1463 } 1464 1465 static void domain_update_iotlb(struct dmar_domain *domain) 1466 { 1467 struct device_domain_info *info; 1468 bool has_iotlb_device = false; 1469 1470 assert_spin_locked(&device_domain_lock); 1471 1472 list_for_each_entry(info, &domain->devices, link) 1473 if (info->ats_enabled) { 1474 has_iotlb_device = true; 1475 break; 1476 } 1477 1478 domain->has_iotlb_device = has_iotlb_device; 1479 } 1480 1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1482 { 1483 struct pci_dev *pdev; 1484 1485 assert_spin_locked(&device_domain_lock); 1486 1487 if (!info || !dev_is_pci(info->dev)) 1488 return; 1489 1490 pdev = to_pci_dev(info->dev); 1491 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1492 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1493 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1494 * reserved, which should be set to 0. 
1495 */ 1496 if (!ecap_dit(info->iommu->ecap)) 1497 info->pfsid = 0; 1498 else { 1499 struct pci_dev *pf_pdev; 1500 1501 /* pdev will be returned if device is not a vf */ 1502 pf_pdev = pci_physfn(pdev); 1503 info->pfsid = pci_dev_id(pf_pdev); 1504 } 1505 1506 #ifdef CONFIG_INTEL_IOMMU_SVM 1507 /* The PCIe spec, in its wisdom, declares that the behaviour of 1508 the device if you enable PASID support after ATS support is 1509 undefined. So always enable PASID support on devices which 1510 have it, even if we can't yet know if we're ever going to 1511 use it. */ 1512 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1513 info->pasid_enabled = 1; 1514 1515 if (info->pri_supported && 1516 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1517 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1518 info->pri_enabled = 1; 1519 #endif 1520 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1521 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1522 info->ats_enabled = 1; 1523 domain_update_iotlb(info->domain); 1524 info->ats_qdep = pci_ats_queue_depth(pdev); 1525 } 1526 } 1527 1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1529 { 1530 struct pci_dev *pdev; 1531 1532 assert_spin_locked(&device_domain_lock); 1533 1534 if (!dev_is_pci(info->dev)) 1535 return; 1536 1537 pdev = to_pci_dev(info->dev); 1538 1539 if (info->ats_enabled) { 1540 pci_disable_ats(pdev); 1541 info->ats_enabled = 0; 1542 domain_update_iotlb(info->domain); 1543 } 1544 #ifdef CONFIG_INTEL_IOMMU_SVM 1545 if (info->pri_enabled) { 1546 pci_disable_pri(pdev); 1547 info->pri_enabled = 0; 1548 } 1549 if (info->pasid_enabled) { 1550 pci_disable_pasid(pdev); 1551 info->pasid_enabled = 0; 1552 } 1553 #endif 1554 } 1555 1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1557 u64 addr, unsigned int mask) 1558 { 1559 u16 sid, qdep; 1560 1561 if (!info || !info->ats_enabled) 1562 return; 1563 1564 sid = info->bus << 8 | info->devfn; 1565 qdep = info->ats_qdep; 1566 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1567 qdep, addr, mask); 1568 } 1569 1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1571 u64 addr, unsigned mask) 1572 { 1573 unsigned long flags; 1574 struct device_domain_info *info; 1575 1576 if (!domain->has_iotlb_device) 1577 return; 1578 1579 spin_lock_irqsave(&device_domain_lock, flags); 1580 list_for_each_entry(info, &domain->devices, link) 1581 __iommu_flush_dev_iotlb(info, addr, mask); 1582 1583 spin_unlock_irqrestore(&device_domain_lock, flags); 1584 } 1585 1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1587 struct dmar_domain *domain, 1588 unsigned long pfn, unsigned int pages, 1589 int ih, int map) 1590 { 1591 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1592 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1593 u16 did = domain->iommu_did[iommu->seq_id]; 1594 1595 BUG_ON(pages == 0); 1596 1597 if (ih) 1598 ih = 1 << 6; 1599 1600 if (domain_use_first_level(domain)) { 1601 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); 1602 } else { 1603 /* 1604 * Fallback to domain selective flush if no PSI support or 1605 * the size is too big. PSI requires page size to be 2 ^ x, 1606 * and the base address is naturally aligned to the size. 
1607 */ 1608 if (!cap_pgsel_inv(iommu->cap) || 1609 mask > cap_max_amask_val(iommu->cap)) 1610 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1611 DMA_TLB_DSI_FLUSH); 1612 else 1613 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1614 DMA_TLB_PSI_FLUSH); 1615 } 1616 1617 /* 1618 * In caching mode, changes of pages from non-present to present require 1619 * flush. However, device IOTLB doesn't need to be flushed in this case. 1620 */ 1621 if (!cap_caching_mode(iommu->cap) || !map) 1622 iommu_flush_dev_iotlb(domain, addr, mask); 1623 } 1624 1625 /* Notification for newly created mappings */ 1626 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1627 struct dmar_domain *domain, 1628 unsigned long pfn, unsigned int pages) 1629 { 1630 /* 1631 * It's a non-present to present mapping. Only flush if caching mode 1632 * and second level. 1633 */ 1634 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1635 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1636 else 1637 iommu_flush_write_buffer(iommu); 1638 } 1639 1640 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1641 { 1642 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1643 int idx; 1644 1645 for_each_domain_iommu(idx, dmar_domain) { 1646 struct intel_iommu *iommu = g_iommus[idx]; 1647 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1648 1649 if (domain_use_first_level(dmar_domain)) 1650 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); 1651 else 1652 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1653 DMA_TLB_DSI_FLUSH); 1654 1655 if (!cap_caching_mode(iommu->cap)) 1656 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH); 1657 } 1658 } 1659 1660 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1661 { 1662 u32 pmen; 1663 unsigned long flags; 1664 1665 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1666 return; 1667 1668 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1669 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1670 pmen &= ~DMA_PMEN_EPM; 1671 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1672 1673 /* wait for the protected region status bit to clear */ 1674 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1675 readl, !(pmen & DMA_PMEN_PRS), pmen); 1676 1677 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1678 } 1679 1680 static void iommu_enable_translation(struct intel_iommu *iommu) 1681 { 1682 u32 sts; 1683 unsigned long flags; 1684 1685 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1686 iommu->gcmd |= DMA_GCMD_TE; 1687 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1688 1689 /* Make sure hardware complete it */ 1690 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1691 readl, (sts & DMA_GSTS_TES), sts); 1692 1693 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1694 } 1695 1696 static void iommu_disable_translation(struct intel_iommu *iommu) 1697 { 1698 u32 sts; 1699 unsigned long flag; 1700 1701 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1702 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1703 return; 1704 1705 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1706 iommu->gcmd &= ~DMA_GCMD_TE; 1707 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1708 1709 /* Make sure hardware complete it */ 1710 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1711 readl, (!(sts & DMA_GSTS_TES)), sts); 1712 1713 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1714 } 1715 1716 static int iommu_init_domains(struct intel_iommu *iommu) 1717 { 1718 u32 ndomains; 1719 1720 ndomains = cap_ndoms(iommu->cap); 1721 
pr_debug("%s: Number of Domains supported <%d>\n", 1722 iommu->name, ndomains); 1723 1724 spin_lock_init(&iommu->lock); 1725 1726 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1727 if (!iommu->domain_ids) 1728 return -ENOMEM; 1729 1730 /* 1731 * If Caching mode is set, then invalid translations are tagged 1732 * with domain-id 0, hence we need to pre-allocate it. We also 1733 * use domain-id 0 as a marker for non-allocated domain-id, so 1734 * make sure it is not used for a real domain. 1735 */ 1736 set_bit(0, iommu->domain_ids); 1737 1738 /* 1739 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1740 * entry for first-level or pass-through translation modes should 1741 * be programmed with a domain id different from those used for 1742 * second-level or nested translation. We reserve a domain id for 1743 * this purpose. 1744 */ 1745 if (sm_supported(iommu)) 1746 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1747 1748 return 0; 1749 } 1750 1751 static void disable_dmar_iommu(struct intel_iommu *iommu) 1752 { 1753 struct device_domain_info *info, *tmp; 1754 unsigned long flags; 1755 1756 if (!iommu->domain_ids) 1757 return; 1758 1759 spin_lock_irqsave(&device_domain_lock, flags); 1760 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1761 if (info->iommu != iommu) 1762 continue; 1763 1764 if (!info->dev || !info->domain) 1765 continue; 1766 1767 __dmar_remove_one_dev_info(info); 1768 } 1769 spin_unlock_irqrestore(&device_domain_lock, flags); 1770 1771 if (iommu->gcmd & DMA_GCMD_TE) 1772 iommu_disable_translation(iommu); 1773 } 1774 1775 static void free_dmar_iommu(struct intel_iommu *iommu) 1776 { 1777 if (iommu->domain_ids) { 1778 bitmap_free(iommu->domain_ids); 1779 iommu->domain_ids = NULL; 1780 } 1781 1782 g_iommus[iommu->seq_id] = NULL; 1783 1784 /* free context mapping */ 1785 free_context_table(iommu); 1786 1787 #ifdef CONFIG_INTEL_IOMMU_SVM 1788 if (pasid_supported(iommu)) { 1789 if (ecap_prs(iommu->ecap)) 1790 intel_svm_finish_prq(iommu); 1791 } 1792 if (vccap_pasid(iommu->vccap)) 1793 ioasid_unregister_allocator(&iommu->pasid_allocator); 1794 1795 #endif 1796 } 1797 1798 /* 1799 * Check and return whether first level is used by default for 1800 * DMA translation. 
1801 */ 1802 static bool first_level_by_default(unsigned int type) 1803 { 1804 /* Only SL is available in legacy mode */ 1805 if (!scalable_mode_support()) 1806 return false; 1807 1808 /* Only level (either FL or SL) is available, just use it */ 1809 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1810 return intel_cap_flts_sanity(); 1811 1812 /* Both levels are available, decide it based on domain type */ 1813 return type != IOMMU_DOMAIN_UNMANAGED; 1814 } 1815 1816 static struct dmar_domain *alloc_domain(unsigned int type) 1817 { 1818 struct dmar_domain *domain; 1819 1820 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1821 if (!domain) 1822 return NULL; 1823 1824 domain->nid = NUMA_NO_NODE; 1825 if (first_level_by_default(type)) 1826 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1827 domain->has_iotlb_device = false; 1828 INIT_LIST_HEAD(&domain->devices); 1829 1830 return domain; 1831 } 1832 1833 /* Must be called with iommu->lock */ 1834 static int domain_attach_iommu(struct dmar_domain *domain, 1835 struct intel_iommu *iommu) 1836 { 1837 unsigned long ndomains; 1838 int num; 1839 1840 assert_spin_locked(&device_domain_lock); 1841 assert_spin_locked(&iommu->lock); 1842 1843 domain->iommu_refcnt[iommu->seq_id] += 1; 1844 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1845 ndomains = cap_ndoms(iommu->cap); 1846 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1847 1848 if (num >= ndomains) { 1849 pr_err("%s: No free domain ids\n", iommu->name); 1850 domain->iommu_refcnt[iommu->seq_id] -= 1; 1851 return -ENOSPC; 1852 } 1853 1854 set_bit(num, iommu->domain_ids); 1855 domain->iommu_did[iommu->seq_id] = num; 1856 domain->nid = iommu->node; 1857 domain_update_iommu_cap(domain); 1858 } 1859 1860 return 0; 1861 } 1862 1863 static void domain_detach_iommu(struct dmar_domain *domain, 1864 struct intel_iommu *iommu) 1865 { 1866 int num; 1867 1868 assert_spin_locked(&device_domain_lock); 1869 assert_spin_locked(&iommu->lock); 1870 1871 domain->iommu_refcnt[iommu->seq_id] -= 1; 1872 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1873 num = domain->iommu_did[iommu->seq_id]; 1874 clear_bit(num, iommu->domain_ids); 1875 domain_update_iommu_cap(domain); 1876 domain->iommu_did[iommu->seq_id] = 0; 1877 } 1878 } 1879 1880 static inline int guestwidth_to_adjustwidth(int gaw) 1881 { 1882 int agaw; 1883 int r = (gaw - 12) % 9; 1884 1885 if (r == 0) 1886 agaw = gaw; 1887 else 1888 agaw = gaw + 9 - r; 1889 if (agaw > 64) 1890 agaw = 64; 1891 return agaw; 1892 } 1893 1894 static void domain_exit(struct dmar_domain *domain) 1895 { 1896 1897 /* Remove associated devices and clear attached or cached domains */ 1898 domain_remove_dev_info(domain); 1899 1900 if (domain->pgd) { 1901 LIST_HEAD(freelist); 1902 1903 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 1904 put_pages_list(&freelist); 1905 } 1906 1907 kfree(domain); 1908 } 1909 1910 /* 1911 * Get the PASID directory size for scalable mode context entry. 1912 * Value of X in the PDTS field of a scalable mode context entry 1913 * indicates PASID directory with 2^(X + 7) entries. 1914 */ 1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1916 { 1917 unsigned long pds, max_pde; 1918 1919 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1920 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 1921 if (pds < 7) 1922 return 0; 1923 1924 return pds - 7; 1925 } 1926 1927 /* 1928 * Set the RID_PASID field of a scalable mode context entry. 
The 1929 * IOMMU hardware will use the PASID value set in this field for 1930 * DMA translations of DMA requests without PASID. 1931 */ 1932 static inline void 1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 1934 { 1935 context->hi |= pasid & ((1 << 20) - 1); 1936 } 1937 1938 /* 1939 * Set the DTE(Device-TLB Enable) field of a scalable mode context 1940 * entry. 1941 */ 1942 static inline void context_set_sm_dte(struct context_entry *context) 1943 { 1944 context->lo |= (1 << 2); 1945 } 1946 1947 /* 1948 * Set the PRE(Page Request Enable) field of a scalable mode context 1949 * entry. 1950 */ 1951 static inline void context_set_sm_pre(struct context_entry *context) 1952 { 1953 context->lo |= (1 << 4); 1954 } 1955 1956 /* Convert value to context PASID directory size field coding. */ 1957 #define context_pdts(pds) (((pds) & 0x7) << 9) 1958 1959 static int domain_context_mapping_one(struct dmar_domain *domain, 1960 struct intel_iommu *iommu, 1961 struct pasid_table *table, 1962 u8 bus, u8 devfn) 1963 { 1964 u16 did = domain->iommu_did[iommu->seq_id]; 1965 int translation = CONTEXT_TT_MULTI_LEVEL; 1966 struct device_domain_info *info = NULL; 1967 struct context_entry *context; 1968 unsigned long flags; 1969 int ret; 1970 1971 WARN_ON(did == 0); 1972 1973 if (hw_pass_through && domain_type_is_si(domain)) 1974 translation = CONTEXT_TT_PASS_THROUGH; 1975 1976 pr_debug("Set context mapping for %02x:%02x.%d\n", 1977 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1978 1979 BUG_ON(!domain->pgd); 1980 1981 spin_lock_irqsave(&device_domain_lock, flags); 1982 spin_lock(&iommu->lock); 1983 1984 ret = -ENOMEM; 1985 context = iommu_context_addr(iommu, bus, devfn, 1); 1986 if (!context) 1987 goto out_unlock; 1988 1989 ret = 0; 1990 if (context_present(context)) 1991 goto out_unlock; 1992 1993 /* 1994 * For kdump cases, old valid entries may be cached due to the 1995 * in-flight DMA and copied pgtable, but there is no unmapping 1996 * behaviour for them, thus we need an explicit cache flush for 1997 * the newly-mapped device. For kdump, at this point, the device 1998 * is supposed to finish reset at its driver probe stage, so no 1999 * in-flight DMA will exist, and we don't need to worry anymore 2000 * hereafter. 
2001 */ 2002 if (context_copied(context)) { 2003 u16 did_old = context_domain_id(context); 2004 2005 if (did_old < cap_ndoms(iommu->cap)) { 2006 iommu->flush.flush_context(iommu, did_old, 2007 (((u16)bus) << 8) | devfn, 2008 DMA_CCMD_MASK_NOBIT, 2009 DMA_CCMD_DEVICE_INVL); 2010 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2011 DMA_TLB_DSI_FLUSH); 2012 } 2013 } 2014 2015 context_clear_entry(context); 2016 2017 if (sm_supported(iommu)) { 2018 unsigned long pds; 2019 2020 WARN_ON(!table); 2021 2022 /* Setup the PASID DIR pointer: */ 2023 pds = context_get_sm_pds(table); 2024 context->lo = (u64)virt_to_phys(table->table) | 2025 context_pdts(pds); 2026 2027 /* Setup the RID_PASID field: */ 2028 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2029 2030 /* 2031 * Setup the Device-TLB enable bit and Page request 2032 * Enable bit: 2033 */ 2034 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2035 if (info && info->ats_supported) 2036 context_set_sm_dte(context); 2037 if (info && info->pri_supported) 2038 context_set_sm_pre(context); 2039 } else { 2040 struct dma_pte *pgd = domain->pgd; 2041 int agaw; 2042 2043 context_set_domain_id(context, did); 2044 2045 if (translation != CONTEXT_TT_PASS_THROUGH) { 2046 /* 2047 * Skip top levels of page tables for iommu which has 2048 * less agaw than default. Unnecessary for PT mode. 2049 */ 2050 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2051 ret = -ENOMEM; 2052 pgd = phys_to_virt(dma_pte_addr(pgd)); 2053 if (!dma_pte_present(pgd)) 2054 goto out_unlock; 2055 } 2056 2057 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2058 if (info && info->ats_supported) 2059 translation = CONTEXT_TT_DEV_IOTLB; 2060 else 2061 translation = CONTEXT_TT_MULTI_LEVEL; 2062 2063 context_set_address_root(context, virt_to_phys(pgd)); 2064 context_set_address_width(context, agaw); 2065 } else { 2066 /* 2067 * In pass through mode, AW must be programmed to 2068 * indicate the largest AGAW value supported by 2069 * hardware. And ASR is ignored by hardware. 2070 */ 2071 context_set_address_width(context, iommu->msagaw); 2072 } 2073 2074 context_set_translation_type(context, translation); 2075 } 2076 2077 context_set_fault_enable(context); 2078 context_set_present(context); 2079 if (!ecap_coherent(iommu->ecap)) 2080 clflush_cache_range(context, sizeof(*context)); 2081 2082 /* 2083 * It's a non-present to present mapping. If hardware doesn't cache 2084 * non-present entry we only need to flush the write-buffer. 
If the 2085 * _does_ cache non-present entries, then it does so in the special 2086 * domain #0, which we have to flush: 2087 */ 2088 if (cap_caching_mode(iommu->cap)) { 2089 iommu->flush.flush_context(iommu, 0, 2090 (((u16)bus) << 8) | devfn, 2091 DMA_CCMD_MASK_NOBIT, 2092 DMA_CCMD_DEVICE_INVL); 2093 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2094 } else { 2095 iommu_flush_write_buffer(iommu); 2096 } 2097 iommu_enable_dev_iotlb(info); 2098 2099 ret = 0; 2100 2101 out_unlock: 2102 spin_unlock(&iommu->lock); 2103 spin_unlock_irqrestore(&device_domain_lock, flags); 2104 2105 return ret; 2106 } 2107 2108 struct domain_context_mapping_data { 2109 struct dmar_domain *domain; 2110 struct intel_iommu *iommu; 2111 struct pasid_table *table; 2112 }; 2113 2114 static int domain_context_mapping_cb(struct pci_dev *pdev, 2115 u16 alias, void *opaque) 2116 { 2117 struct domain_context_mapping_data *data = opaque; 2118 2119 return domain_context_mapping_one(data->domain, data->iommu, 2120 data->table, PCI_BUS_NUM(alias), 2121 alias & 0xff); 2122 } 2123 2124 static int 2125 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2126 { 2127 struct domain_context_mapping_data data; 2128 struct pasid_table *table; 2129 struct intel_iommu *iommu; 2130 u8 bus, devfn; 2131 2132 iommu = device_to_iommu(dev, &bus, &devfn); 2133 if (!iommu) 2134 return -ENODEV; 2135 2136 table = intel_pasid_get_table(dev); 2137 2138 if (!dev_is_pci(dev)) 2139 return domain_context_mapping_one(domain, iommu, table, 2140 bus, devfn); 2141 2142 data.domain = domain; 2143 data.iommu = iommu; 2144 data.table = table; 2145 2146 return pci_for_each_dma_alias(to_pci_dev(dev), 2147 &domain_context_mapping_cb, &data); 2148 } 2149 2150 static int domain_context_mapped_cb(struct pci_dev *pdev, 2151 u16 alias, void *opaque) 2152 { 2153 struct intel_iommu *iommu = opaque; 2154 2155 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2156 } 2157 2158 static int domain_context_mapped(struct device *dev) 2159 { 2160 struct intel_iommu *iommu; 2161 u8 bus, devfn; 2162 2163 iommu = device_to_iommu(dev, &bus, &devfn); 2164 if (!iommu) 2165 return -ENODEV; 2166 2167 if (!dev_is_pci(dev)) 2168 return device_context_mapped(iommu, bus, devfn); 2169 2170 return !pci_for_each_dma_alias(to_pci_dev(dev), 2171 domain_context_mapped_cb, iommu); 2172 } 2173 2174 /* Returns a number of VTD pages, but aligned to MM page size */ 2175 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2176 size_t size) 2177 { 2178 host_addr &= ~PAGE_MASK; 2179 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2180 } 2181 2182 /* Return largest possible superpage level for a given mapping */ 2183 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2184 unsigned long iov_pfn, 2185 unsigned long phy_pfn, 2186 unsigned long pages) 2187 { 2188 int support, level = 1; 2189 unsigned long pfnmerge; 2190 2191 support = domain->iommu_superpage; 2192 2193 /* To use a large page, the virtual *and* physical addresses 2194 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2195 of them will mean we have to use smaller pages. So just 2196 merge them and check both at once. 
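	   For example, with a 9-bit stride a 2MiB superpage needs the low
	   9 bits of both PFNs clear (512 contiguous 4KiB pages) and a 1GiB
	   superpage the low 18 bits, so testing the OR of the two PFNs
	   against the stride mask, one level at a time, covers both
	   addresses in a single check.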
*/ 2197 pfnmerge = iov_pfn | phy_pfn; 2198 2199 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2200 pages >>= VTD_STRIDE_SHIFT; 2201 if (!pages) 2202 break; 2203 pfnmerge >>= VTD_STRIDE_SHIFT; 2204 level++; 2205 support--; 2206 } 2207 return level; 2208 } 2209 2210 /* 2211 * Ensure that old small page tables are removed to make room for superpage(s). 2212 * We're going to add new large pages, so make sure we don't remove their parent 2213 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2214 */ 2215 static void switch_to_super_page(struct dmar_domain *domain, 2216 unsigned long start_pfn, 2217 unsigned long end_pfn, int level) 2218 { 2219 unsigned long lvl_pages = lvl_to_nr_pages(level); 2220 struct dma_pte *pte = NULL; 2221 int i; 2222 2223 while (start_pfn <= end_pfn) { 2224 if (!pte) 2225 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2226 2227 if (dma_pte_present(pte)) { 2228 dma_pte_free_pagetable(domain, start_pfn, 2229 start_pfn + lvl_pages - 1, 2230 level + 1); 2231 2232 for_each_domain_iommu(i, domain) 2233 iommu_flush_iotlb_psi(g_iommus[i], domain, 2234 start_pfn, lvl_pages, 2235 0, 0); 2236 } 2237 2238 pte++; 2239 start_pfn += lvl_pages; 2240 if (first_pte_in_page(pte)) 2241 pte = NULL; 2242 } 2243 } 2244 2245 static int 2246 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2247 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2248 { 2249 struct dma_pte *first_pte = NULL, *pte = NULL; 2250 unsigned int largepage_lvl = 0; 2251 unsigned long lvl_pages = 0; 2252 phys_addr_t pteval; 2253 u64 attr; 2254 2255 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2256 2257 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2258 return -EINVAL; 2259 2260 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2261 attr |= DMA_FL_PTE_PRESENT; 2262 if (domain_use_first_level(domain)) { 2263 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2264 if (prot & DMA_PTE_WRITE) 2265 attr |= DMA_FL_PTE_DIRTY; 2266 } 2267 2268 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2269 2270 while (nr_pages > 0) { 2271 uint64_t tmp; 2272 2273 if (!pte) { 2274 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2275 phys_pfn, nr_pages); 2276 2277 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2278 if (!pte) 2279 return -ENOMEM; 2280 first_pte = pte; 2281 2282 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2283 2284 /* It is large page*/ 2285 if (largepage_lvl > 1) { 2286 unsigned long end_pfn; 2287 unsigned long pages_to_remove; 2288 2289 pteval |= DMA_PTE_LARGE_PAGE; 2290 pages_to_remove = min_t(unsigned long, nr_pages, 2291 nr_pte_to_next_page(pte) * lvl_pages); 2292 end_pfn = iov_pfn + pages_to_remove - 1; 2293 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2294 } else { 2295 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2296 } 2297 2298 } 2299 /* We don't need lock here, nobody else 2300 * touches the iova range 2301 */ 2302 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2303 if (tmp) { 2304 static int dumps = 5; 2305 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2306 iov_pfn, tmp, (unsigned long long)pteval); 2307 if (dumps) { 2308 dumps--; 2309 debug_dma_dump_mappings(NULL); 2310 } 2311 WARN_ON(1); 2312 } 2313 2314 nr_pages -= lvl_pages; 2315 iov_pfn += lvl_pages; 2316 phys_pfn += lvl_pages; 2317 pteval += lvl_pages * VTD_PAGE_SIZE; 2318 2319 /* If the next PTE would be the first in a new page, then we 2320 * need to flush the cache on the entries we've just 
written. 2321 * And then we'll need to recalculate 'pte', so clear it and 2322 * let it get set again in the if (!pte) block above. 2323 * 2324 * If we're done (!nr_pages) we need to flush the cache too. 2325 * 2326 * Also if we've been setting superpages, we may need to 2327 * recalculate 'pte' and switch back to smaller pages for the 2328 * end of the mapping, if the trailing size is not enough to 2329 * use another superpage (i.e. nr_pages < lvl_pages). 2330 */ 2331 pte++; 2332 if (!nr_pages || first_pte_in_page(pte) || 2333 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2334 domain_flush_cache(domain, first_pte, 2335 (void *)pte - (void *)first_pte); 2336 pte = NULL; 2337 } 2338 } 2339 2340 return 0; 2341 } 2342 2343 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2344 { 2345 struct intel_iommu *iommu = info->iommu; 2346 struct context_entry *context; 2347 unsigned long flags; 2348 u16 did_old; 2349 2350 if (!iommu) 2351 return; 2352 2353 spin_lock_irqsave(&iommu->lock, flags); 2354 context = iommu_context_addr(iommu, bus, devfn, 0); 2355 if (!context) { 2356 spin_unlock_irqrestore(&iommu->lock, flags); 2357 return; 2358 } 2359 2360 if (sm_supported(iommu)) { 2361 if (hw_pass_through && domain_type_is_si(info->domain)) 2362 did_old = FLPT_DEFAULT_DID; 2363 else 2364 did_old = info->domain->iommu_did[iommu->seq_id]; 2365 } else { 2366 did_old = context_domain_id(context); 2367 } 2368 2369 context_clear_entry(context); 2370 __iommu_flush_cache(iommu, context, sizeof(*context)); 2371 spin_unlock_irqrestore(&iommu->lock, flags); 2372 iommu->flush.flush_context(iommu, 2373 did_old, 2374 (((u16)bus) << 8) | devfn, 2375 DMA_CCMD_MASK_NOBIT, 2376 DMA_CCMD_DEVICE_INVL); 2377 2378 if (sm_supported(iommu)) 2379 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2380 2381 iommu->flush.flush_iotlb(iommu, 2382 did_old, 2383 0, 2384 0, 2385 DMA_TLB_DSI_FLUSH); 2386 2387 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2388 } 2389 2390 static void domain_remove_dev_info(struct dmar_domain *domain) 2391 { 2392 struct device_domain_info *info, *tmp; 2393 unsigned long flags; 2394 2395 spin_lock_irqsave(&device_domain_lock, flags); 2396 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2397 __dmar_remove_one_dev_info(info); 2398 spin_unlock_irqrestore(&device_domain_lock, flags); 2399 } 2400 2401 static inline struct device_domain_info * 2402 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2403 { 2404 struct device_domain_info *info; 2405 2406 list_for_each_entry(info, &device_domain_list, global) 2407 if (info->segment == segment && info->bus == bus && 2408 info->devfn == devfn) 2409 return info; 2410 2411 return NULL; 2412 } 2413 2414 static int domain_setup_first_level(struct intel_iommu *iommu, 2415 struct dmar_domain *domain, 2416 struct device *dev, 2417 u32 pasid) 2418 { 2419 struct dma_pte *pgd = domain->pgd; 2420 int agaw, level; 2421 int flags = 0; 2422 2423 /* 2424 * Skip top levels of page tables for iommu which has 2425 * less agaw than default. Unnecessary for PT mode. 
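	 * For example, a domain built with 4-level tables (48-bit, agaw 2)
	 * attached to an IOMMU that only supports 3-level walks (39-bit,
	 * agaw 1) steps down one level here and hands the lower table to
	 * hardware as the page-table root.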
2426 */ 2427 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2428 pgd = phys_to_virt(dma_pte_addr(pgd)); 2429 if (!dma_pte_present(pgd)) 2430 return -ENOMEM; 2431 } 2432 2433 level = agaw_to_level(agaw); 2434 if (level != 4 && level != 5) 2435 return -EINVAL; 2436 2437 if (pasid != PASID_RID2PASID) 2438 flags |= PASID_FLAG_SUPERVISOR_MODE; 2439 if (level == 5) 2440 flags |= PASID_FLAG_FL5LP; 2441 2442 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2443 flags |= PASID_FLAG_PAGE_SNOOP; 2444 2445 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2446 domain->iommu_did[iommu->seq_id], 2447 flags); 2448 } 2449 2450 static bool dev_is_real_dma_subdevice(struct device *dev) 2451 { 2452 return dev && dev_is_pci(dev) && 2453 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2454 } 2455 2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2457 int bus, int devfn, 2458 struct device *dev, 2459 struct dmar_domain *domain) 2460 { 2461 struct device_domain_info *info = dev_iommu_priv_get(dev); 2462 unsigned long flags; 2463 int ret; 2464 2465 spin_lock_irqsave(&device_domain_lock, flags); 2466 info->domain = domain; 2467 spin_lock(&iommu->lock); 2468 ret = domain_attach_iommu(domain, iommu); 2469 spin_unlock(&iommu->lock); 2470 if (ret) { 2471 spin_unlock_irqrestore(&device_domain_lock, flags); 2472 return NULL; 2473 } 2474 list_add(&info->link, &domain->devices); 2475 spin_unlock_irqrestore(&device_domain_lock, flags); 2476 2477 /* PASID table is mandatory for a PCI device in scalable mode. */ 2478 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2479 ret = intel_pasid_alloc_table(dev); 2480 if (ret) { 2481 dev_err(dev, "PASID table allocation failed\n"); 2482 dmar_remove_one_dev_info(dev); 2483 return NULL; 2484 } 2485 2486 /* Setup the PASID entry for requests without PASID: */ 2487 spin_lock_irqsave(&iommu->lock, flags); 2488 if (hw_pass_through && domain_type_is_si(domain)) 2489 ret = intel_pasid_setup_pass_through(iommu, domain, 2490 dev, PASID_RID2PASID); 2491 else if (domain_use_first_level(domain)) 2492 ret = domain_setup_first_level(iommu, domain, dev, 2493 PASID_RID2PASID); 2494 else 2495 ret = intel_pasid_setup_second_level(iommu, domain, 2496 dev, PASID_RID2PASID); 2497 spin_unlock_irqrestore(&iommu->lock, flags); 2498 if (ret) { 2499 dev_err(dev, "Setup RID2PASID failed\n"); 2500 dmar_remove_one_dev_info(dev); 2501 return NULL; 2502 } 2503 } 2504 2505 if (dev && domain_context_mapping(domain, dev)) { 2506 dev_err(dev, "Domain context map failed\n"); 2507 dmar_remove_one_dev_info(dev); 2508 return NULL; 2509 } 2510 2511 return domain; 2512 } 2513 2514 static int iommu_domain_identity_map(struct dmar_domain *domain, 2515 unsigned long first_vpfn, 2516 unsigned long last_vpfn) 2517 { 2518 /* 2519 * RMRR range might have overlap with physical memory range, 2520 * clear it first 2521 */ 2522 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2523 2524 return __domain_mapping(domain, first_vpfn, 2525 first_vpfn, last_vpfn - first_vpfn + 1, 2526 DMA_PTE_READ|DMA_PTE_WRITE); 2527 } 2528 2529 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2530 2531 static int __init si_domain_init(int hw) 2532 { 2533 struct dmar_rmrr_unit *rmrr; 2534 struct device *dev; 2535 int i, nid, ret; 2536 2537 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2538 if (!si_domain) 2539 return -EFAULT; 2540 2541 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2542 domain_exit(si_domain); 2543 return -EFAULT; 
2544 } 2545 2546 if (hw) 2547 return 0; 2548 2549 for_each_online_node(nid) { 2550 unsigned long start_pfn, end_pfn; 2551 int i; 2552 2553 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2554 ret = iommu_domain_identity_map(si_domain, 2555 mm_to_dma_pfn(start_pfn), 2556 mm_to_dma_pfn(end_pfn)); 2557 if (ret) 2558 return ret; 2559 } 2560 } 2561 2562 /* 2563 * Identity map the RMRRs so that devices with RMRRs could also use 2564 * the si_domain. 2565 */ 2566 for_each_rmrr_units(rmrr) { 2567 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2568 i, dev) { 2569 unsigned long long start = rmrr->base_address; 2570 unsigned long long end = rmrr->end_address; 2571 2572 if (WARN_ON(end < start || 2573 end >> agaw_to_width(si_domain->agaw))) 2574 continue; 2575 2576 ret = iommu_domain_identity_map(si_domain, 2577 mm_to_dma_pfn(start >> PAGE_SHIFT), 2578 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2579 if (ret) 2580 return ret; 2581 } 2582 } 2583 2584 return 0; 2585 } 2586 2587 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2588 { 2589 struct dmar_domain *ndomain; 2590 struct intel_iommu *iommu; 2591 u8 bus, devfn; 2592 2593 iommu = device_to_iommu(dev, &bus, &devfn); 2594 if (!iommu) 2595 return -ENODEV; 2596 2597 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2598 if (ndomain != domain) 2599 return -EBUSY; 2600 2601 return 0; 2602 } 2603 2604 static bool device_has_rmrr(struct device *dev) 2605 { 2606 struct dmar_rmrr_unit *rmrr; 2607 struct device *tmp; 2608 int i; 2609 2610 rcu_read_lock(); 2611 for_each_rmrr_units(rmrr) { 2612 /* 2613 * Return TRUE if this RMRR contains the device that 2614 * is passed in. 2615 */ 2616 for_each_active_dev_scope(rmrr->devices, 2617 rmrr->devices_cnt, i, tmp) 2618 if (tmp == dev || 2619 is_downstream_to_pci_bridge(dev, tmp)) { 2620 rcu_read_unlock(); 2621 return true; 2622 } 2623 } 2624 rcu_read_unlock(); 2625 return false; 2626 } 2627 2628 /** 2629 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2630 * is relaxable (ie. is allowed to be not enforced under some conditions) 2631 * @dev: device handle 2632 * 2633 * We assume that PCI USB devices with RMRRs have them largely 2634 * for historical reasons and that the RMRR space is not actively used post 2635 * boot. This exclusion may change if vendors begin to abuse it. 2636 * 2637 * The same exception is made for graphics devices, with the requirement that 2638 * any use of the RMRR regions will be torn down before assigning the device 2639 * to a guest. 2640 * 2641 * Return: true if the RMRR is relaxable, false otherwise 2642 */ 2643 static bool device_rmrr_is_relaxable(struct device *dev) 2644 { 2645 struct pci_dev *pdev; 2646 2647 if (!dev_is_pci(dev)) 2648 return false; 2649 2650 pdev = to_pci_dev(dev); 2651 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2652 return true; 2653 else 2654 return false; 2655 } 2656 2657 /* 2658 * There are a couple cases where we need to restrict the functionality of 2659 * devices associated with RMRRs. The first is when evaluating a device for 2660 * identity mapping because problems exist when devices are moved in and out 2661 * of domains and their respective RMRR information is lost. This means that 2662 * a device with associated RMRRs will never be in a "passthrough" domain. 2663 * The second is use of the device through the IOMMU API. This interface 2664 * expects to have full control of the IOVA space for the device. 
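 * An RMRR, by contrast, is a range that firmware expects to stay
 * identity-mapped for the device no matter which domain it is in.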
We cannot 2665 * satisfy both the requirement that RMRR access is maintained and have an 2666 * unencumbered IOVA space. We also have no ability to quiesce the device's 2667 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2668 * We therefore prevent devices associated with an RMRR from participating in 2669 * the IOMMU API, which eliminates them from device assignment. 2670 * 2671 * In both cases, devices which have relaxable RMRRs are not concerned by this 2672 * restriction. See device_rmrr_is_relaxable comment. 2673 */ 2674 static bool device_is_rmrr_locked(struct device *dev) 2675 { 2676 if (!device_has_rmrr(dev)) 2677 return false; 2678 2679 if (device_rmrr_is_relaxable(dev)) 2680 return false; 2681 2682 return true; 2683 } 2684 2685 /* 2686 * Return the required default domain type for a specific device. 2687 * 2688 * @dev: the device in query 2689 * @startup: true if this is during early boot 2690 * 2691 * Returns: 2692 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2693 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2694 * - 0: both identity and dynamic domains work for this device 2695 */ 2696 static int device_def_domain_type(struct device *dev) 2697 { 2698 if (dev_is_pci(dev)) { 2699 struct pci_dev *pdev = to_pci_dev(dev); 2700 2701 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2702 return IOMMU_DOMAIN_IDENTITY; 2703 2704 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2705 return IOMMU_DOMAIN_IDENTITY; 2706 } 2707 2708 return 0; 2709 } 2710 2711 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2712 { 2713 /* 2714 * Start from the sane iommu hardware state. 2715 * If the queued invalidation is already initialized by us 2716 * (for example, while enabling interrupt-remapping) then 2717 * we got the things already rolling from a sane state. 2718 */ 2719 if (!iommu->qi) { 2720 /* 2721 * Clear any previous faults. 2722 */ 2723 dmar_fault(-1, iommu); 2724 /* 2725 * Disable queued invalidation if supported and already enabled 2726 * before OS handover. 2727 */ 2728 dmar_disable_qi(iommu); 2729 } 2730 2731 if (dmar_enable_qi(iommu)) { 2732 /* 2733 * Queued Invalidate not enabled, use Register Based Invalidate 2734 */ 2735 iommu->flush.flush_context = __iommu_flush_context; 2736 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2737 pr_info("%s: Using Register based invalidation\n", 2738 iommu->name); 2739 } else { 2740 iommu->flush.flush_context = qi_flush_context; 2741 iommu->flush.flush_iotlb = qi_flush_iotlb; 2742 pr_info("%s: Using Queued invalidation\n", iommu->name); 2743 } 2744 } 2745 2746 static int copy_context_table(struct intel_iommu *iommu, 2747 struct root_entry *old_re, 2748 struct context_entry **tbl, 2749 int bus, bool ext) 2750 { 2751 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2752 struct context_entry *new_ce = NULL, ce; 2753 struct context_entry *old_ce = NULL; 2754 struct root_entry re; 2755 phys_addr_t old_ce_phys; 2756 2757 tbl_idx = ext ? bus * 2 : bus; 2758 memcpy(&re, old_re, sizeof(re)); 2759 2760 for (devfn = 0; devfn < 256; devfn++) { 2761 /* First calculate the correct index */ 2762 idx = (ext ? 
devfn * 2 : devfn) % 256; 2763 2764 if (idx == 0) { 2765 /* First save what we may have and clean up */ 2766 if (new_ce) { 2767 tbl[tbl_idx] = new_ce; 2768 __iommu_flush_cache(iommu, new_ce, 2769 VTD_PAGE_SIZE); 2770 pos = 1; 2771 } 2772 2773 if (old_ce) 2774 memunmap(old_ce); 2775 2776 ret = 0; 2777 if (devfn < 0x80) 2778 old_ce_phys = root_entry_lctp(&re); 2779 else 2780 old_ce_phys = root_entry_uctp(&re); 2781 2782 if (!old_ce_phys) { 2783 if (ext && devfn == 0) { 2784 /* No LCTP, try UCTP */ 2785 devfn = 0x7f; 2786 continue; 2787 } else { 2788 goto out; 2789 } 2790 } 2791 2792 ret = -ENOMEM; 2793 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2794 MEMREMAP_WB); 2795 if (!old_ce) 2796 goto out; 2797 2798 new_ce = alloc_pgtable_page(iommu->node); 2799 if (!new_ce) 2800 goto out_unmap; 2801 2802 ret = 0; 2803 } 2804 2805 /* Now copy the context entry */ 2806 memcpy(&ce, old_ce + idx, sizeof(ce)); 2807 2808 if (!__context_present(&ce)) 2809 continue; 2810 2811 did = context_domain_id(&ce); 2812 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2813 set_bit(did, iommu->domain_ids); 2814 2815 /* 2816 * We need a marker for copied context entries. This 2817 * marker needs to work for the old format as well as 2818 * for extended context entries. 2819 * 2820 * Bit 67 of the context entry is used. In the old 2821 * format this bit is available to software, in the 2822 * extended format it is the PGE bit, but PGE is ignored 2823 * by HW if PASIDs are disabled (and thus still 2824 * available). 2825 * 2826 * So disable PASIDs first and then mark the entry 2827 * copied. This means that we don't copy PASID 2828 * translations from the old kernel, but this is fine as 2829 * faults there are not fatal. 2830 */ 2831 context_clear_pasid_enable(&ce); 2832 context_set_copied(&ce); 2833 2834 new_ce[idx] = ce; 2835 } 2836 2837 tbl[tbl_idx + pos] = new_ce; 2838 2839 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2840 2841 out_unmap: 2842 memunmap(old_ce); 2843 2844 out: 2845 return ret; 2846 } 2847 2848 static int copy_translation_tables(struct intel_iommu *iommu) 2849 { 2850 struct context_entry **ctxt_tbls; 2851 struct root_entry *old_rt; 2852 phys_addr_t old_rt_phys; 2853 int ctxt_table_entries; 2854 unsigned long flags; 2855 u64 rtaddr_reg; 2856 int bus, ret; 2857 bool new_ext, ext; 2858 2859 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2860 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2861 new_ext = !!ecap_ecs(iommu->ecap); 2862 2863 /* 2864 * The RTT bit can only be changed when translation is disabled, 2865 * but disabling translation means to open a window for data 2866 * corruption. So bail out and don't copy anything if we would 2867 * have to change the bit. 2868 */ 2869 if (new_ext != ext) 2870 return -EINVAL; 2871 2872 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 2873 if (!old_rt_phys) 2874 return -EINVAL; 2875 2876 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 2877 if (!old_rt) 2878 return -ENOMEM; 2879 2880 /* This is too big for the stack - allocate it from slab */ 2881 ctxt_table_entries = ext ? 
512 : 256; 2882 ret = -ENOMEM; 2883 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 2884 if (!ctxt_tbls) 2885 goto out_unmap; 2886 2887 for (bus = 0; bus < 256; bus++) { 2888 ret = copy_context_table(iommu, &old_rt[bus], 2889 ctxt_tbls, bus, ext); 2890 if (ret) { 2891 pr_err("%s: Failed to copy context table for bus %d\n", 2892 iommu->name, bus); 2893 continue; 2894 } 2895 } 2896 2897 spin_lock_irqsave(&iommu->lock, flags); 2898 2899 /* Context tables are copied, now write them to the root_entry table */ 2900 for (bus = 0; bus < 256; bus++) { 2901 int idx = ext ? bus * 2 : bus; 2902 u64 val; 2903 2904 if (ctxt_tbls[idx]) { 2905 val = virt_to_phys(ctxt_tbls[idx]) | 1; 2906 iommu->root_entry[bus].lo = val; 2907 } 2908 2909 if (!ext || !ctxt_tbls[idx + 1]) 2910 continue; 2911 2912 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 2913 iommu->root_entry[bus].hi = val; 2914 } 2915 2916 spin_unlock_irqrestore(&iommu->lock, flags); 2917 2918 kfree(ctxt_tbls); 2919 2920 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 2921 2922 ret = 0; 2923 2924 out_unmap: 2925 memunmap(old_rt); 2926 2927 return ret; 2928 } 2929 2930 #ifdef CONFIG_INTEL_IOMMU_SVM 2931 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 2932 { 2933 struct intel_iommu *iommu = data; 2934 ioasid_t ioasid; 2935 2936 if (!iommu) 2937 return INVALID_IOASID; 2938 /* 2939 * VT-d virtual command interface always uses the full 20 bit 2940 * PASID range. Host can partition guest PASID range based on 2941 * policies but it is out of guest's control. 2942 */ 2943 if (min < PASID_MIN || max > intel_pasid_max_id) 2944 return INVALID_IOASID; 2945 2946 if (vcmd_alloc_pasid(iommu, &ioasid)) 2947 return INVALID_IOASID; 2948 2949 return ioasid; 2950 } 2951 2952 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 2953 { 2954 struct intel_iommu *iommu = data; 2955 2956 if (!iommu) 2957 return; 2958 /* 2959 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 2960 * We can only free the PASID when all the devices are unbound. 2961 */ 2962 if (ioasid_find(NULL, ioasid, NULL)) { 2963 pr_alert("Cannot free active IOASID %d\n", ioasid); 2964 return; 2965 } 2966 vcmd_free_pasid(iommu, ioasid); 2967 } 2968 2969 static void register_pasid_allocator(struct intel_iommu *iommu) 2970 { 2971 /* 2972 * If we are running in the host, no need for custom allocator 2973 * in that PASIDs are allocated from the host system-wide. 2974 */ 2975 if (!cap_caching_mode(iommu->cap)) 2976 return; 2977 2978 if (!sm_supported(iommu)) { 2979 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 2980 return; 2981 } 2982 2983 /* 2984 * Register a custom PASID allocator if we are running in a guest, 2985 * guest PASID must be obtained via virtual command interface. 2986 * There can be multiple vIOMMUs in each guest but only one allocator 2987 * is active. All vIOMMU allocators will eventually be calling the same 2988 * host allocator. 2989 */ 2990 if (!vccap_pasid(iommu->vccap)) 2991 return; 2992 2993 pr_info("Register custom PASID allocator\n"); 2994 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 2995 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 2996 iommu->pasid_allocator.pdata = (void *)iommu; 2997 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 2998 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 2999 /* 3000 * Disable scalable mode on this IOMMU if there 3001 * is no custom allocator. 
Mixing SM capable vIOMMU 3002 * and non-SM vIOMMU are not supported. 3003 */ 3004 intel_iommu_sm = 0; 3005 } 3006 } 3007 #endif 3008 3009 static int __init init_dmars(void) 3010 { 3011 struct dmar_drhd_unit *drhd; 3012 struct intel_iommu *iommu; 3013 int ret; 3014 3015 /* 3016 * for each drhd 3017 * allocate root 3018 * initialize and program root entry to not present 3019 * endfor 3020 */ 3021 for_each_drhd_unit(drhd) { 3022 /* 3023 * lock not needed as this is only incremented in the single 3024 * threaded kernel __init code path all other access are read 3025 * only 3026 */ 3027 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3028 g_num_of_iommus++; 3029 continue; 3030 } 3031 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3032 } 3033 3034 /* Preallocate enough resources for IOMMU hot-addition */ 3035 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3036 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3037 3038 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3039 GFP_KERNEL); 3040 if (!g_iommus) { 3041 ret = -ENOMEM; 3042 goto error; 3043 } 3044 3045 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3046 if (ret) 3047 goto free_iommu; 3048 3049 for_each_iommu(iommu, drhd) { 3050 if (drhd->ignored) { 3051 iommu_disable_translation(iommu); 3052 continue; 3053 } 3054 3055 /* 3056 * Find the max pasid size of all IOMMU's in the system. 3057 * We need to ensure the system pasid table is no bigger 3058 * than the smallest supported. 3059 */ 3060 if (pasid_supported(iommu)) { 3061 u32 temp = 2 << ecap_pss(iommu->ecap); 3062 3063 intel_pasid_max_id = min_t(u32, temp, 3064 intel_pasid_max_id); 3065 } 3066 3067 g_iommus[iommu->seq_id] = iommu; 3068 3069 intel_iommu_init_qi(iommu); 3070 3071 ret = iommu_init_domains(iommu); 3072 if (ret) 3073 goto free_iommu; 3074 3075 init_translation_status(iommu); 3076 3077 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3078 iommu_disable_translation(iommu); 3079 clear_translation_pre_enabled(iommu); 3080 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3081 iommu->name); 3082 } 3083 3084 /* 3085 * TBD: 3086 * we could share the same root & context tables 3087 * among all IOMMU's. Need to Split it later. 3088 */ 3089 ret = iommu_alloc_root_entry(iommu); 3090 if (ret) 3091 goto free_iommu; 3092 3093 if (translation_pre_enabled(iommu)) { 3094 pr_info("Translation already enabled - trying to copy translation structures\n"); 3095 3096 ret = copy_translation_tables(iommu); 3097 if (ret) { 3098 /* 3099 * We found the IOMMU with translation 3100 * enabled - but failed to copy over the 3101 * old root-entry table. Try to proceed 3102 * by disabling translation now and 3103 * allocating a clean root-entry table. 3104 * This might cause DMAR faults, but 3105 * probably the dump will still succeed. 3106 */ 3107 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3108 iommu->name); 3109 iommu_disable_translation(iommu); 3110 clear_translation_pre_enabled(iommu); 3111 } else { 3112 pr_info("Copied translation tables from previous kernel for %s\n", 3113 iommu->name); 3114 } 3115 } 3116 3117 if (!ecap_pass_through(iommu->ecap)) 3118 hw_pass_through = 0; 3119 intel_svm_check(iommu); 3120 } 3121 3122 /* 3123 * Now that qi is enabled on all iommus, set the root entry and flush 3124 * caches. This is required on some Intel X58 chipsets, otherwise the 3125 * flush_context function will loop forever and the boot hangs. 
3126 */ 3127 for_each_active_iommu(iommu, drhd) { 3128 iommu_flush_write_buffer(iommu); 3129 #ifdef CONFIG_INTEL_IOMMU_SVM 3130 register_pasid_allocator(iommu); 3131 #endif 3132 iommu_set_root_entry(iommu); 3133 } 3134 3135 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3136 dmar_map_gfx = 0; 3137 #endif 3138 3139 if (!dmar_map_gfx) 3140 iommu_identity_mapping |= IDENTMAP_GFX; 3141 3142 check_tylersburg_isoch(); 3143 3144 ret = si_domain_init(hw_pass_through); 3145 if (ret) 3146 goto free_iommu; 3147 3148 /* 3149 * for each drhd 3150 * enable fault log 3151 * global invalidate context cache 3152 * global invalidate iotlb 3153 * enable translation 3154 */ 3155 for_each_iommu(iommu, drhd) { 3156 if (drhd->ignored) { 3157 /* 3158 * we always have to disable PMRs or DMA may fail on 3159 * this device 3160 */ 3161 if (force_on) 3162 iommu_disable_protect_mem_regions(iommu); 3163 continue; 3164 } 3165 3166 iommu_flush_write_buffer(iommu); 3167 3168 #ifdef CONFIG_INTEL_IOMMU_SVM 3169 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3170 /* 3171 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3172 * could cause possible lock race condition. 3173 */ 3174 up_write(&dmar_global_lock); 3175 ret = intel_svm_enable_prq(iommu); 3176 down_write(&dmar_global_lock); 3177 if (ret) 3178 goto free_iommu; 3179 } 3180 #endif 3181 ret = dmar_set_interrupt(iommu); 3182 if (ret) 3183 goto free_iommu; 3184 } 3185 3186 return 0; 3187 3188 free_iommu: 3189 for_each_active_iommu(iommu, drhd) { 3190 disable_dmar_iommu(iommu); 3191 free_dmar_iommu(iommu); 3192 } 3193 3194 kfree(g_iommus); 3195 3196 error: 3197 return ret; 3198 } 3199 3200 static void __init init_no_remapping_devices(void) 3201 { 3202 struct dmar_drhd_unit *drhd; 3203 struct device *dev; 3204 int i; 3205 3206 for_each_drhd_unit(drhd) { 3207 if (!drhd->include_all) { 3208 for_each_active_dev_scope(drhd->devices, 3209 drhd->devices_cnt, i, dev) 3210 break; 3211 /* ignore DMAR unit if no devices exist */ 3212 if (i == drhd->devices_cnt) 3213 drhd->ignored = 1; 3214 } 3215 } 3216 3217 for_each_active_drhd_unit(drhd) { 3218 if (drhd->include_all) 3219 continue; 3220 3221 for_each_active_dev_scope(drhd->devices, 3222 drhd->devices_cnt, i, dev) 3223 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3224 break; 3225 if (i < drhd->devices_cnt) 3226 continue; 3227 3228 /* This IOMMU has *only* gfx devices. 
Either bypass it or 3229 set the gfx_mapped flag, as appropriate */ 3230 drhd->gfx_dedicated = 1; 3231 if (!dmar_map_gfx) 3232 drhd->ignored = 1; 3233 } 3234 } 3235 3236 #ifdef CONFIG_SUSPEND 3237 static int init_iommu_hw(void) 3238 { 3239 struct dmar_drhd_unit *drhd; 3240 struct intel_iommu *iommu = NULL; 3241 3242 for_each_active_iommu(iommu, drhd) 3243 if (iommu->qi) 3244 dmar_reenable_qi(iommu); 3245 3246 for_each_iommu(iommu, drhd) { 3247 if (drhd->ignored) { 3248 /* 3249 * we always have to disable PMRs or DMA may fail on 3250 * this device 3251 */ 3252 if (force_on) 3253 iommu_disable_protect_mem_regions(iommu); 3254 continue; 3255 } 3256 3257 iommu_flush_write_buffer(iommu); 3258 iommu_set_root_entry(iommu); 3259 iommu_enable_translation(iommu); 3260 iommu_disable_protect_mem_regions(iommu); 3261 } 3262 3263 return 0; 3264 } 3265 3266 static void iommu_flush_all(void) 3267 { 3268 struct dmar_drhd_unit *drhd; 3269 struct intel_iommu *iommu; 3270 3271 for_each_active_iommu(iommu, drhd) { 3272 iommu->flush.flush_context(iommu, 0, 0, 0, 3273 DMA_CCMD_GLOBAL_INVL); 3274 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3275 DMA_TLB_GLOBAL_FLUSH); 3276 } 3277 } 3278 3279 static int iommu_suspend(void) 3280 { 3281 struct dmar_drhd_unit *drhd; 3282 struct intel_iommu *iommu = NULL; 3283 unsigned long flag; 3284 3285 for_each_active_iommu(iommu, drhd) { 3286 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3287 GFP_KERNEL); 3288 if (!iommu->iommu_state) 3289 goto nomem; 3290 } 3291 3292 iommu_flush_all(); 3293 3294 for_each_active_iommu(iommu, drhd) { 3295 iommu_disable_translation(iommu); 3296 3297 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3298 3299 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3300 readl(iommu->reg + DMAR_FECTL_REG); 3301 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3302 readl(iommu->reg + DMAR_FEDATA_REG); 3303 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3304 readl(iommu->reg + DMAR_FEADDR_REG); 3305 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3306 readl(iommu->reg + DMAR_FEUADDR_REG); 3307 3308 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3309 } 3310 return 0; 3311 3312 nomem: 3313 for_each_active_iommu(iommu, drhd) 3314 kfree(iommu->iommu_state); 3315 3316 return -ENOMEM; 3317 } 3318 3319 static void iommu_resume(void) 3320 { 3321 struct dmar_drhd_unit *drhd; 3322 struct intel_iommu *iommu = NULL; 3323 unsigned long flag; 3324 3325 if (init_iommu_hw()) { 3326 if (force_on) 3327 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3328 else 3329 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3330 return; 3331 } 3332 3333 for_each_active_iommu(iommu, drhd) { 3334 3335 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3336 3337 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3338 iommu->reg + DMAR_FECTL_REG); 3339 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3340 iommu->reg + DMAR_FEDATA_REG); 3341 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3342 iommu->reg + DMAR_FEADDR_REG); 3343 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3344 iommu->reg + DMAR_FEUADDR_REG); 3345 3346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3347 } 3348 3349 for_each_active_iommu(iommu, drhd) 3350 kfree(iommu->iommu_state); 3351 } 3352 3353 static struct syscore_ops iommu_syscore_ops = { 3354 .resume = iommu_resume, 3355 .suspend = iommu_suspend, 3356 }; 3357 3358 static void __init init_iommu_pm_ops(void) 3359 { 3360 register_syscore_ops(&iommu_syscore_ops); 3361 } 3362 3363 #else 3364 static inline void init_iommu_pm_ops(void) {} 3365 #endif /* 
CONFIG_PM */ 3366 3367 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3368 { 3369 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3370 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3371 rmrr->end_address <= rmrr->base_address || 3372 arch_rmrr_sanity_check(rmrr)) 3373 return -EINVAL; 3374 3375 return 0; 3376 } 3377 3378 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3379 { 3380 struct acpi_dmar_reserved_memory *rmrr; 3381 struct dmar_rmrr_unit *rmrru; 3382 3383 rmrr = (struct acpi_dmar_reserved_memory *)header; 3384 if (rmrr_sanity_check(rmrr)) { 3385 pr_warn(FW_BUG 3386 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3387 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3388 rmrr->base_address, rmrr->end_address, 3389 dmi_get_system_info(DMI_BIOS_VENDOR), 3390 dmi_get_system_info(DMI_BIOS_VERSION), 3391 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3392 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3393 } 3394 3395 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3396 if (!rmrru) 3397 goto out; 3398 3399 rmrru->hdr = header; 3400 3401 rmrru->base_address = rmrr->base_address; 3402 rmrru->end_address = rmrr->end_address; 3403 3404 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3405 ((void *)rmrr) + rmrr->header.length, 3406 &rmrru->devices_cnt); 3407 if (rmrru->devices_cnt && rmrru->devices == NULL) 3408 goto free_rmrru; 3409 3410 list_add(&rmrru->list, &dmar_rmrr_units); 3411 3412 return 0; 3413 free_rmrru: 3414 kfree(rmrru); 3415 out: 3416 return -ENOMEM; 3417 } 3418 3419 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3420 { 3421 struct dmar_atsr_unit *atsru; 3422 struct acpi_dmar_atsr *tmp; 3423 3424 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3425 dmar_rcu_check()) { 3426 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3427 if (atsr->segment != tmp->segment) 3428 continue; 3429 if (atsr->header.length != tmp->header.length) 3430 continue; 3431 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3432 return atsru; 3433 } 3434 3435 return NULL; 3436 } 3437 3438 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3439 { 3440 struct acpi_dmar_atsr *atsr; 3441 struct dmar_atsr_unit *atsru; 3442 3443 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3444 return 0; 3445 3446 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3447 atsru = dmar_find_atsr(atsr); 3448 if (atsru) 3449 return 0; 3450 3451 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3452 if (!atsru) 3453 return -ENOMEM; 3454 3455 /* 3456 * If memory is allocated from slab by ACPI _DSM method, we need to 3457 * copy the memory content because the memory buffer will be freed 3458 * on return. 
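	 * The copy is placed in the same allocation, immediately after the
	 * dmar_atsr_unit itself, which is why hdr->length extra bytes were
	 * requested from kzalloc() above.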
3459 */ 3460 atsru->hdr = (void *)(atsru + 1); 3461 memcpy(atsru->hdr, hdr, hdr->length); 3462 atsru->include_all = atsr->flags & 0x1; 3463 if (!atsru->include_all) { 3464 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3465 (void *)atsr + atsr->header.length, 3466 &atsru->devices_cnt); 3467 if (atsru->devices_cnt && atsru->devices == NULL) { 3468 kfree(atsru); 3469 return -ENOMEM; 3470 } 3471 } 3472 3473 list_add_rcu(&atsru->list, &dmar_atsr_units); 3474 3475 return 0; 3476 } 3477 3478 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3479 { 3480 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3481 kfree(atsru); 3482 } 3483 3484 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3485 { 3486 struct acpi_dmar_atsr *atsr; 3487 struct dmar_atsr_unit *atsru; 3488 3489 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3490 atsru = dmar_find_atsr(atsr); 3491 if (atsru) { 3492 list_del_rcu(&atsru->list); 3493 synchronize_rcu(); 3494 intel_iommu_free_atsr(atsru); 3495 } 3496 3497 return 0; 3498 } 3499 3500 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3501 { 3502 int i; 3503 struct device *dev; 3504 struct acpi_dmar_atsr *atsr; 3505 struct dmar_atsr_unit *atsru; 3506 3507 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3508 atsru = dmar_find_atsr(atsr); 3509 if (!atsru) 3510 return 0; 3511 3512 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3513 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3514 i, dev) 3515 return -EBUSY; 3516 } 3517 3518 return 0; 3519 } 3520 3521 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3522 { 3523 struct dmar_satc_unit *satcu; 3524 struct acpi_dmar_satc *tmp; 3525 3526 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3527 dmar_rcu_check()) { 3528 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3529 if (satc->segment != tmp->segment) 3530 continue; 3531 if (satc->header.length != tmp->header.length) 3532 continue; 3533 if (memcmp(satc, tmp, satc->header.length) == 0) 3534 return satcu; 3535 } 3536 3537 return NULL; 3538 } 3539 3540 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3541 { 3542 struct acpi_dmar_satc *satc; 3543 struct dmar_satc_unit *satcu; 3544 3545 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3546 return 0; 3547 3548 satc = container_of(hdr, struct acpi_dmar_satc, header); 3549 satcu = dmar_find_satc(satc); 3550 if (satcu) 3551 return 0; 3552 3553 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3554 if (!satcu) 3555 return -ENOMEM; 3556 3557 satcu->hdr = (void *)(satcu + 1); 3558 memcpy(satcu->hdr, hdr, hdr->length); 3559 satcu->atc_required = satc->flags & 0x1; 3560 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3561 (void *)satc + satc->header.length, 3562 &satcu->devices_cnt); 3563 if (satcu->devices_cnt && !satcu->devices) { 3564 kfree(satcu); 3565 return -ENOMEM; 3566 } 3567 list_add_rcu(&satcu->list, &dmar_satc_units); 3568 3569 return 0; 3570 } 3571 3572 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3573 { 3574 int sp, ret; 3575 struct intel_iommu *iommu = dmaru->iommu; 3576 3577 if (g_iommus[iommu->seq_id]) 3578 return 0; 3579 3580 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3581 if (ret) 3582 goto out; 3583 3584 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3585 pr_warn("%s: Doesn't support hardware pass through.\n", 3586 iommu->name); 3587 return -ENXIO; 3588 } 3589 if (!ecap_sc_support(iommu->ecap) && 3590 
domain_update_iommu_snooping(iommu)) { 3591 pr_warn("%s: Doesn't support snooping.\n", 3592 iommu->name); 3593 return -ENXIO; 3594 } 3595 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3596 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3597 pr_warn("%s: Doesn't support large page.\n", 3598 iommu->name); 3599 return -ENXIO; 3600 } 3601 3602 /* 3603 * Disable translation if already enabled prior to OS handover. 3604 */ 3605 if (iommu->gcmd & DMA_GCMD_TE) 3606 iommu_disable_translation(iommu); 3607 3608 g_iommus[iommu->seq_id] = iommu; 3609 ret = iommu_init_domains(iommu); 3610 if (ret == 0) 3611 ret = iommu_alloc_root_entry(iommu); 3612 if (ret) 3613 goto out; 3614 3615 intel_svm_check(iommu); 3616 3617 if (dmaru->ignored) { 3618 /* 3619 * we always have to disable PMRs or DMA may fail on this device 3620 */ 3621 if (force_on) 3622 iommu_disable_protect_mem_regions(iommu); 3623 return 0; 3624 } 3625 3626 intel_iommu_init_qi(iommu); 3627 iommu_flush_write_buffer(iommu); 3628 3629 #ifdef CONFIG_INTEL_IOMMU_SVM 3630 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3631 ret = intel_svm_enable_prq(iommu); 3632 if (ret) 3633 goto disable_iommu; 3634 } 3635 #endif 3636 ret = dmar_set_interrupt(iommu); 3637 if (ret) 3638 goto disable_iommu; 3639 3640 iommu_set_root_entry(iommu); 3641 iommu_enable_translation(iommu); 3642 3643 iommu_disable_protect_mem_regions(iommu); 3644 return 0; 3645 3646 disable_iommu: 3647 disable_dmar_iommu(iommu); 3648 out: 3649 free_dmar_iommu(iommu); 3650 return ret; 3651 } 3652 3653 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3654 { 3655 int ret = 0; 3656 struct intel_iommu *iommu = dmaru->iommu; 3657 3658 if (!intel_iommu_enabled) 3659 return 0; 3660 if (iommu == NULL) 3661 return -EINVAL; 3662 3663 if (insert) { 3664 ret = intel_iommu_add(dmaru); 3665 } else { 3666 disable_dmar_iommu(iommu); 3667 free_dmar_iommu(iommu); 3668 } 3669 3670 return ret; 3671 } 3672 3673 static void intel_iommu_free_dmars(void) 3674 { 3675 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3676 struct dmar_atsr_unit *atsru, *atsr_n; 3677 struct dmar_satc_unit *satcu, *satc_n; 3678 3679 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3680 list_del(&rmrru->list); 3681 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3682 kfree(rmrru); 3683 } 3684 3685 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3686 list_del(&atsru->list); 3687 intel_iommu_free_atsr(atsru); 3688 } 3689 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3690 list_del(&satcu->list); 3691 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3692 kfree(satcu); 3693 } 3694 } 3695 3696 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) 3697 { 3698 struct dmar_satc_unit *satcu; 3699 struct acpi_dmar_satc *satc; 3700 struct device *tmp; 3701 int i; 3702 3703 dev = pci_physfn(dev); 3704 rcu_read_lock(); 3705 3706 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { 3707 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3708 if (satc->segment != pci_domain_nr(dev->bus)) 3709 continue; 3710 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) 3711 if (to_pci_dev(tmp) == dev) 3712 goto out; 3713 } 3714 satcu = NULL; 3715 out: 3716 rcu_read_unlock(); 3717 return satcu; 3718 } 3719 3720 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) 3721 { 3722 int i, ret = 1; 3723 struct pci_bus *bus; 3724 struct pci_dev *bridge = NULL; 3725 struct device *tmp; 
3726 struct acpi_dmar_atsr *atsr; 3727 struct dmar_atsr_unit *atsru; 3728 struct dmar_satc_unit *satcu; 3729 3730 dev = pci_physfn(dev); 3731 satcu = dmar_find_matched_satc_unit(dev); 3732 if (satcu) 3733 /* 3734 * This device supports ATS as it is in SATC table. 3735 * When IOMMU is in legacy mode, enabling ATS is done 3736 * automatically by HW for the device that requires 3737 * ATS, hence OS should not enable this device ATS 3738 * to avoid duplicated TLB invalidation. 3739 */ 3740 return !(satcu->atc_required && !sm_supported(iommu)); 3741 3742 for (bus = dev->bus; bus; bus = bus->parent) { 3743 bridge = bus->self; 3744 /* If it's an integrated device, allow ATS */ 3745 if (!bridge) 3746 return 1; 3747 /* Connected via non-PCIe: no ATS */ 3748 if (!pci_is_pcie(bridge) || 3749 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3750 return 0; 3751 /* If we found the root port, look it up in the ATSR */ 3752 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3753 break; 3754 } 3755 3756 rcu_read_lock(); 3757 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3758 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3759 if (atsr->segment != pci_domain_nr(dev->bus)) 3760 continue; 3761 3762 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3763 if (tmp == &bridge->dev) 3764 goto out; 3765 3766 if (atsru->include_all) 3767 goto out; 3768 } 3769 ret = 0; 3770 out: 3771 rcu_read_unlock(); 3772 3773 return ret; 3774 } 3775 3776 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3777 { 3778 int ret; 3779 struct dmar_rmrr_unit *rmrru; 3780 struct dmar_atsr_unit *atsru; 3781 struct dmar_satc_unit *satcu; 3782 struct acpi_dmar_atsr *atsr; 3783 struct acpi_dmar_reserved_memory *rmrr; 3784 struct acpi_dmar_satc *satc; 3785 3786 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3787 return 0; 3788 3789 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3790 rmrr = container_of(rmrru->hdr, 3791 struct acpi_dmar_reserved_memory, header); 3792 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3793 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3794 ((void *)rmrr) + rmrr->header.length, 3795 rmrr->segment, rmrru->devices, 3796 rmrru->devices_cnt); 3797 if (ret < 0) 3798 return ret; 3799 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3800 dmar_remove_dev_scope(info, rmrr->segment, 3801 rmrru->devices, rmrru->devices_cnt); 3802 } 3803 } 3804 3805 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3806 if (atsru->include_all) 3807 continue; 3808 3809 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3810 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3811 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3812 (void *)atsr + atsr->header.length, 3813 atsr->segment, atsru->devices, 3814 atsru->devices_cnt); 3815 if (ret > 0) 3816 break; 3817 else if (ret < 0) 3818 return ret; 3819 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3820 if (dmar_remove_dev_scope(info, atsr->segment, 3821 atsru->devices, atsru->devices_cnt)) 3822 break; 3823 } 3824 } 3825 list_for_each_entry(satcu, &dmar_satc_units, list) { 3826 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 3827 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3828 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 3829 (void *)satc + satc->header.length, 3830 satc->segment, satcu->devices, 3831 satcu->devices_cnt); 3832 if (ret > 0) 3833 break; 3834 else if (ret < 0) 3835 return ret; 3836 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3837 if 
(dmar_remove_dev_scope(info, satc->segment, 3838 satcu->devices, satcu->devices_cnt)) 3839 break; 3840 } 3841 } 3842 3843 return 0; 3844 } 3845 3846 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3847 unsigned long val, void *v) 3848 { 3849 struct memory_notify *mhp = v; 3850 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3851 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3852 mhp->nr_pages - 1); 3853 3854 switch (val) { 3855 case MEM_GOING_ONLINE: 3856 if (iommu_domain_identity_map(si_domain, 3857 start_vpfn, last_vpfn)) { 3858 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3859 start_vpfn, last_vpfn); 3860 return NOTIFY_BAD; 3861 } 3862 break; 3863 3864 case MEM_OFFLINE: 3865 case MEM_CANCEL_ONLINE: 3866 { 3867 struct dmar_drhd_unit *drhd; 3868 struct intel_iommu *iommu; 3869 LIST_HEAD(freelist); 3870 3871 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 3872 3873 rcu_read_lock(); 3874 for_each_active_iommu(iommu, drhd) 3875 iommu_flush_iotlb_psi(iommu, si_domain, 3876 start_vpfn, mhp->nr_pages, 3877 list_empty(&freelist), 0); 3878 rcu_read_unlock(); 3879 put_pages_list(&freelist); 3880 } 3881 break; 3882 } 3883 3884 return NOTIFY_OK; 3885 } 3886 3887 static struct notifier_block intel_iommu_memory_nb = { 3888 .notifier_call = intel_iommu_memory_notifier, 3889 .priority = 0 3890 }; 3891 3892 static void intel_disable_iommus(void) 3893 { 3894 struct intel_iommu *iommu = NULL; 3895 struct dmar_drhd_unit *drhd; 3896 3897 for_each_iommu(iommu, drhd) 3898 iommu_disable_translation(iommu); 3899 } 3900 3901 void intel_iommu_shutdown(void) 3902 { 3903 struct dmar_drhd_unit *drhd; 3904 struct intel_iommu *iommu = NULL; 3905 3906 if (no_iommu || dmar_disabled) 3907 return; 3908 3909 down_write(&dmar_global_lock); 3910 3911 /* Disable PMRs explicitly here. 
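	 * Disabling translation further down does not clear protected
	 * memory regions on its own, so they are torn down first.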
*/ 3912 for_each_iommu(iommu, drhd) 3913 iommu_disable_protect_mem_regions(iommu); 3914 3915 /* Make sure the IOMMUs are switched off */ 3916 intel_disable_iommus(); 3917 3918 up_write(&dmar_global_lock); 3919 } 3920 3921 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 3922 { 3923 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 3924 3925 return container_of(iommu_dev, struct intel_iommu, iommu); 3926 } 3927 3928 static ssize_t version_show(struct device *dev, 3929 struct device_attribute *attr, char *buf) 3930 { 3931 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3932 u32 ver = readl(iommu->reg + DMAR_VER_REG); 3933 return sprintf(buf, "%d:%d\n", 3934 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 3935 } 3936 static DEVICE_ATTR_RO(version); 3937 3938 static ssize_t address_show(struct device *dev, 3939 struct device_attribute *attr, char *buf) 3940 { 3941 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3942 return sprintf(buf, "%llx\n", iommu->reg_phys); 3943 } 3944 static DEVICE_ATTR_RO(address); 3945 3946 static ssize_t cap_show(struct device *dev, 3947 struct device_attribute *attr, char *buf) 3948 { 3949 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3950 return sprintf(buf, "%llx\n", iommu->cap); 3951 } 3952 static DEVICE_ATTR_RO(cap); 3953 3954 static ssize_t ecap_show(struct device *dev, 3955 struct device_attribute *attr, char *buf) 3956 { 3957 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3958 return sprintf(buf, "%llx\n", iommu->ecap); 3959 } 3960 static DEVICE_ATTR_RO(ecap); 3961 3962 static ssize_t domains_supported_show(struct device *dev, 3963 struct device_attribute *attr, char *buf) 3964 { 3965 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3966 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 3967 } 3968 static DEVICE_ATTR_RO(domains_supported); 3969 3970 static ssize_t domains_used_show(struct device *dev, 3971 struct device_attribute *attr, char *buf) 3972 { 3973 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 3974 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 3975 cap_ndoms(iommu->cap))); 3976 } 3977 static DEVICE_ATTR_RO(domains_used); 3978 3979 static struct attribute *intel_iommu_attrs[] = { 3980 &dev_attr_version.attr, 3981 &dev_attr_address.attr, 3982 &dev_attr_cap.attr, 3983 &dev_attr_ecap.attr, 3984 &dev_attr_domains_supported.attr, 3985 &dev_attr_domains_used.attr, 3986 NULL, 3987 }; 3988 3989 static struct attribute_group intel_iommu_group = { 3990 .name = "intel-iommu", 3991 .attrs = intel_iommu_attrs, 3992 }; 3993 3994 const struct attribute_group *intel_iommu_groups[] = { 3995 &intel_iommu_group, 3996 NULL, 3997 }; 3998 3999 static inline bool has_external_pci(void) 4000 { 4001 struct pci_dev *pdev = NULL; 4002 4003 for_each_pci_dev(pdev) 4004 if (pdev->external_facing) 4005 return true; 4006 4007 return false; 4008 } 4009 4010 static int __init platform_optin_force_iommu(void) 4011 { 4012 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4013 return 0; 4014 4015 if (no_iommu || dmar_disabled) 4016 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4017 4018 /* 4019 * If Intel-IOMMU is disabled by default, we will apply identity 4020 * map for all devices except those marked as being untrusted. 
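	 * The iommu_set_default_passthrough() call below arranges that by
	 * switching the default domain type.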

static inline bool has_external_pci(void)
{
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev)
		if (pdev->external_facing)
			return true;

	return false;
}

static int __init platform_optin_force_iommu(void)
{
	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
		return 0;

	if (no_iommu || dmar_disabled)
		pr_info("Intel-IOMMU force enabled due to platform opt in\n");

	/*
	 * If Intel-IOMMU is disabled by default, we will apply identity
	 * map for all devices except those marked as being untrusted.
	 */
	if (dmar_disabled)
		iommu_set_default_passthrough(false);

	dmar_disabled = 0;
	no_iommu = 0;

	return 1;
}

static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct iommu_group *group;
			struct acpi_device *adev;

			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				group = iommu_group_get(pn->dev);
				if (group) {
					iommu_group_put(group);
					continue;
				}

				pn->dev->bus->iommu_ops = &intel_iommu_ops;
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			if (ret)
				return ret;
		}
	}

	return 0;
}

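/*
 * Boot-time entry point: parse the DMAR tables, set up the translation
 * structures via init_dmars(), register each IOMMU with the core and with
 * sysfs, and finally enable DMA remapping on units that did not already
 * have translation enabled by firmware or a previous kernel.
 */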
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments. The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
	}
	up_read(&dmar_global_lock);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}

static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct device_domain_info *info = opaque;

	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct device_domain_info *info)
{
	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(info->dev),
			       &domain_context_clear_one_cb, info);
}

static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;
	domain = info->domain;

	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID, false);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(info);
		intel_pasid_free_table(info->dev);
	}

	list_del(&info->link);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void dmar_remove_one_dev_info(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev_iommu_priv_get(dev);
	if (info)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = false;
	domain->iommu_snooping = false;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}

static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	switch (type) {
	case IOMMU_DOMAIN_DMA:
	case IOMMU_DOMAIN_DMA_FQ:
	case IOMMU_DOMAIN_UNMANAGED:
		dmar_domain = alloc_domain(type);
		if (!dmar_domain) {
			pr_err("Can't allocate dmar_domain\n");
			return NULL;
		}
		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
			pr_err("Domain initialization failed\n");
			domain_exit(dmar_domain);
			return NULL;
		}

		domain = &dmar_domain->domain;
		domain->geometry.aperture_start = 0;
		domain->geometry.aperture_end =
				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
		domain->geometry.force_aperture = true;

		return domain;
	case IOMMU_DOMAIN_IDENTITY:
		return &si_domain->domain;
	default:
		return NULL;
	}

	return NULL;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}

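/*
 * Clamp the domain's address width to what this IOMMU supports and drop
 * unused top-level page tables so that the domain's AGAW never exceeds
 * the IOMMU's.
 */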
static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}

static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct device_domain_info *info = dev_iommu_priv_get(dev);

		if (info->domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot, gfp_t gfp)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				hpa >> VTD_PAGE_SHIFT, size, prot);
}

static int intel_iommu_map_pages(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t paddr,
				 size_t pgsize, size_t pgcount,
				 int prot, gfp_t gfp, size_t *mapped)
{
	unsigned long pgshift = __ffs(pgsize);
	size_t size = pgcount << pgshift;
	int ret;

	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
		return -EINVAL;

	if (!IS_ALIGNED(iova | paddr, pgsize))
		return -EINVAL;

	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
	if (!ret && mapped)
		*mapped = size;

	return ret;
}

static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start_pfn, last_pfn;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}

static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
				      unsigned long iova,
				      size_t pgsize, size_t pgcount,
				      struct iommu_iotlb_gather *gather)
{
	unsigned long pgshift = __ffs(pgsize);
	size_t size = pgcount << pgshift;

	return intel_iommu_unmap(domain, iova, size, gather);
}

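/*
 * Drain an iommu_iotlb_gather: issue a page-selective IOTLB flush on every
 * IOMMU backing the domain, then free the page-table pages queued on the
 * gather's freelist.
 */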
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
				 struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long iova_pfn = IOVA_PFN(gather->start);
	size_t size = gather->end - gather->start;
	unsigned long start_pfn;
	unsigned long nrpages;
	int iommu_id;

	nrpages = aligned_nrpages(gather->start, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, nrpages,
				      list_empty(&gather->freelist), 0);

	put_pages_list(&gather->freelist);
}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte && dma_pte_present(pte))
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL);
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}

static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	if (dev_is_real_dma_subdevice(dev)) {
		info->bus = pdev->bus->number;
		info->devfn = pdev->devfn;
		info->segment = pci_domain_nr(pdev->bus);
	} else {
		info->bus = bus;
		info->devfn = devfn;
		info->segment = iommu->segment;
	}

	info->dev = dev;
	info->iommu = iommu;
	if (dev_is_pci(dev)) {
		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_ats_supported(pdev) &&
		    dmar_ats_supported(pdev, iommu))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);

				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_pri_supported(pdev))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->global, &device_domain_list);
	dev_iommu_priv_set(dev, info);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return &iommu->iommu;
}

static void intel_iommu_release_device(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	unsigned long flags;

	dmar_remove_one_dev_info(dev);

	spin_lock_irqsave(&device_domain_lock, flags);
	dev_iommu_priv_set(dev, NULL);
	list_del(&info->global);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	kfree(info);
	set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}

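/*
 * Report reserved regions for this device: RMRR ranges as direct-mapped
 * regions (relaxable where the device permits it), an optional ISA range
 * for the floppy workaround, and the IOAPIC/MSI window.
 */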
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = info->domain;
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	if (!info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

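/*
 * SVA requires an SVM-capable IOMMU (VTD_FLAG_SVM_CAPABLE) and a device
 * with PASID, PRI and ATS enabled; recoverable faults are then serviced
 * through the IOMMU's IOPF queue.
 */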
static int intel_iommu_enable_sva(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu;
	int ret;

	if (!info || dmar_disabled)
		return -EINVAL;

	iommu = info->iommu;
	if (!iommu)
		return -EINVAL;

	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
		return -ENODEV;

	if (intel_iommu_enable_pasid(iommu, dev))
		return -ENODEV;

	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
		return -EINVAL;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (!ret)
		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);

	return ret;
}

static int intel_iommu_disable_sva(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct intel_iommu *iommu = info->iommu;
	int ret;

	ret = iommu_unregister_device_fault_handler(dev);
	if (!ret)
		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);

	return ret;
}

static int intel_iommu_enable_iopf(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	if (info && info->pri_supported)
		return 0;

	return -ENODEV;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		return intel_iommu_enable_iopf(dev);

	case IOMMU_DEV_FEAT_SVA:
		return intel_iommu_enable_sva(dev);

	default:
		return -ENODEV;
	}
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	switch (feat) {
	case IOMMU_DEV_FEAT_IOPF:
		return 0;

	case IOMMU_DEV_FEAT_SVA:
		return intel_iommu_disable_sva(dev);

	default:
		return -ENODEV;
	}
}

static bool intel_iommu_is_attach_deferred(struct device *dev)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	return translation_pre_enabled(info->iommu) && !info->domain;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}

static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				       unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct intel_iommu *iommu;
	int iommu_id;

	for_each_domain_iommu(iommu_id, dmar_domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
	}
}

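/*
 * Callbacks wired into the generic IOMMU core; the nested
 * default_domain_ops cover the per-domain map/unmap/flush paths.
 */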
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.detach_dev		= intel_iommu_detach_device,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all	= intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
	}
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far.
	*/
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}