// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt) "DMAR: " fmt
#define dev_fmt(fmt) pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/intel-svm.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "../irq_remapping.h"
#include "../iommu-sva-lib.h"
#include "pasid.h"
#include "cap_audit.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;
static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn);

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
275 */ 276 static struct dmar_domain *si_domain; 277 static int hw_pass_through = 1; 278 279 #define for_each_domain_iommu(idx, domain) \ 280 for (idx = 0; idx < g_num_of_iommus; idx++) \ 281 if (domain->iommu_refcnt[idx]) 282 283 struct dmar_rmrr_unit { 284 struct list_head list; /* list of rmrr units */ 285 struct acpi_dmar_header *hdr; /* ACPI header */ 286 u64 base_address; /* reserved base address*/ 287 u64 end_address; /* reserved end address */ 288 struct dmar_dev_scope *devices; /* target devices */ 289 int devices_cnt; /* target device count */ 290 }; 291 292 struct dmar_atsr_unit { 293 struct list_head list; /* list of ATSR units */ 294 struct acpi_dmar_header *hdr; /* ACPI header */ 295 struct dmar_dev_scope *devices; /* target devices */ 296 int devices_cnt; /* target device count */ 297 u8 include_all:1; /* include all ports */ 298 }; 299 300 struct dmar_satc_unit { 301 struct list_head list; /* list of SATC units */ 302 struct acpi_dmar_header *hdr; /* ACPI header */ 303 struct dmar_dev_scope *devices; /* target devices */ 304 struct intel_iommu *iommu; /* the corresponding iommu */ 305 int devices_cnt; /* target device count */ 306 u8 atc_required:1; /* ATS is required */ 307 }; 308 309 static LIST_HEAD(dmar_atsr_units); 310 static LIST_HEAD(dmar_rmrr_units); 311 static LIST_HEAD(dmar_satc_units); 312 313 #define for_each_rmrr_units(rmrr) \ 314 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 315 316 /* bitmap for indexing intel_iommus */ 317 static int g_num_of_iommus; 318 319 static void domain_exit(struct dmar_domain *domain); 320 static void domain_remove_dev_info(struct dmar_domain *domain); 321 static void dmar_remove_one_dev_info(struct device *dev); 322 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 323 static int intel_iommu_attach_device(struct iommu_domain *domain, 324 struct device *dev); 325 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 326 dma_addr_t iova); 327 328 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 329 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 330 331 int intel_iommu_enabled = 0; 332 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 333 334 static int dmar_map_gfx = 1; 335 static int intel_iommu_superpage = 1; 336 static int iommu_identity_mapping; 337 static int iommu_skip_te_disable; 338 339 #define IDENTMAP_GFX 2 340 #define IDENTMAP_AZALIA 4 341 342 int intel_iommu_gfx_mapped; 343 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 344 345 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) 346 struct device_domain_info *get_domain_info(struct device *dev) 347 { 348 struct device_domain_info *info; 349 350 if (!dev) 351 return NULL; 352 353 info = dev_iommu_priv_get(dev); 354 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) 355 return NULL; 356 357 return info; 358 } 359 360 DEFINE_SPINLOCK(device_domain_lock); 361 static LIST_HEAD(device_domain_list); 362 363 /* 364 * Iterate over elements in device_domain_list and call the specified 365 * callback @fn against each element. 
366 */ 367 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 368 void *data), void *data) 369 { 370 int ret = 0; 371 unsigned long flags; 372 struct device_domain_info *info; 373 374 spin_lock_irqsave(&device_domain_lock, flags); 375 list_for_each_entry(info, &device_domain_list, global) { 376 ret = fn(info, data); 377 if (ret) { 378 spin_unlock_irqrestore(&device_domain_lock, flags); 379 return ret; 380 } 381 } 382 spin_unlock_irqrestore(&device_domain_lock, flags); 383 384 return 0; 385 } 386 387 const struct iommu_ops intel_iommu_ops; 388 389 static bool translation_pre_enabled(struct intel_iommu *iommu) 390 { 391 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 392 } 393 394 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 395 { 396 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 397 } 398 399 static void init_translation_status(struct intel_iommu *iommu) 400 { 401 u32 gsts; 402 403 gsts = readl(iommu->reg + DMAR_GSTS_REG); 404 if (gsts & DMA_GSTS_TES) 405 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 406 } 407 408 static int __init intel_iommu_setup(char *str) 409 { 410 if (!str) 411 return -EINVAL; 412 413 while (*str) { 414 if (!strncmp(str, "on", 2)) { 415 dmar_disabled = 0; 416 pr_info("IOMMU enabled\n"); 417 } else if (!strncmp(str, "off", 3)) { 418 dmar_disabled = 1; 419 no_platform_optin = 1; 420 pr_info("IOMMU disabled\n"); 421 } else if (!strncmp(str, "igfx_off", 8)) { 422 dmar_map_gfx = 0; 423 pr_info("Disable GFX device mapping\n"); 424 } else if (!strncmp(str, "forcedac", 8)) { 425 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 426 iommu_dma_forcedac = true; 427 } else if (!strncmp(str, "strict", 6)) { 428 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 429 iommu_set_dma_strict(); 430 } else if (!strncmp(str, "sp_off", 6)) { 431 pr_info("Disable supported super page\n"); 432 intel_iommu_superpage = 0; 433 } else if (!strncmp(str, "sm_on", 5)) { 434 pr_info("Enable scalable mode if hardware supports\n"); 435 intel_iommu_sm = 1; 436 } else if (!strncmp(str, "sm_off", 6)) { 437 pr_info("Scalable mode is disallowed\n"); 438 intel_iommu_sm = 0; 439 } else if (!strncmp(str, "tboot_noforce", 13)) { 440 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); 441 intel_iommu_tboot_noforce = 1; 442 } else { 443 pr_notice("Unknown option - '%s'\n", str); 444 } 445 446 str += strcspn(str, ","); 447 while (*str == ',') 448 str++; 449 } 450 451 return 1; 452 } 453 __setup("intel_iommu=", intel_iommu_setup); 454 455 static struct kmem_cache *iommu_domain_cache; 456 static struct kmem_cache *iommu_devinfo_cache; 457 458 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did) 459 { 460 struct dmar_domain **domains; 461 int idx = did >> 8; 462 463 domains = iommu->domains[idx]; 464 if (!domains) 465 return NULL; 466 467 return domains[did & 0xff]; 468 } 469 470 static void set_iommu_domain(struct intel_iommu *iommu, u16 did, 471 struct dmar_domain *domain) 472 { 473 struct dmar_domain **domains; 474 int idx = did >> 8; 475 476 if (!iommu->domains[idx]) { 477 size_t size = 256 * sizeof(struct dmar_domain *); 478 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC); 479 } 480 481 domains = iommu->domains[idx]; 482 if (WARN_ON(!domains)) 483 return; 484 else 485 domains[did & 0xff] = domain; 486 } 487 488 void *alloc_pgtable_page(int node) 489 { 490 struct page *page; 491 void *vaddr = NULL; 492 493 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 494 if (page) 495 vaddr = page_address(page); 496 return vaddr; 497 } 498 499 void free_pgtable_page(void *vaddr) 500 { 501 free_page((unsigned long)vaddr); 502 } 503 504 static inline void *alloc_domain_mem(void) 505 { 506 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); 507 } 508 509 static void free_domain_mem(void *vaddr) 510 { 511 kmem_cache_free(iommu_domain_cache, vaddr); 512 } 513 514 static inline void * alloc_devinfo_mem(void) 515 { 516 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); 517 } 518 519 static inline void free_devinfo_mem(void *vaddr) 520 { 521 kmem_cache_free(iommu_devinfo_cache, vaddr); 522 } 523 524 static inline int domain_type_is_si(struct dmar_domain *domain) 525 { 526 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 527 } 528 529 static inline bool domain_use_first_level(struct dmar_domain *domain) 530 { 531 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 532 } 533 534 static inline int domain_pfn_supported(struct dmar_domain *domain, 535 unsigned long pfn) 536 { 537 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 538 539 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 540 } 541 542 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 543 { 544 unsigned long sagaw; 545 int agaw; 546 547 sagaw = cap_sagaw(iommu->cap); 548 for (agaw = width_to_agaw(max_gaw); 549 agaw >= 0; agaw--) { 550 if (test_bit(agaw, &sagaw)) 551 break; 552 } 553 554 return agaw; 555 } 556 557 /* 558 * Calculate max SAGAW for each iommu. 559 */ 560 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 561 { 562 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 563 } 564 565 /* 566 * calculate agaw for each iommu. 567 * "SAGAW" may be different across iommus, use a default agaw, and 568 * get a supported less agaw for iommus that don't support the default agaw. 569 */ 570 int iommu_calculate_agaw(struct intel_iommu *iommu) 571 { 572 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 573 } 574 575 /* This functionin only returns single iommu in a domain */ 576 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 577 { 578 int iommu_id; 579 580 /* si_domain and vm domain should not get here. 
*/ 581 if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) 582 return NULL; 583 584 for_each_domain_iommu(iommu_id, domain) 585 break; 586 587 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 588 return NULL; 589 590 return g_iommus[iommu_id]; 591 } 592 593 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 594 { 595 return sm_supported(iommu) ? 596 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 597 } 598 599 static void domain_update_iommu_coherency(struct dmar_domain *domain) 600 { 601 struct dmar_drhd_unit *drhd; 602 struct intel_iommu *iommu; 603 bool found = false; 604 int i; 605 606 domain->iommu_coherency = true; 607 608 for_each_domain_iommu(i, domain) { 609 found = true; 610 if (!iommu_paging_structure_coherency(g_iommus[i])) { 611 domain->iommu_coherency = false; 612 break; 613 } 614 } 615 if (found) 616 return; 617 618 /* No hardware attached; use lowest common denominator */ 619 rcu_read_lock(); 620 for_each_active_iommu(iommu, drhd) { 621 if (!iommu_paging_structure_coherency(iommu)) { 622 domain->iommu_coherency = false; 623 break; 624 } 625 } 626 rcu_read_unlock(); 627 } 628 629 static bool domain_update_iommu_snooping(struct intel_iommu *skip) 630 { 631 struct dmar_drhd_unit *drhd; 632 struct intel_iommu *iommu; 633 bool ret = true; 634 635 rcu_read_lock(); 636 for_each_active_iommu(iommu, drhd) { 637 if (iommu != skip) { 638 /* 639 * If the hardware is operating in the scalable mode, 640 * the snooping control is always supported since we 641 * always set PASID-table-entry.PGSNP bit if the domain 642 * is managed outside (UNMANAGED). 643 */ 644 if (!sm_supported(iommu) && 645 !ecap_sc_support(iommu->ecap)) { 646 ret = false; 647 break; 648 } 649 } 650 } 651 rcu_read_unlock(); 652 653 return ret; 654 } 655 656 static int domain_update_iommu_superpage(struct dmar_domain *domain, 657 struct intel_iommu *skip) 658 { 659 struct dmar_drhd_unit *drhd; 660 struct intel_iommu *iommu; 661 int mask = 0x3; 662 663 if (!intel_iommu_superpage) 664 return 0; 665 666 /* set iommu_superpage to the smallest common denominator */ 667 rcu_read_lock(); 668 for_each_active_iommu(iommu, drhd) { 669 if (iommu != skip) { 670 if (domain && domain_use_first_level(domain)) { 671 if (!cap_fl1gp_support(iommu->cap)) 672 mask = 0x1; 673 } else { 674 mask &= cap_super_page_val(iommu->cap); 675 } 676 677 if (!mask) 678 break; 679 } 680 } 681 rcu_read_unlock(); 682 683 return fls(mask); 684 } 685 686 static int domain_update_device_node(struct dmar_domain *domain) 687 { 688 struct device_domain_info *info; 689 int nid = NUMA_NO_NODE; 690 691 assert_spin_locked(&device_domain_lock); 692 693 if (list_empty(&domain->devices)) 694 return NUMA_NO_NODE; 695 696 list_for_each_entry(info, &domain->devices, link) { 697 if (!info->dev) 698 continue; 699 700 /* 701 * There could possibly be multiple device numa nodes as devices 702 * within the same domain may sit behind different IOMMUs. There 703 * isn't perfect answer in such situation, so we select first 704 * come first served policy. 705 */ 706 nid = dev_to_node(info->dev); 707 if (nid != NUMA_NO_NODE) 708 break; 709 } 710 711 return nid; 712 } 713 714 static void domain_update_iotlb(struct dmar_domain *domain); 715 716 /* Return the super pagesize bitmap if supported. */ 717 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 718 { 719 unsigned long bitmap = 0; 720 721 /* 722 * 1-level super page supports page size of 2MiB, 2-level super page 723 * supports page size of both 2MiB and 1GiB. 
724 */ 725 if (domain->iommu_superpage == 1) 726 bitmap |= SZ_2M; 727 else if (domain->iommu_superpage == 2) 728 bitmap |= SZ_2M | SZ_1G; 729 730 return bitmap; 731 } 732 733 /* Some capabilities may be different across iommus */ 734 static void domain_update_iommu_cap(struct dmar_domain *domain) 735 { 736 domain_update_iommu_coherency(domain); 737 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 738 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 739 740 /* 741 * If RHSA is missing, we should default to the device numa domain 742 * as fall back. 743 */ 744 if (domain->nid == NUMA_NO_NODE) 745 domain->nid = domain_update_device_node(domain); 746 747 /* 748 * First-level translation restricts the input-address to a 749 * canonical address (i.e., address bits 63:N have the same 750 * value as address bit [N-1], where N is 48-bits with 4-level 751 * paging and 57-bits with 5-level paging). Hence, skip bit 752 * [N-1]. 753 */ 754 if (domain_use_first_level(domain)) 755 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 756 else 757 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 758 759 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 760 domain_update_iotlb(domain); 761 } 762 763 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 764 u8 devfn, int alloc) 765 { 766 struct root_entry *root = &iommu->root_entry[bus]; 767 struct context_entry *context; 768 u64 *entry; 769 770 entry = &root->lo; 771 if (sm_supported(iommu)) { 772 if (devfn >= 0x80) { 773 devfn -= 0x80; 774 entry = &root->hi; 775 } 776 devfn *= 2; 777 } 778 if (*entry & 1) 779 context = phys_to_virt(*entry & VTD_PAGE_MASK); 780 else { 781 unsigned long phy_addr; 782 if (!alloc) 783 return NULL; 784 785 context = alloc_pgtable_page(iommu->node); 786 if (!context) 787 return NULL; 788 789 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 790 phy_addr = virt_to_phys((void *)context); 791 *entry = phy_addr | 1; 792 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 793 } 794 return &context[devfn]; 795 } 796 797 static bool attach_deferred(struct device *dev) 798 { 799 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 800 } 801 802 /** 803 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 804 * sub-hierarchy of a candidate PCI-PCI bridge 805 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 806 * @bridge: the candidate PCI-PCI bridge 807 * 808 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 809 */ 810 static bool 811 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 812 { 813 struct pci_dev *pdev, *pbridge; 814 815 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 816 return false; 817 818 pdev = to_pci_dev(dev); 819 pbridge = to_pci_dev(bridge); 820 821 if (pbridge->subordinate && 822 pbridge->subordinate->number <= pdev->bus->number && 823 pbridge->subordinate->busn_res.end >= pdev->bus->number) 824 return true; 825 826 return false; 827 } 828 829 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 830 { 831 struct dmar_drhd_unit *drhd; 832 u32 vtbar; 833 int rc; 834 835 /* We know that this device on this chipset has its own IOMMU. 836 * If we find it under a different IOMMU, then the BIOS is lying 837 * to us. Hope that the IOMMU for this device is actually 838 * disabled, and it needs no translation... 
839 */ 840 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 841 if (rc) { 842 /* "can't" happen */ 843 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 844 return false; 845 } 846 vtbar &= 0xffff0000; 847 848 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 849 drhd = dmar_find_matched_drhd_unit(pdev); 850 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 851 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 852 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 853 return true; 854 } 855 856 return false; 857 } 858 859 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 860 { 861 if (!iommu || iommu->drhd->ignored) 862 return true; 863 864 if (dev_is_pci(dev)) { 865 struct pci_dev *pdev = to_pci_dev(dev); 866 867 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 868 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 869 quirk_ioat_snb_local_iommu(pdev)) 870 return true; 871 } 872 873 return false; 874 } 875 876 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 877 { 878 struct dmar_drhd_unit *drhd = NULL; 879 struct pci_dev *pdev = NULL; 880 struct intel_iommu *iommu; 881 struct device *tmp; 882 u16 segment = 0; 883 int i; 884 885 if (!dev) 886 return NULL; 887 888 if (dev_is_pci(dev)) { 889 struct pci_dev *pf_pdev; 890 891 pdev = pci_real_dma_dev(to_pci_dev(dev)); 892 893 /* VFs aren't listed in scope tables; we need to look up 894 * the PF instead to find the IOMMU. */ 895 pf_pdev = pci_physfn(pdev); 896 dev = &pf_pdev->dev; 897 segment = pci_domain_nr(pdev->bus); 898 } else if (has_acpi_companion(dev)) 899 dev = &ACPI_COMPANION(dev)->dev; 900 901 rcu_read_lock(); 902 for_each_iommu(iommu, drhd) { 903 if (pdev && segment != drhd->segment) 904 continue; 905 906 for_each_active_dev_scope(drhd->devices, 907 drhd->devices_cnt, i, tmp) { 908 if (tmp == dev) { 909 /* For a VF use its original BDF# not that of the PF 910 * which we used for the IOMMU lookup. Strictly speaking 911 * we could do this for all PCI devices; we only need to 912 * get the BDF# from the scope table for ACPI matches. 
*/ 913 if (pdev && pdev->is_virtfn) 914 goto got_pdev; 915 916 if (bus && devfn) { 917 *bus = drhd->devices[i].bus; 918 *devfn = drhd->devices[i].devfn; 919 } 920 goto out; 921 } 922 923 if (is_downstream_to_pci_bridge(dev, tmp)) 924 goto got_pdev; 925 } 926 927 if (pdev && drhd->include_all) { 928 got_pdev: 929 if (bus && devfn) { 930 *bus = pdev->bus->number; 931 *devfn = pdev->devfn; 932 } 933 goto out; 934 } 935 } 936 iommu = NULL; 937 out: 938 if (iommu_is_dummy(iommu, dev)) 939 iommu = NULL; 940 941 rcu_read_unlock(); 942 943 return iommu; 944 } 945 946 static void domain_flush_cache(struct dmar_domain *domain, 947 void *addr, int size) 948 { 949 if (!domain->iommu_coherency) 950 clflush_cache_range(addr, size); 951 } 952 953 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 954 { 955 struct context_entry *context; 956 int ret = 0; 957 unsigned long flags; 958 959 spin_lock_irqsave(&iommu->lock, flags); 960 context = iommu_context_addr(iommu, bus, devfn, 0); 961 if (context) 962 ret = context_present(context); 963 spin_unlock_irqrestore(&iommu->lock, flags); 964 return ret; 965 } 966 967 static void free_context_table(struct intel_iommu *iommu) 968 { 969 int i; 970 unsigned long flags; 971 struct context_entry *context; 972 973 spin_lock_irqsave(&iommu->lock, flags); 974 if (!iommu->root_entry) { 975 goto out; 976 } 977 for (i = 0; i < ROOT_ENTRY_NR; i++) { 978 context = iommu_context_addr(iommu, i, 0, 0); 979 if (context) 980 free_pgtable_page(context); 981 982 if (!sm_supported(iommu)) 983 continue; 984 985 context = iommu_context_addr(iommu, i, 0x80, 0); 986 if (context) 987 free_pgtable_page(context); 988 989 } 990 free_pgtable_page(iommu->root_entry); 991 iommu->root_entry = NULL; 992 out: 993 spin_unlock_irqrestore(&iommu->lock, flags); 994 } 995 996 #ifdef CONFIG_DMAR_DEBUG 997 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 998 { 999 struct device_domain_info *info; 1000 struct dma_pte *parent, *pte; 1001 struct dmar_domain *domain; 1002 int offset, level; 1003 1004 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); 1005 if (!info || !info->domain) { 1006 pr_info("device [%02x:%02x.%d] not probed\n", 1007 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1008 return; 1009 } 1010 1011 domain = info->domain; 1012 level = agaw_to_level(domain->agaw); 1013 parent = domain->pgd; 1014 if (!parent) { 1015 pr_info("no page table setup\n"); 1016 return; 1017 } 1018 1019 while (1) { 1020 offset = pfn_level_offset(pfn, level); 1021 pte = &parent[offset]; 1022 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 1023 pr_info("PTE not present at level %d\n", level); 1024 break; 1025 } 1026 1027 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 1028 1029 if (level == 1) 1030 break; 1031 1032 parent = phys_to_virt(dma_pte_addr(pte)); 1033 level--; 1034 } 1035 } 1036 1037 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 1038 unsigned long long addr, u32 pasid) 1039 { 1040 struct pasid_dir_entry *dir, *pde; 1041 struct pasid_entry *entries, *pte; 1042 struct context_entry *ctx_entry; 1043 struct root_entry *rt_entry; 1044 u8 devfn = source_id & 0xff; 1045 u8 bus = source_id >> 8; 1046 int i, dir_index, index; 1047 1048 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 1049 1050 /* root entry dump */ 1051 rt_entry = &iommu->root_entry[bus]; 1052 if (!rt_entry) { 1053 pr_info("root table entry is not present\n"); 1054 return; 1055 } 1056 1057 if 
(sm_supported(iommu)) 1058 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 1059 rt_entry->hi, rt_entry->lo); 1060 else 1061 pr_info("root entry: 0x%016llx", rt_entry->lo); 1062 1063 /* context entry dump */ 1064 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 1065 if (!ctx_entry) { 1066 pr_info("context table entry is not present\n"); 1067 return; 1068 } 1069 1070 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 1071 ctx_entry->hi, ctx_entry->lo); 1072 1073 /* legacy mode does not require PASID entries */ 1074 if (!sm_supported(iommu)) 1075 goto pgtable_walk; 1076 1077 /* get the pointer to pasid directory entry */ 1078 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 1079 if (!dir) { 1080 pr_info("pasid directory entry is not present\n"); 1081 return; 1082 } 1083 /* For request-without-pasid, get the pasid from context entry */ 1084 if (intel_iommu_sm && pasid == INVALID_IOASID) 1085 pasid = PASID_RID2PASID; 1086 1087 dir_index = pasid >> PASID_PDE_SHIFT; 1088 pde = &dir[dir_index]; 1089 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 1090 1091 /* get the pointer to the pasid table entry */ 1092 entries = get_pasid_table_from_pde(pde); 1093 if (!entries) { 1094 pr_info("pasid table entry is not present\n"); 1095 return; 1096 } 1097 index = pasid & PASID_PTE_MASK; 1098 pte = &entries[index]; 1099 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 1100 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 1101 1102 pgtable_walk: 1103 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 1104 } 1105 #endif 1106 1107 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 1108 unsigned long pfn, int *target_level) 1109 { 1110 struct dma_pte *parent, *pte; 1111 int level = agaw_to_level(domain->agaw); 1112 int offset; 1113 1114 BUG_ON(!domain->pgd); 1115 1116 if (!domain_pfn_supported(domain, pfn)) 1117 /* Address beyond IOMMU's addressing capabilities. */ 1118 return NULL; 1119 1120 parent = domain->pgd; 1121 1122 while (1) { 1123 void *tmp_page; 1124 1125 offset = pfn_level_offset(pfn, level); 1126 pte = &parent[offset]; 1127 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1128 break; 1129 if (level == *target_level) 1130 break; 1131 1132 if (!dma_pte_present(pte)) { 1133 uint64_t pteval; 1134 1135 tmp_page = alloc_pgtable_page(domain->nid); 1136 1137 if (!tmp_page) 1138 return NULL; 1139 1140 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1141 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1142 if (domain_use_first_level(domain)) { 1143 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1144 if (iommu_is_dma_domain(&domain->domain)) 1145 pteval |= DMA_FL_PTE_ACCESS; 1146 } 1147 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1148 /* Someone else set it while we were thinking; use theirs. 
*/ 1149 free_pgtable_page(tmp_page); 1150 else 1151 domain_flush_cache(domain, pte, sizeof(*pte)); 1152 } 1153 if (level == 1) 1154 break; 1155 1156 parent = phys_to_virt(dma_pte_addr(pte)); 1157 level--; 1158 } 1159 1160 if (!*target_level) 1161 *target_level = level; 1162 1163 return pte; 1164 } 1165 1166 /* return address's pte at specific level */ 1167 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1168 unsigned long pfn, 1169 int level, int *large_page) 1170 { 1171 struct dma_pte *parent, *pte; 1172 int total = agaw_to_level(domain->agaw); 1173 int offset; 1174 1175 parent = domain->pgd; 1176 while (level <= total) { 1177 offset = pfn_level_offset(pfn, total); 1178 pte = &parent[offset]; 1179 if (level == total) 1180 return pte; 1181 1182 if (!dma_pte_present(pte)) { 1183 *large_page = total; 1184 break; 1185 } 1186 1187 if (dma_pte_superpage(pte)) { 1188 *large_page = total; 1189 return pte; 1190 } 1191 1192 parent = phys_to_virt(dma_pte_addr(pte)); 1193 total--; 1194 } 1195 return NULL; 1196 } 1197 1198 /* clear last level pte, a tlb flush should be followed */ 1199 static void dma_pte_clear_range(struct dmar_domain *domain, 1200 unsigned long start_pfn, 1201 unsigned long last_pfn) 1202 { 1203 unsigned int large_page; 1204 struct dma_pte *first_pte, *pte; 1205 1206 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1207 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1208 BUG_ON(start_pfn > last_pfn); 1209 1210 /* we don't need lock here; nobody else touches the iova range */ 1211 do { 1212 large_page = 1; 1213 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1214 if (!pte) { 1215 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1216 continue; 1217 } 1218 do { 1219 dma_clear_pte(pte); 1220 start_pfn += lvl_to_nr_pages(large_page); 1221 pte++; 1222 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1223 1224 domain_flush_cache(domain, first_pte, 1225 (void *)pte - (void *)first_pte); 1226 1227 } while (start_pfn && start_pfn <= last_pfn); 1228 } 1229 1230 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1231 int retain_level, struct dma_pte *pte, 1232 unsigned long pfn, unsigned long start_pfn, 1233 unsigned long last_pfn) 1234 { 1235 pfn = max(start_pfn, pfn); 1236 pte = &pte[pfn_level_offset(pfn, level)]; 1237 1238 do { 1239 unsigned long level_pfn; 1240 struct dma_pte *level_pte; 1241 1242 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1243 goto next; 1244 1245 level_pfn = pfn & level_mask(level); 1246 level_pte = phys_to_virt(dma_pte_addr(pte)); 1247 1248 if (level > 2) { 1249 dma_pte_free_level(domain, level - 1, retain_level, 1250 level_pte, level_pfn, start_pfn, 1251 last_pfn); 1252 } 1253 1254 /* 1255 * Free the page table if we're below the level we want to 1256 * retain and the range covers the entire table. 1257 */ 1258 if (level < retain_level && !(start_pfn > level_pfn || 1259 last_pfn < level_pfn + level_size(level) - 1)) { 1260 dma_clear_pte(pte); 1261 domain_flush_cache(domain, pte, sizeof(*pte)); 1262 free_pgtable_page(level_pte); 1263 } 1264 next: 1265 pfn += level_size(level); 1266 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1267 } 1268 1269 /* 1270 * clear last level (leaf) ptes and free page table pages below the 1271 * level we wish to keep intact. 
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	list_add_tail(&pg->lru, freelist);

	if (level == 1)
		return;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));
}

static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
			    domain->pgd, 0, start_pfn, last_pfn, freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		list_add_tail(&pgd_page->lru, freelist);
		domain->pgd = NULL;
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val &
DMA_CCMD_ICC)), val); 1480 1481 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1482 } 1483 1484 /* return value determine if we need a write buffer flush */ 1485 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1486 u64 addr, unsigned int size_order, u64 type) 1487 { 1488 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1489 u64 val = 0, val_iva = 0; 1490 unsigned long flag; 1491 1492 switch (type) { 1493 case DMA_TLB_GLOBAL_FLUSH: 1494 /* global flush doesn't need set IVA_REG */ 1495 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1496 break; 1497 case DMA_TLB_DSI_FLUSH: 1498 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1499 break; 1500 case DMA_TLB_PSI_FLUSH: 1501 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1502 /* IH bit is passed in as part of address */ 1503 val_iva = size_order | addr; 1504 break; 1505 default: 1506 BUG(); 1507 } 1508 /* Note: set drain read/write */ 1509 #if 0 1510 /* 1511 * This is probably to be super secure.. Looks like we can 1512 * ignore it without any impact. 1513 */ 1514 if (cap_read_drain(iommu->cap)) 1515 val |= DMA_TLB_READ_DRAIN; 1516 #endif 1517 if (cap_write_drain(iommu->cap)) 1518 val |= DMA_TLB_WRITE_DRAIN; 1519 1520 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1521 /* Note: Only uses first TLB reg currently */ 1522 if (val_iva) 1523 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1524 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1525 1526 /* Make sure hardware complete it */ 1527 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1528 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1529 1530 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1531 1532 /* check IOTLB invalidation granularity */ 1533 if (DMA_TLB_IAIG(val) == 0) 1534 pr_err("Flush IOTLB failed\n"); 1535 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1536 pr_debug("TLB flush request %Lx, actual %Lx\n", 1537 (unsigned long long)DMA_TLB_IIRG(type), 1538 (unsigned long long)DMA_TLB_IAIG(val)); 1539 } 1540 1541 static struct device_domain_info * 1542 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1543 u8 bus, u8 devfn) 1544 { 1545 struct device_domain_info *info; 1546 1547 assert_spin_locked(&device_domain_lock); 1548 1549 if (!iommu->qi) 1550 return NULL; 1551 1552 list_for_each_entry(info, &domain->devices, link) 1553 if (info->iommu == iommu && info->bus == bus && 1554 info->devfn == devfn) { 1555 if (info->ats_supported && info->dev) 1556 return info; 1557 break; 1558 } 1559 1560 return NULL; 1561 } 1562 1563 static void domain_update_iotlb(struct dmar_domain *domain) 1564 { 1565 struct device_domain_info *info; 1566 bool has_iotlb_device = false; 1567 1568 assert_spin_locked(&device_domain_lock); 1569 1570 list_for_each_entry(info, &domain->devices, link) 1571 if (info->ats_enabled) { 1572 has_iotlb_device = true; 1573 break; 1574 } 1575 1576 if (!has_iotlb_device) { 1577 struct subdev_domain_info *sinfo; 1578 1579 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1580 info = get_domain_info(sinfo->pdev); 1581 if (info && info->ats_enabled) { 1582 has_iotlb_device = true; 1583 break; 1584 } 1585 } 1586 } 1587 1588 domain->has_iotlb_device = has_iotlb_device; 1589 } 1590 1591 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1592 { 1593 struct pci_dev *pdev; 1594 1595 assert_spin_locked(&device_domain_lock); 1596 1597 if (!info || !dev_is_pci(info->dev)) 1598 return; 1599 1600 pdev = to_pci_dev(info->dev); 1601 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1602 * PFSID 
to the invalidation desc of a VF such that IOMMU HW can gauge 1603 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1604 * reserved, which should be set to 0. 1605 */ 1606 if (!ecap_dit(info->iommu->ecap)) 1607 info->pfsid = 0; 1608 else { 1609 struct pci_dev *pf_pdev; 1610 1611 /* pdev will be returned if device is not a vf */ 1612 pf_pdev = pci_physfn(pdev); 1613 info->pfsid = pci_dev_id(pf_pdev); 1614 } 1615 1616 #ifdef CONFIG_INTEL_IOMMU_SVM 1617 /* The PCIe spec, in its wisdom, declares that the behaviour of 1618 the device if you enable PASID support after ATS support is 1619 undefined. So always enable PASID support on devices which 1620 have it, even if we can't yet know if we're ever going to 1621 use it. */ 1622 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1623 info->pasid_enabled = 1; 1624 1625 if (info->pri_supported && 1626 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1627 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1628 info->pri_enabled = 1; 1629 #endif 1630 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1631 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1632 info->ats_enabled = 1; 1633 domain_update_iotlb(info->domain); 1634 info->ats_qdep = pci_ats_queue_depth(pdev); 1635 } 1636 } 1637 1638 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1639 { 1640 struct pci_dev *pdev; 1641 1642 assert_spin_locked(&device_domain_lock); 1643 1644 if (!dev_is_pci(info->dev)) 1645 return; 1646 1647 pdev = to_pci_dev(info->dev); 1648 1649 if (info->ats_enabled) { 1650 pci_disable_ats(pdev); 1651 info->ats_enabled = 0; 1652 domain_update_iotlb(info->domain); 1653 } 1654 #ifdef CONFIG_INTEL_IOMMU_SVM 1655 if (info->pri_enabled) { 1656 pci_disable_pri(pdev); 1657 info->pri_enabled = 0; 1658 } 1659 if (info->pasid_enabled) { 1660 pci_disable_pasid(pdev); 1661 info->pasid_enabled = 0; 1662 } 1663 #endif 1664 } 1665 1666 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1667 u64 addr, unsigned int mask) 1668 { 1669 u16 sid, qdep; 1670 1671 if (!info || !info->ats_enabled) 1672 return; 1673 1674 sid = info->bus << 8 | info->devfn; 1675 qdep = info->ats_qdep; 1676 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1677 qdep, addr, mask); 1678 } 1679 1680 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1681 u64 addr, unsigned mask) 1682 { 1683 unsigned long flags; 1684 struct device_domain_info *info; 1685 struct subdev_domain_info *sinfo; 1686 1687 if (!domain->has_iotlb_device) 1688 return; 1689 1690 spin_lock_irqsave(&device_domain_lock, flags); 1691 list_for_each_entry(info, &domain->devices, link) 1692 __iommu_flush_dev_iotlb(info, addr, mask); 1693 1694 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1695 info = get_domain_info(sinfo->pdev); 1696 __iommu_flush_dev_iotlb(info, addr, mask); 1697 } 1698 spin_unlock_irqrestore(&device_domain_lock, flags); 1699 } 1700 1701 static void domain_flush_piotlb(struct intel_iommu *iommu, 1702 struct dmar_domain *domain, 1703 u64 addr, unsigned long npages, bool ih) 1704 { 1705 u16 did = domain->iommu_did[iommu->seq_id]; 1706 1707 if (domain->default_pasid) 1708 qi_flush_piotlb(iommu, did, domain->default_pasid, 1709 addr, npages, ih); 1710 1711 if (!list_empty(&domain->devices)) 1712 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1713 } 1714 1715 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1716 struct dmar_domain *domain, 1717 unsigned long pfn, unsigned int 
pages, 1718 int ih, int map) 1719 { 1720 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1721 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1722 u16 did = domain->iommu_did[iommu->seq_id]; 1723 1724 BUG_ON(pages == 0); 1725 1726 if (ih) 1727 ih = 1 << 6; 1728 1729 if (domain_use_first_level(domain)) { 1730 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1731 } else { 1732 /* 1733 * Fallback to domain selective flush if no PSI support or 1734 * the size is too big. PSI requires page size to be 2 ^ x, 1735 * and the base address is naturally aligned to the size. 1736 */ 1737 if (!cap_pgsel_inv(iommu->cap) || 1738 mask > cap_max_amask_val(iommu->cap)) 1739 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1740 DMA_TLB_DSI_FLUSH); 1741 else 1742 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1743 DMA_TLB_PSI_FLUSH); 1744 } 1745 1746 /* 1747 * In caching mode, changes of pages from non-present to present require 1748 * flush. However, device IOTLB doesn't need to be flushed in this case. 1749 */ 1750 if (!cap_caching_mode(iommu->cap) || !map) 1751 iommu_flush_dev_iotlb(domain, addr, mask); 1752 } 1753 1754 /* Notification for newly created mappings */ 1755 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1756 struct dmar_domain *domain, 1757 unsigned long pfn, unsigned int pages) 1758 { 1759 /* 1760 * It's a non-present to present mapping. Only flush if caching mode 1761 * and second level. 1762 */ 1763 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1764 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1765 else 1766 iommu_flush_write_buffer(iommu); 1767 } 1768 1769 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1770 { 1771 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1772 int idx; 1773 1774 for_each_domain_iommu(idx, dmar_domain) { 1775 struct intel_iommu *iommu = g_iommus[idx]; 1776 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1777 1778 if (domain_use_first_level(dmar_domain)) 1779 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); 1780 else 1781 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1782 DMA_TLB_DSI_FLUSH); 1783 1784 if (!cap_caching_mode(iommu->cap)) 1785 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1786 0, MAX_AGAW_PFN_WIDTH); 1787 } 1788 } 1789 1790 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1791 { 1792 u32 pmen; 1793 unsigned long flags; 1794 1795 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1796 return; 1797 1798 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1799 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1800 pmen &= ~DMA_PMEN_EPM; 1801 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1802 1803 /* wait for the protected region status bit to clear */ 1804 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1805 readl, !(pmen & DMA_PMEN_PRS), pmen); 1806 1807 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1808 } 1809 1810 static void iommu_enable_translation(struct intel_iommu *iommu) 1811 { 1812 u32 sts; 1813 unsigned long flags; 1814 1815 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1816 iommu->gcmd |= DMA_GCMD_TE; 1817 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1818 1819 /* Make sure hardware complete it */ 1820 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1821 readl, (sts & DMA_GSTS_TES), sts); 1822 1823 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1824 } 1825 1826 static void iommu_disable_translation(struct intel_iommu *iommu) 1827 { 1828 u32 sts; 1829 unsigned long flag; 1830 1831 if (iommu_skip_te_disable && 
iommu->drhd->gfx_dedicated && 1832 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1833 return; 1834 1835 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1836 iommu->gcmd &= ~DMA_GCMD_TE; 1837 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1838 1839 /* Make sure hardware complete it */ 1840 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1841 readl, (!(sts & DMA_GSTS_TES)), sts); 1842 1843 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1844 } 1845 1846 static int iommu_init_domains(struct intel_iommu *iommu) 1847 { 1848 u32 ndomains; 1849 size_t size; 1850 1851 ndomains = cap_ndoms(iommu->cap); 1852 pr_debug("%s: Number of Domains supported <%d>\n", 1853 iommu->name, ndomains); 1854 1855 spin_lock_init(&iommu->lock); 1856 1857 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); 1858 if (!iommu->domain_ids) 1859 return -ENOMEM; 1860 1861 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1862 iommu->domains = kzalloc(size, GFP_KERNEL); 1863 1864 if (iommu->domains) { 1865 size = 256 * sizeof(struct dmar_domain *); 1866 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1867 } 1868 1869 if (!iommu->domains || !iommu->domains[0]) { 1870 pr_err("%s: Allocating domain array failed\n", 1871 iommu->name); 1872 bitmap_free(iommu->domain_ids); 1873 kfree(iommu->domains); 1874 iommu->domain_ids = NULL; 1875 iommu->domains = NULL; 1876 return -ENOMEM; 1877 } 1878 1879 /* 1880 * If Caching mode is set, then invalid translations are tagged 1881 * with domain-id 0, hence we need to pre-allocate it. We also 1882 * use domain-id 0 as a marker for non-allocated domain-id, so 1883 * make sure it is not used for a real domain. 1884 */ 1885 set_bit(0, iommu->domain_ids); 1886 1887 /* 1888 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1889 * entry for first-level or pass-through translation modes should 1890 * be programmed with a domain id different from those used for 1891 * second-level or nested translation. We reserve a domain id for 1892 * this purpose. 
1893 */ 1894 if (sm_supported(iommu)) 1895 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1896 1897 return 0; 1898 } 1899 1900 static void disable_dmar_iommu(struct intel_iommu *iommu) 1901 { 1902 struct device_domain_info *info, *tmp; 1903 unsigned long flags; 1904 1905 if (!iommu->domains || !iommu->domain_ids) 1906 return; 1907 1908 spin_lock_irqsave(&device_domain_lock, flags); 1909 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1910 if (info->iommu != iommu) 1911 continue; 1912 1913 if (!info->dev || !info->domain) 1914 continue; 1915 1916 __dmar_remove_one_dev_info(info); 1917 } 1918 spin_unlock_irqrestore(&device_domain_lock, flags); 1919 1920 if (iommu->gcmd & DMA_GCMD_TE) 1921 iommu_disable_translation(iommu); 1922 } 1923 1924 static void free_dmar_iommu(struct intel_iommu *iommu) 1925 { 1926 if ((iommu->domains) && (iommu->domain_ids)) { 1927 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1928 int i; 1929 1930 for (i = 0; i < elems; i++) 1931 kfree(iommu->domains[i]); 1932 kfree(iommu->domains); 1933 bitmap_free(iommu->domain_ids); 1934 iommu->domains = NULL; 1935 iommu->domain_ids = NULL; 1936 } 1937 1938 g_iommus[iommu->seq_id] = NULL; 1939 1940 /* free context mapping */ 1941 free_context_table(iommu); 1942 1943 #ifdef CONFIG_INTEL_IOMMU_SVM 1944 if (pasid_supported(iommu)) { 1945 if (ecap_prs(iommu->ecap)) 1946 intel_svm_finish_prq(iommu); 1947 } 1948 if (vccap_pasid(iommu->vccap)) 1949 ioasid_unregister_allocator(&iommu->pasid_allocator); 1950 1951 #endif 1952 } 1953 1954 /* 1955 * Check and return whether first level is used by default for 1956 * DMA translation. 1957 */ 1958 static bool first_level_by_default(unsigned int type) 1959 { 1960 /* Only SL is available in legacy mode */ 1961 if (!scalable_mode_support()) 1962 return false; 1963 1964 /* Only level (either FL or SL) is available, just use it */ 1965 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 1966 return intel_cap_flts_sanity(); 1967 1968 /* Both levels are available, decide it based on domain type */ 1969 return type != IOMMU_DOMAIN_UNMANAGED; 1970 } 1971 1972 static struct dmar_domain *alloc_domain(unsigned int type) 1973 { 1974 struct dmar_domain *domain; 1975 1976 domain = alloc_domain_mem(); 1977 if (!domain) 1978 return NULL; 1979 1980 memset(domain, 0, sizeof(*domain)); 1981 domain->nid = NUMA_NO_NODE; 1982 if (first_level_by_default(type)) 1983 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1984 domain->has_iotlb_device = false; 1985 INIT_LIST_HEAD(&domain->devices); 1986 INIT_LIST_HEAD(&domain->subdevices); 1987 1988 return domain; 1989 } 1990 1991 /* Must be called with iommu->lock */ 1992 static int domain_attach_iommu(struct dmar_domain *domain, 1993 struct intel_iommu *iommu) 1994 { 1995 unsigned long ndomains; 1996 int num; 1997 1998 assert_spin_locked(&device_domain_lock); 1999 assert_spin_locked(&iommu->lock); 2000 2001 domain->iommu_refcnt[iommu->seq_id] += 1; 2002 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 2003 ndomains = cap_ndoms(iommu->cap); 2004 num = find_first_zero_bit(iommu->domain_ids, ndomains); 2005 2006 if (num >= ndomains) { 2007 pr_err("%s: No free domain ids\n", iommu->name); 2008 domain->iommu_refcnt[iommu->seq_id] -= 1; 2009 return -ENOSPC; 2010 } 2011 2012 set_bit(num, iommu->domain_ids); 2013 set_iommu_domain(iommu, num, domain); 2014 2015 domain->iommu_did[iommu->seq_id] = num; 2016 domain->nid = iommu->node; 2017 2018 domain_update_iommu_cap(domain); 2019 } 2020 2021 return 0; 2022 } 2023 2024 static void domain_detach_iommu(struct 
dmar_domain *domain, 2025 struct intel_iommu *iommu) 2026 { 2027 int num; 2028 2029 assert_spin_locked(&device_domain_lock); 2030 assert_spin_locked(&iommu->lock); 2031 2032 domain->iommu_refcnt[iommu->seq_id] -= 1; 2033 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 2034 num = domain->iommu_did[iommu->seq_id]; 2035 clear_bit(num, iommu->domain_ids); 2036 set_iommu_domain(iommu, num, NULL); 2037 2038 domain_update_iommu_cap(domain); 2039 domain->iommu_did[iommu->seq_id] = 0; 2040 } 2041 } 2042 2043 static inline int guestwidth_to_adjustwidth(int gaw) 2044 { 2045 int agaw; 2046 int r = (gaw - 12) % 9; 2047 2048 if (r == 0) 2049 agaw = gaw; 2050 else 2051 agaw = gaw + 9 - r; 2052 if (agaw > 64) 2053 agaw = 64; 2054 return agaw; 2055 } 2056 2057 static void domain_exit(struct dmar_domain *domain) 2058 { 2059 2060 /* Remove associated devices and clear attached or cached domains */ 2061 domain_remove_dev_info(domain); 2062 2063 if (domain->pgd) { 2064 LIST_HEAD(freelist); 2065 2066 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); 2067 put_pages_list(&freelist); 2068 } 2069 2070 free_domain_mem(domain); 2071 } 2072 2073 /* 2074 * Get the PASID directory size for scalable mode context entry. 2075 * Value of X in the PDTS field of a scalable mode context entry 2076 * indicates PASID directory with 2^(X + 7) entries. 2077 */ 2078 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 2079 { 2080 unsigned long pds, max_pde; 2081 2082 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2083 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); 2084 if (pds < 7) 2085 return 0; 2086 2087 return pds - 7; 2088 } 2089 2090 /* 2091 * Set the RID_PASID field of a scalable mode context entry. The 2092 * IOMMU hardware will use the PASID value set in this field for 2093 * DMA translations of DMA requests without PASID. 2094 */ 2095 static inline void 2096 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2097 { 2098 context->hi |= pasid & ((1 << 20) - 1); 2099 } 2100 2101 /* 2102 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2103 * entry. 2104 */ 2105 static inline void context_set_sm_dte(struct context_entry *context) 2106 { 2107 context->lo |= (1 << 2); 2108 } 2109 2110 /* 2111 * Set the PRE(Page Request Enable) field of a scalable mode context 2112 * entry. 2113 */ 2114 static inline void context_set_sm_pre(struct context_entry *context) 2115 { 2116 context->lo |= (1 << 4); 2117 } 2118 2119 /* Convert value to context PASID directory size field coding. 
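 *
 * Editor's worked example (illustrative; assumes PASID_PDE_SHIFT == 6, i.e.
 * 64 PASID-table entries per directory entry, and MAX_NR_PASID_BITS == 20):
 *
 *   table->max_pasid = 0x10000 (64K PASIDs)
 *   max_pde = 0x10000 >> 6 = 0x400
 *   find_first_bit(&max_pde, 20) = 10, so context_get_sm_pds() returns 3
 *   context_pdts(3) then places 3 in the PDTS field (the "<< 9" below),
 *   meaning a directory of 2^(3 + 7) = 1024 entries; 1024 entries times
 *   64 PASIDs each covers exactly the 64K PASIDs requested.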
*/ 2120 #define context_pdts(pds) (((pds) & 0x7) << 9) 2121 2122 static int domain_context_mapping_one(struct dmar_domain *domain, 2123 struct intel_iommu *iommu, 2124 struct pasid_table *table, 2125 u8 bus, u8 devfn) 2126 { 2127 u16 did = domain->iommu_did[iommu->seq_id]; 2128 int translation = CONTEXT_TT_MULTI_LEVEL; 2129 struct device_domain_info *info = NULL; 2130 struct context_entry *context; 2131 unsigned long flags; 2132 int ret; 2133 2134 WARN_ON(did == 0); 2135 2136 if (hw_pass_through && domain_type_is_si(domain)) 2137 translation = CONTEXT_TT_PASS_THROUGH; 2138 2139 pr_debug("Set context mapping for %02x:%02x.%d\n", 2140 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2141 2142 BUG_ON(!domain->pgd); 2143 2144 spin_lock_irqsave(&device_domain_lock, flags); 2145 spin_lock(&iommu->lock); 2146 2147 ret = -ENOMEM; 2148 context = iommu_context_addr(iommu, bus, devfn, 1); 2149 if (!context) 2150 goto out_unlock; 2151 2152 ret = 0; 2153 if (context_present(context)) 2154 goto out_unlock; 2155 2156 /* 2157 * For kdump cases, old valid entries may be cached due to the 2158 * in-flight DMA and copied pgtable, but there is no unmapping 2159 * behaviour for them, thus we need an explicit cache flush for 2160 * the newly-mapped device. For kdump, at this point, the device 2161 * is supposed to finish reset at its driver probe stage, so no 2162 * in-flight DMA will exist, and we don't need to worry anymore 2163 * hereafter. 2164 */ 2165 if (context_copied(context)) { 2166 u16 did_old = context_domain_id(context); 2167 2168 if (did_old < cap_ndoms(iommu->cap)) { 2169 iommu->flush.flush_context(iommu, did_old, 2170 (((u16)bus) << 8) | devfn, 2171 DMA_CCMD_MASK_NOBIT, 2172 DMA_CCMD_DEVICE_INVL); 2173 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2174 DMA_TLB_DSI_FLUSH); 2175 } 2176 } 2177 2178 context_clear_entry(context); 2179 2180 if (sm_supported(iommu)) { 2181 unsigned long pds; 2182 2183 WARN_ON(!table); 2184 2185 /* Setup the PASID DIR pointer: */ 2186 pds = context_get_sm_pds(table); 2187 context->lo = (u64)virt_to_phys(table->table) | 2188 context_pdts(pds); 2189 2190 /* Setup the RID_PASID field: */ 2191 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2192 2193 /* 2194 * Setup the Device-TLB enable bit and Page request 2195 * Enable bit: 2196 */ 2197 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2198 if (info && info->ats_supported) 2199 context_set_sm_dte(context); 2200 if (info && info->pri_supported) 2201 context_set_sm_pre(context); 2202 } else { 2203 struct dma_pte *pgd = domain->pgd; 2204 int agaw; 2205 2206 context_set_domain_id(context, did); 2207 2208 if (translation != CONTEXT_TT_PASS_THROUGH) { 2209 /* 2210 * Skip top levels of page tables for iommu which has 2211 * less agaw than default. Unnecessary for PT mode. 2212 */ 2213 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2214 ret = -ENOMEM; 2215 pgd = phys_to_virt(dma_pte_addr(pgd)); 2216 if (!dma_pte_present(pgd)) 2217 goto out_unlock; 2218 } 2219 2220 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2221 if (info && info->ats_supported) 2222 translation = CONTEXT_TT_DEV_IOTLB; 2223 else 2224 translation = CONTEXT_TT_MULTI_LEVEL; 2225 2226 context_set_address_root(context, virt_to_phys(pgd)); 2227 context_set_address_width(context, agaw); 2228 } else { 2229 /* 2230 * In pass through mode, AW must be programmed to 2231 * indicate the largest AGAW value supported by 2232 * hardware. And ASR is ignored by hardware. 
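			 *
			 * Editor's aside on the AGAW-skip loop above
			 * (illustrative numbers only): agaw_to_level() maps
			 * agaw 1/2/3 to 3/4/5-level tables (39/48/57-bit).
			 * If the domain was built with a 5-level table
			 * (domain->agaw == 3) but this IOMMU only walks
			 * 4 levels (iommu->agaw == 2), the loop descends one
			 * level and programs ASR with the 4-level table that
			 * hangs off the top-level entry, so the width handed
			 * to context_set_address_width() matches what the
			 * hardware can walk.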
2233 */ 2234 context_set_address_width(context, iommu->msagaw); 2235 } 2236 2237 context_set_translation_type(context, translation); 2238 } 2239 2240 context_set_fault_enable(context); 2241 context_set_present(context); 2242 if (!ecap_coherent(iommu->ecap)) 2243 clflush_cache_range(context, sizeof(*context)); 2244 2245 /* 2246 * It's a non-present to present mapping. If hardware doesn't cache 2247 * non-present entry we only need to flush the write-buffer. If the 2248 * _does_ cache non-present entries, then it does so in the special 2249 * domain #0, which we have to flush: 2250 */ 2251 if (cap_caching_mode(iommu->cap)) { 2252 iommu->flush.flush_context(iommu, 0, 2253 (((u16)bus) << 8) | devfn, 2254 DMA_CCMD_MASK_NOBIT, 2255 DMA_CCMD_DEVICE_INVL); 2256 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2257 } else { 2258 iommu_flush_write_buffer(iommu); 2259 } 2260 iommu_enable_dev_iotlb(info); 2261 2262 ret = 0; 2263 2264 out_unlock: 2265 spin_unlock(&iommu->lock); 2266 spin_unlock_irqrestore(&device_domain_lock, flags); 2267 2268 return ret; 2269 } 2270 2271 struct domain_context_mapping_data { 2272 struct dmar_domain *domain; 2273 struct intel_iommu *iommu; 2274 struct pasid_table *table; 2275 }; 2276 2277 static int domain_context_mapping_cb(struct pci_dev *pdev, 2278 u16 alias, void *opaque) 2279 { 2280 struct domain_context_mapping_data *data = opaque; 2281 2282 return domain_context_mapping_one(data->domain, data->iommu, 2283 data->table, PCI_BUS_NUM(alias), 2284 alias & 0xff); 2285 } 2286 2287 static int 2288 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2289 { 2290 struct domain_context_mapping_data data; 2291 struct pasid_table *table; 2292 struct intel_iommu *iommu; 2293 u8 bus, devfn; 2294 2295 iommu = device_to_iommu(dev, &bus, &devfn); 2296 if (!iommu) 2297 return -ENODEV; 2298 2299 table = intel_pasid_get_table(dev); 2300 2301 if (!dev_is_pci(dev)) 2302 return domain_context_mapping_one(domain, iommu, table, 2303 bus, devfn); 2304 2305 data.domain = domain; 2306 data.iommu = iommu; 2307 data.table = table; 2308 2309 return pci_for_each_dma_alias(to_pci_dev(dev), 2310 &domain_context_mapping_cb, &data); 2311 } 2312 2313 static int domain_context_mapped_cb(struct pci_dev *pdev, 2314 u16 alias, void *opaque) 2315 { 2316 struct intel_iommu *iommu = opaque; 2317 2318 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2319 } 2320 2321 static int domain_context_mapped(struct device *dev) 2322 { 2323 struct intel_iommu *iommu; 2324 u8 bus, devfn; 2325 2326 iommu = device_to_iommu(dev, &bus, &devfn); 2327 if (!iommu) 2328 return -ENODEV; 2329 2330 if (!dev_is_pci(dev)) 2331 return device_context_mapped(iommu, bus, devfn); 2332 2333 return !pci_for_each_dma_alias(to_pci_dev(dev), 2334 domain_context_mapped_cb, iommu); 2335 } 2336 2337 /* Returns a number of VTD pages, but aligned to MM page size */ 2338 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2339 size_t size) 2340 { 2341 host_addr &= ~PAGE_MASK; 2342 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2343 } 2344 2345 /* Return largest possible superpage level for a given mapping */ 2346 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2347 unsigned long iov_pfn, 2348 unsigned long phy_pfn, 2349 unsigned long pages) 2350 { 2351 int support, level = 1; 2352 unsigned long pfnmerge; 2353 2354 support = domain->iommu_superpage; 2355 2356 /* To use a large page, the virtual *and* physical addresses 2357 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2358 of them will mean we have to use smaller pages. So just 2359 merge them and check both at once. */ 2360 pfnmerge = iov_pfn | phy_pfn; 2361 2362 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2363 pages >>= VTD_STRIDE_SHIFT; 2364 if (!pages) 2365 break; 2366 pfnmerge >>= VTD_STRIDE_SHIFT; 2367 level++; 2368 support--; 2369 } 2370 return level; 2371 } 2372 2373 /* 2374 * Ensure that old small page tables are removed to make room for superpage(s). 2375 * We're going to add new large pages, so make sure we don't remove their parent 2376 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2377 */ 2378 static void switch_to_super_page(struct dmar_domain *domain, 2379 unsigned long start_pfn, 2380 unsigned long end_pfn, int level) 2381 { 2382 unsigned long lvl_pages = lvl_to_nr_pages(level); 2383 struct dma_pte *pte = NULL; 2384 int i; 2385 2386 while (start_pfn <= end_pfn) { 2387 if (!pte) 2388 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2389 2390 if (dma_pte_present(pte)) { 2391 dma_pte_free_pagetable(domain, start_pfn, 2392 start_pfn + lvl_pages - 1, 2393 level + 1); 2394 2395 for_each_domain_iommu(i, domain) 2396 iommu_flush_iotlb_psi(g_iommus[i], domain, 2397 start_pfn, lvl_pages, 2398 0, 0); 2399 } 2400 2401 pte++; 2402 start_pfn += lvl_pages; 2403 if (first_pte_in_page(pte)) 2404 pte = NULL; 2405 } 2406 } 2407 2408 static int 2409 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2410 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2411 { 2412 struct dma_pte *first_pte = NULL, *pte = NULL; 2413 unsigned int largepage_lvl = 0; 2414 unsigned long lvl_pages = 0; 2415 phys_addr_t pteval; 2416 u64 attr; 2417 2418 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2419 2420 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2421 return -EINVAL; 2422 2423 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2424 attr |= DMA_FL_PTE_PRESENT; 2425 if (domain_use_first_level(domain)) { 2426 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2427 if (prot & DMA_PTE_WRITE) 2428 attr |= DMA_FL_PTE_DIRTY; 2429 } 2430 2431 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2432 2433 while (nr_pages > 0) { 2434 uint64_t tmp; 2435 2436 if (!pte) { 2437 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2438 phys_pfn, nr_pages); 2439 2440 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2441 if (!pte) 2442 return -ENOMEM; 2443 first_pte = pte; 2444 2445 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2446 2447 /* It is large page*/ 2448 if (largepage_lvl > 1) { 2449 unsigned long end_pfn; 2450 unsigned long pages_to_remove; 2451 2452 pteval |= DMA_PTE_LARGE_PAGE; 2453 pages_to_remove = min_t(unsigned long, nr_pages, 2454 nr_pte_to_next_page(pte) * lvl_pages); 2455 end_pfn = iov_pfn + pages_to_remove - 1; 2456 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2457 } else { 2458 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2459 } 2460 2461 } 2462 /* We don't need lock here, nobody else 2463 * touches the iova range 2464 */ 2465 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2466 if (tmp) { 2467 static int dumps = 5; 2468 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2469 iov_pfn, tmp, (unsigned long long)pteval); 2470 if (dumps) { 2471 dumps--; 2472 debug_dma_dump_mappings(NULL); 2473 } 2474 WARN_ON(1); 2475 } 2476 2477 nr_pages -= lvl_pages; 2478 iov_pfn += lvl_pages; 2479 phys_pfn += lvl_pages; 2480 pteval += lvl_pages * 
VTD_PAGE_SIZE; 2481 2482 /* If the next PTE would be the first in a new page, then we 2483 * need to flush the cache on the entries we've just written. 2484 * And then we'll need to recalculate 'pte', so clear it and 2485 * let it get set again in the if (!pte) block above. 2486 * 2487 * If we're done (!nr_pages) we need to flush the cache too. 2488 * 2489 * Also if we've been setting superpages, we may need to 2490 * recalculate 'pte' and switch back to smaller pages for the 2491 * end of the mapping, if the trailing size is not enough to 2492 * use another superpage (i.e. nr_pages < lvl_pages). 2493 */ 2494 pte++; 2495 if (!nr_pages || first_pte_in_page(pte) || 2496 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2497 domain_flush_cache(domain, first_pte, 2498 (void *)pte - (void *)first_pte); 2499 pte = NULL; 2500 } 2501 } 2502 2503 return 0; 2504 } 2505 2506 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2507 { 2508 struct intel_iommu *iommu = info->iommu; 2509 struct context_entry *context; 2510 unsigned long flags; 2511 u16 did_old; 2512 2513 if (!iommu) 2514 return; 2515 2516 spin_lock_irqsave(&iommu->lock, flags); 2517 context = iommu_context_addr(iommu, bus, devfn, 0); 2518 if (!context) { 2519 spin_unlock_irqrestore(&iommu->lock, flags); 2520 return; 2521 } 2522 2523 if (sm_supported(iommu)) { 2524 if (hw_pass_through && domain_type_is_si(info->domain)) 2525 did_old = FLPT_DEFAULT_DID; 2526 else 2527 did_old = info->domain->iommu_did[iommu->seq_id]; 2528 } else { 2529 did_old = context_domain_id(context); 2530 } 2531 2532 context_clear_entry(context); 2533 __iommu_flush_cache(iommu, context, sizeof(*context)); 2534 spin_unlock_irqrestore(&iommu->lock, flags); 2535 iommu->flush.flush_context(iommu, 2536 did_old, 2537 (((u16)bus) << 8) | devfn, 2538 DMA_CCMD_MASK_NOBIT, 2539 DMA_CCMD_DEVICE_INVL); 2540 2541 if (sm_supported(iommu)) 2542 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2543 2544 iommu->flush.flush_iotlb(iommu, 2545 did_old, 2546 0, 2547 0, 2548 DMA_TLB_DSI_FLUSH); 2549 2550 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2551 } 2552 2553 static inline void unlink_domain_info(struct device_domain_info *info) 2554 { 2555 assert_spin_locked(&device_domain_lock); 2556 list_del(&info->link); 2557 list_del(&info->global); 2558 if (info->dev) 2559 dev_iommu_priv_set(info->dev, NULL); 2560 } 2561 2562 static void domain_remove_dev_info(struct dmar_domain *domain) 2563 { 2564 struct device_domain_info *info, *tmp; 2565 unsigned long flags; 2566 2567 spin_lock_irqsave(&device_domain_lock, flags); 2568 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2569 __dmar_remove_one_dev_info(info); 2570 spin_unlock_irqrestore(&device_domain_lock, flags); 2571 } 2572 2573 struct dmar_domain *find_domain(struct device *dev) 2574 { 2575 struct device_domain_info *info; 2576 2577 if (unlikely(!dev || !dev->iommu)) 2578 return NULL; 2579 2580 if (unlikely(attach_deferred(dev))) 2581 return NULL; 2582 2583 /* No lock here, assumes no domain exit in normal case */ 2584 info = get_domain_info(dev); 2585 if (likely(info)) 2586 return info->domain; 2587 2588 return NULL; 2589 } 2590 2591 static inline struct device_domain_info * 2592 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2593 { 2594 struct device_domain_info *info; 2595 2596 list_for_each_entry(info, &device_domain_list, global) 2597 if (info->segment == segment && info->bus == bus && 2598 info->devfn == devfn) 2599 return info; 2600 2601 return NULL; 
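	/*
	 * Editor's note: this helper walks the global device_domain_list
	 * without taking any lock itself; the callers in this file (for
	 * example dmar_insert_one_dev_info()) invoke it with
	 * device_domain_lock held, which is what makes the walk safe.
	 */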
2602 } 2603 2604 static int domain_setup_first_level(struct intel_iommu *iommu, 2605 struct dmar_domain *domain, 2606 struct device *dev, 2607 u32 pasid) 2608 { 2609 struct dma_pte *pgd = domain->pgd; 2610 int agaw, level; 2611 int flags = 0; 2612 2613 /* 2614 * Skip top levels of page tables for iommu which has 2615 * less agaw than default. Unnecessary for PT mode. 2616 */ 2617 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2618 pgd = phys_to_virt(dma_pte_addr(pgd)); 2619 if (!dma_pte_present(pgd)) 2620 return -ENOMEM; 2621 } 2622 2623 level = agaw_to_level(agaw); 2624 if (level != 4 && level != 5) 2625 return -EINVAL; 2626 2627 if (pasid != PASID_RID2PASID) 2628 flags |= PASID_FLAG_SUPERVISOR_MODE; 2629 if (level == 5) 2630 flags |= PASID_FLAG_FL5LP; 2631 2632 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2633 flags |= PASID_FLAG_PAGE_SNOOP; 2634 2635 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2636 domain->iommu_did[iommu->seq_id], 2637 flags); 2638 } 2639 2640 static bool dev_is_real_dma_subdevice(struct device *dev) 2641 { 2642 return dev && dev_is_pci(dev) && 2643 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2644 } 2645 2646 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2647 int bus, int devfn, 2648 struct device *dev, 2649 struct dmar_domain *domain) 2650 { 2651 struct dmar_domain *found = NULL; 2652 struct device_domain_info *info; 2653 unsigned long flags; 2654 int ret; 2655 2656 info = alloc_devinfo_mem(); 2657 if (!info) 2658 return NULL; 2659 2660 if (!dev_is_real_dma_subdevice(dev)) { 2661 info->bus = bus; 2662 info->devfn = devfn; 2663 info->segment = iommu->segment; 2664 } else { 2665 struct pci_dev *pdev = to_pci_dev(dev); 2666 2667 info->bus = pdev->bus->number; 2668 info->devfn = pdev->devfn; 2669 info->segment = pci_domain_nr(pdev->bus); 2670 } 2671 2672 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2673 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2674 info->ats_qdep = 0; 2675 info->dev = dev; 2676 info->domain = domain; 2677 info->iommu = iommu; 2678 info->pasid_table = NULL; 2679 info->auxd_enabled = 0; 2680 INIT_LIST_HEAD(&info->subdevices); 2681 2682 if (dev && dev_is_pci(dev)) { 2683 struct pci_dev *pdev = to_pci_dev(info->dev); 2684 2685 if (ecap_dev_iotlb_support(iommu->ecap) && 2686 pci_ats_supported(pdev) && 2687 dmar_find_matched_atsr_unit(pdev)) 2688 info->ats_supported = 1; 2689 2690 if (sm_supported(iommu)) { 2691 if (pasid_supported(iommu)) { 2692 int features = pci_pasid_features(pdev); 2693 if (features >= 0) 2694 info->pasid_supported = features | 1; 2695 } 2696 2697 if (info->ats_supported && ecap_prs(iommu->ecap) && 2698 pci_pri_supported(pdev)) 2699 info->pri_supported = 1; 2700 } 2701 } 2702 2703 spin_lock_irqsave(&device_domain_lock, flags); 2704 if (dev) 2705 found = find_domain(dev); 2706 2707 if (!found) { 2708 struct device_domain_info *info2; 2709 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2710 info->devfn); 2711 if (info2) { 2712 found = info2->domain; 2713 info2->dev = dev; 2714 } 2715 } 2716 2717 if (found) { 2718 spin_unlock_irqrestore(&device_domain_lock, flags); 2719 free_devinfo_mem(info); 2720 /* Caller must free the original domain */ 2721 return found; 2722 } 2723 2724 spin_lock(&iommu->lock); 2725 ret = domain_attach_iommu(domain, iommu); 2726 spin_unlock(&iommu->lock); 2727 2728 if (ret) { 2729 spin_unlock_irqrestore(&device_domain_lock, flags); 2730 free_devinfo_mem(info); 2731 
return NULL; 2732 } 2733 2734 list_add(&info->link, &domain->devices); 2735 list_add(&info->global, &device_domain_list); 2736 if (dev) 2737 dev_iommu_priv_set(dev, info); 2738 spin_unlock_irqrestore(&device_domain_lock, flags); 2739 2740 /* PASID table is mandatory for a PCI device in scalable mode. */ 2741 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { 2742 ret = intel_pasid_alloc_table(dev); 2743 if (ret) { 2744 dev_err(dev, "PASID table allocation failed\n"); 2745 dmar_remove_one_dev_info(dev); 2746 return NULL; 2747 } 2748 2749 /* Setup the PASID entry for requests without PASID: */ 2750 spin_lock_irqsave(&iommu->lock, flags); 2751 if (hw_pass_through && domain_type_is_si(domain)) 2752 ret = intel_pasid_setup_pass_through(iommu, domain, 2753 dev, PASID_RID2PASID); 2754 else if (domain_use_first_level(domain)) 2755 ret = domain_setup_first_level(iommu, domain, dev, 2756 PASID_RID2PASID); 2757 else 2758 ret = intel_pasid_setup_second_level(iommu, domain, 2759 dev, PASID_RID2PASID); 2760 spin_unlock_irqrestore(&iommu->lock, flags); 2761 if (ret) { 2762 dev_err(dev, "Setup RID2PASID failed\n"); 2763 dmar_remove_one_dev_info(dev); 2764 return NULL; 2765 } 2766 } 2767 2768 if (dev && domain_context_mapping(domain, dev)) { 2769 dev_err(dev, "Domain context map failed\n"); 2770 dmar_remove_one_dev_info(dev); 2771 return NULL; 2772 } 2773 2774 return domain; 2775 } 2776 2777 static int iommu_domain_identity_map(struct dmar_domain *domain, 2778 unsigned long first_vpfn, 2779 unsigned long last_vpfn) 2780 { 2781 /* 2782 * RMRR range might have overlap with physical memory range, 2783 * clear it first 2784 */ 2785 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2786 2787 return __domain_mapping(domain, first_vpfn, 2788 first_vpfn, last_vpfn - first_vpfn + 1, 2789 DMA_PTE_READ|DMA_PTE_WRITE); 2790 } 2791 2792 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2793 2794 static int __init si_domain_init(int hw) 2795 { 2796 struct dmar_rmrr_unit *rmrr; 2797 struct device *dev; 2798 int i, nid, ret; 2799 2800 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2801 if (!si_domain) 2802 return -EFAULT; 2803 2804 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2805 domain_exit(si_domain); 2806 return -EFAULT; 2807 } 2808 2809 if (hw) 2810 return 0; 2811 2812 for_each_online_node(nid) { 2813 unsigned long start_pfn, end_pfn; 2814 int i; 2815 2816 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2817 ret = iommu_domain_identity_map(si_domain, 2818 mm_to_dma_pfn(start_pfn), 2819 mm_to_dma_pfn(end_pfn)); 2820 if (ret) 2821 return ret; 2822 } 2823 } 2824 2825 /* 2826 * Identity map the RMRRs so that devices with RMRRs could also use 2827 * the si_domain. 
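	 *
	 * Editor's illustration (hypothetical addresses, assuming 4 KiB
	 * PAGE_SIZE): an RMRR reported as [0xa8000000, 0xa80fffff] is mapped
	 * 1:1 here, i.e. IOVA 0xa8000000..0xa80fffff -> the identical
	 * physical range (256 VT-d pages), so DMA that firmware or the
	 * device set up before handover keeps hitting the same memory once
	 * translation is enabled.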
2828 */ 2829 for_each_rmrr_units(rmrr) { 2830 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2831 i, dev) { 2832 unsigned long long start = rmrr->base_address; 2833 unsigned long long end = rmrr->end_address; 2834 2835 if (WARN_ON(end < start || 2836 end >> agaw_to_width(si_domain->agaw))) 2837 continue; 2838 2839 ret = iommu_domain_identity_map(si_domain, 2840 mm_to_dma_pfn(start >> PAGE_SHIFT), 2841 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2842 if (ret) 2843 return ret; 2844 } 2845 } 2846 2847 return 0; 2848 } 2849 2850 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2851 { 2852 struct dmar_domain *ndomain; 2853 struct intel_iommu *iommu; 2854 u8 bus, devfn; 2855 2856 iommu = device_to_iommu(dev, &bus, &devfn); 2857 if (!iommu) 2858 return -ENODEV; 2859 2860 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2861 if (ndomain != domain) 2862 return -EBUSY; 2863 2864 return 0; 2865 } 2866 2867 static bool device_has_rmrr(struct device *dev) 2868 { 2869 struct dmar_rmrr_unit *rmrr; 2870 struct device *tmp; 2871 int i; 2872 2873 rcu_read_lock(); 2874 for_each_rmrr_units(rmrr) { 2875 /* 2876 * Return TRUE if this RMRR contains the device that 2877 * is passed in. 2878 */ 2879 for_each_active_dev_scope(rmrr->devices, 2880 rmrr->devices_cnt, i, tmp) 2881 if (tmp == dev || 2882 is_downstream_to_pci_bridge(dev, tmp)) { 2883 rcu_read_unlock(); 2884 return true; 2885 } 2886 } 2887 rcu_read_unlock(); 2888 return false; 2889 } 2890 2891 /** 2892 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2893 * is relaxable (ie. is allowed to be not enforced under some conditions) 2894 * @dev: device handle 2895 * 2896 * We assume that PCI USB devices with RMRRs have them largely 2897 * for historical reasons and that the RMRR space is not actively used post 2898 * boot. This exclusion may change if vendors begin to abuse it. 2899 * 2900 * The same exception is made for graphics devices, with the requirement that 2901 * any use of the RMRR regions will be torn down before assigning the device 2902 * to a guest. 2903 * 2904 * Return: true if the RMRR is relaxable, false otherwise 2905 */ 2906 static bool device_rmrr_is_relaxable(struct device *dev) 2907 { 2908 struct pci_dev *pdev; 2909 2910 if (!dev_is_pci(dev)) 2911 return false; 2912 2913 pdev = to_pci_dev(dev); 2914 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2915 return true; 2916 else 2917 return false; 2918 } 2919 2920 /* 2921 * There are a couple cases where we need to restrict the functionality of 2922 * devices associated with RMRRs. The first is when evaluating a device for 2923 * identity mapping because problems exist when devices are moved in and out 2924 * of domains and their respective RMRR information is lost. This means that 2925 * a device with associated RMRRs will never be in a "passthrough" domain. 2926 * The second is use of the device through the IOMMU API. This interface 2927 * expects to have full control of the IOVA space for the device. We cannot 2928 * satisfy both the requirement that RMRR access is maintained and have an 2929 * unencumbered IOVA space. We also have no ability to quiesce the device's 2930 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2931 * We therefore prevent devices associated with an RMRR from participating in 2932 * the IOMMU API, which eliminates them from device assignment. 2933 * 2934 * In both cases, devices which have relaxable RMRRs are not concerned by this 2935 * restriction. 
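 * (Editor's example, derived from device_rmrr_is_relaxable() below: a USB
 * controller or GPU that carries an RMRR may still be assigned to a guest,
 * whereas e.g. a NIC whose RMRR is not relaxable is excluded from
 * assignment.)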
 * See device_rmrr_is_relaxable comment.
 */
static bool device_is_rmrr_locked(struct device *dev)
{
	if (!device_has_rmrr(dev))
		return false;

	if (device_rmrr_is_relaxable(dev))
		return false;

	return true;
}

/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device being queried
 *
 * Returns:
 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 * - 0: both identity and dynamic domains work for this device
 */
static int device_def_domain_type(struct device *dev)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return IOMMU_DOMAIN_IDENTITY;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return IOMMU_DOMAIN_IDENTITY;
	}

	return 0;
}

static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from a sane IOMMU hardware state.
	 * If queued invalidation was already initialized by us
	 * (for example, while enabling interrupt remapping) then
	 * things are already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued invalidation not enabled, use register-based invalidation
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}

static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ?
devfn * 2 : devfn) % 256; 3026 3027 if (idx == 0) { 3028 /* First save what we may have and clean up */ 3029 if (new_ce) { 3030 tbl[tbl_idx] = new_ce; 3031 __iommu_flush_cache(iommu, new_ce, 3032 VTD_PAGE_SIZE); 3033 pos = 1; 3034 } 3035 3036 if (old_ce) 3037 memunmap(old_ce); 3038 3039 ret = 0; 3040 if (devfn < 0x80) 3041 old_ce_phys = root_entry_lctp(&re); 3042 else 3043 old_ce_phys = root_entry_uctp(&re); 3044 3045 if (!old_ce_phys) { 3046 if (ext && devfn == 0) { 3047 /* No LCTP, try UCTP */ 3048 devfn = 0x7f; 3049 continue; 3050 } else { 3051 goto out; 3052 } 3053 } 3054 3055 ret = -ENOMEM; 3056 old_ce = memremap(old_ce_phys, PAGE_SIZE, 3057 MEMREMAP_WB); 3058 if (!old_ce) 3059 goto out; 3060 3061 new_ce = alloc_pgtable_page(iommu->node); 3062 if (!new_ce) 3063 goto out_unmap; 3064 3065 ret = 0; 3066 } 3067 3068 /* Now copy the context entry */ 3069 memcpy(&ce, old_ce + idx, sizeof(ce)); 3070 3071 if (!__context_present(&ce)) 3072 continue; 3073 3074 did = context_domain_id(&ce); 3075 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3076 set_bit(did, iommu->domain_ids); 3077 3078 /* 3079 * We need a marker for copied context entries. This 3080 * marker needs to work for the old format as well as 3081 * for extended context entries. 3082 * 3083 * Bit 67 of the context entry is used. In the old 3084 * format this bit is available to software, in the 3085 * extended format it is the PGE bit, but PGE is ignored 3086 * by HW if PASIDs are disabled (and thus still 3087 * available). 3088 * 3089 * So disable PASIDs first and then mark the entry 3090 * copied. This means that we don't copy PASID 3091 * translations from the old kernel, but this is fine as 3092 * faults there are not fatal. 3093 */ 3094 context_clear_pasid_enable(&ce); 3095 context_set_copied(&ce); 3096 3097 new_ce[idx] = ce; 3098 } 3099 3100 tbl[tbl_idx + pos] = new_ce; 3101 3102 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3103 3104 out_unmap: 3105 memunmap(old_ce); 3106 3107 out: 3108 return ret; 3109 } 3110 3111 static int copy_translation_tables(struct intel_iommu *iommu) 3112 { 3113 struct context_entry **ctxt_tbls; 3114 struct root_entry *old_rt; 3115 phys_addr_t old_rt_phys; 3116 int ctxt_table_entries; 3117 unsigned long flags; 3118 u64 rtaddr_reg; 3119 int bus, ret; 3120 bool new_ext, ext; 3121 3122 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3123 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3124 new_ext = !!ecap_ecs(iommu->ecap); 3125 3126 /* 3127 * The RTT bit can only be changed when translation is disabled, 3128 * but disabling translation means to open a window for data 3129 * corruption. So bail out and don't copy anything if we would 3130 * have to change the bit. 3131 */ 3132 if (new_ext != ext) 3133 return -EINVAL; 3134 3135 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3136 if (!old_rt_phys) 3137 return -EINVAL; 3138 3139 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3140 if (!old_rt) 3141 return -ENOMEM; 3142 3143 /* This is too big for the stack - allocate it from slab */ 3144 ctxt_table_entries = ext ? 
512 : 256; 3145 ret = -ENOMEM; 3146 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3147 if (!ctxt_tbls) 3148 goto out_unmap; 3149 3150 for (bus = 0; bus < 256; bus++) { 3151 ret = copy_context_table(iommu, &old_rt[bus], 3152 ctxt_tbls, bus, ext); 3153 if (ret) { 3154 pr_err("%s: Failed to copy context table for bus %d\n", 3155 iommu->name, bus); 3156 continue; 3157 } 3158 } 3159 3160 spin_lock_irqsave(&iommu->lock, flags); 3161 3162 /* Context tables are copied, now write them to the root_entry table */ 3163 for (bus = 0; bus < 256; bus++) { 3164 int idx = ext ? bus * 2 : bus; 3165 u64 val; 3166 3167 if (ctxt_tbls[idx]) { 3168 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3169 iommu->root_entry[bus].lo = val; 3170 } 3171 3172 if (!ext || !ctxt_tbls[idx + 1]) 3173 continue; 3174 3175 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3176 iommu->root_entry[bus].hi = val; 3177 } 3178 3179 spin_unlock_irqrestore(&iommu->lock, flags); 3180 3181 kfree(ctxt_tbls); 3182 3183 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3184 3185 ret = 0; 3186 3187 out_unmap: 3188 memunmap(old_rt); 3189 3190 return ret; 3191 } 3192 3193 #ifdef CONFIG_INTEL_IOMMU_SVM 3194 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3195 { 3196 struct intel_iommu *iommu = data; 3197 ioasid_t ioasid; 3198 3199 if (!iommu) 3200 return INVALID_IOASID; 3201 /* 3202 * VT-d virtual command interface always uses the full 20 bit 3203 * PASID range. Host can partition guest PASID range based on 3204 * policies but it is out of guest's control. 3205 */ 3206 if (min < PASID_MIN || max > intel_pasid_max_id) 3207 return INVALID_IOASID; 3208 3209 if (vcmd_alloc_pasid(iommu, &ioasid)) 3210 return INVALID_IOASID; 3211 3212 return ioasid; 3213 } 3214 3215 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3216 { 3217 struct intel_iommu *iommu = data; 3218 3219 if (!iommu) 3220 return; 3221 /* 3222 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3223 * We can only free the PASID when all the devices are unbound. 3224 */ 3225 if (ioasid_find(NULL, ioasid, NULL)) { 3226 pr_alert("Cannot free active IOASID %d\n", ioasid); 3227 return; 3228 } 3229 vcmd_free_pasid(iommu, ioasid); 3230 } 3231 3232 static void register_pasid_allocator(struct intel_iommu *iommu) 3233 { 3234 /* 3235 * If we are running in the host, no need for custom allocator 3236 * in that PASIDs are allocated from the host system-wide. 3237 */ 3238 if (!cap_caching_mode(iommu->cap)) 3239 return; 3240 3241 if (!sm_supported(iommu)) { 3242 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3243 return; 3244 } 3245 3246 /* 3247 * Register a custom PASID allocator if we are running in a guest, 3248 * guest PASID must be obtained via virtual command interface. 3249 * There can be multiple vIOMMUs in each guest but only one allocator 3250 * is active. All vIOMMU allocators will eventually be calling the same 3251 * host allocator. 3252 */ 3253 if (!vccap_pasid(iommu->vccap)) 3254 return; 3255 3256 pr_info("Register custom PASID allocator\n"); 3257 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3258 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3259 iommu->pasid_allocator.pdata = (void *)iommu; 3260 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3261 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3262 /* 3263 * Disable scalable mode on this IOMMU if there 3264 * is no custom allocator. 
Mixing SM capable vIOMMU 3265 * and non-SM vIOMMU are not supported. 3266 */ 3267 intel_iommu_sm = 0; 3268 } 3269 } 3270 #endif 3271 3272 static int __init init_dmars(void) 3273 { 3274 struct dmar_drhd_unit *drhd; 3275 struct intel_iommu *iommu; 3276 int ret; 3277 3278 /* 3279 * for each drhd 3280 * allocate root 3281 * initialize and program root entry to not present 3282 * endfor 3283 */ 3284 for_each_drhd_unit(drhd) { 3285 /* 3286 * lock not needed as this is only incremented in the single 3287 * threaded kernel __init code path all other access are read 3288 * only 3289 */ 3290 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3291 g_num_of_iommus++; 3292 continue; 3293 } 3294 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3295 } 3296 3297 /* Preallocate enough resources for IOMMU hot-addition */ 3298 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3299 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3300 3301 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3302 GFP_KERNEL); 3303 if (!g_iommus) { 3304 ret = -ENOMEM; 3305 goto error; 3306 } 3307 3308 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3309 if (ret) 3310 goto free_iommu; 3311 3312 for_each_iommu(iommu, drhd) { 3313 if (drhd->ignored) { 3314 iommu_disable_translation(iommu); 3315 continue; 3316 } 3317 3318 /* 3319 * Find the max pasid size of all IOMMU's in the system. 3320 * We need to ensure the system pasid table is no bigger 3321 * than the smallest supported. 3322 */ 3323 if (pasid_supported(iommu)) { 3324 u32 temp = 2 << ecap_pss(iommu->ecap); 3325 3326 intel_pasid_max_id = min_t(u32, temp, 3327 intel_pasid_max_id); 3328 } 3329 3330 g_iommus[iommu->seq_id] = iommu; 3331 3332 intel_iommu_init_qi(iommu); 3333 3334 ret = iommu_init_domains(iommu); 3335 if (ret) 3336 goto free_iommu; 3337 3338 init_translation_status(iommu); 3339 3340 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3341 iommu_disable_translation(iommu); 3342 clear_translation_pre_enabled(iommu); 3343 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3344 iommu->name); 3345 } 3346 3347 /* 3348 * TBD: 3349 * we could share the same root & context tables 3350 * among all IOMMU's. Need to Split it later. 3351 */ 3352 ret = iommu_alloc_root_entry(iommu); 3353 if (ret) 3354 goto free_iommu; 3355 3356 if (translation_pre_enabled(iommu)) { 3357 pr_info("Translation already enabled - trying to copy translation structures\n"); 3358 3359 ret = copy_translation_tables(iommu); 3360 if (ret) { 3361 /* 3362 * We found the IOMMU with translation 3363 * enabled - but failed to copy over the 3364 * old root-entry table. Try to proceed 3365 * by disabling translation now and 3366 * allocating a clean root-entry table. 3367 * This might cause DMAR faults, but 3368 * probably the dump will still succeed. 3369 */ 3370 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3371 iommu->name); 3372 iommu_disable_translation(iommu); 3373 clear_translation_pre_enabled(iommu); 3374 } else { 3375 pr_info("Copied translation tables from previous kernel for %s\n", 3376 iommu->name); 3377 } 3378 } 3379 3380 if (!ecap_pass_through(iommu->ecap)) 3381 hw_pass_through = 0; 3382 intel_svm_check(iommu); 3383 } 3384 3385 /* 3386 * Now that qi is enabled on all iommus, set the root entry and flush 3387 * caches. This is required on some Intel X58 chipsets, otherwise the 3388 * flush_context function will loop forever and the boot hangs. 
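	 *
	 * Editor's aside on the PASID sizing in the loop above (illustrative
	 * numbers): the code computes 2 << ecap_pss(ecap), so an IOMMU whose
	 * PSS field reads 19 supports 2 << 19 = 2^20 PASIDs, and
	 * intel_pasid_max_id is clamped to the minimum across all IOMMUs so
	 * one system-wide PASID table size fits the least capable unit.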
3389 */ 3390 for_each_active_iommu(iommu, drhd) { 3391 iommu_flush_write_buffer(iommu); 3392 #ifdef CONFIG_INTEL_IOMMU_SVM 3393 register_pasid_allocator(iommu); 3394 #endif 3395 iommu_set_root_entry(iommu); 3396 } 3397 3398 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3399 dmar_map_gfx = 0; 3400 #endif 3401 3402 if (!dmar_map_gfx) 3403 iommu_identity_mapping |= IDENTMAP_GFX; 3404 3405 check_tylersburg_isoch(); 3406 3407 ret = si_domain_init(hw_pass_through); 3408 if (ret) 3409 goto free_iommu; 3410 3411 /* 3412 * for each drhd 3413 * enable fault log 3414 * global invalidate context cache 3415 * global invalidate iotlb 3416 * enable translation 3417 */ 3418 for_each_iommu(iommu, drhd) { 3419 if (drhd->ignored) { 3420 /* 3421 * we always have to disable PMRs or DMA may fail on 3422 * this device 3423 */ 3424 if (force_on) 3425 iommu_disable_protect_mem_regions(iommu); 3426 continue; 3427 } 3428 3429 iommu_flush_write_buffer(iommu); 3430 3431 #ifdef CONFIG_INTEL_IOMMU_SVM 3432 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3433 /* 3434 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3435 * could cause possible lock race condition. 3436 */ 3437 up_write(&dmar_global_lock); 3438 ret = intel_svm_enable_prq(iommu); 3439 down_write(&dmar_global_lock); 3440 if (ret) 3441 goto free_iommu; 3442 } 3443 #endif 3444 ret = dmar_set_interrupt(iommu); 3445 if (ret) 3446 goto free_iommu; 3447 } 3448 3449 return 0; 3450 3451 free_iommu: 3452 for_each_active_iommu(iommu, drhd) { 3453 disable_dmar_iommu(iommu); 3454 free_dmar_iommu(iommu); 3455 } 3456 3457 kfree(g_iommus); 3458 3459 error: 3460 return ret; 3461 } 3462 3463 static inline int iommu_domain_cache_init(void) 3464 { 3465 int ret = 0; 3466 3467 iommu_domain_cache = kmem_cache_create("iommu_domain", 3468 sizeof(struct dmar_domain), 3469 0, 3470 SLAB_HWCACHE_ALIGN, 3471 3472 NULL); 3473 if (!iommu_domain_cache) { 3474 pr_err("Couldn't create iommu_domain cache\n"); 3475 ret = -ENOMEM; 3476 } 3477 3478 return ret; 3479 } 3480 3481 static inline int iommu_devinfo_cache_init(void) 3482 { 3483 int ret = 0; 3484 3485 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3486 sizeof(struct device_domain_info), 3487 0, 3488 SLAB_HWCACHE_ALIGN, 3489 NULL); 3490 if (!iommu_devinfo_cache) { 3491 pr_err("Couldn't create devinfo cache\n"); 3492 ret = -ENOMEM; 3493 } 3494 3495 return ret; 3496 } 3497 3498 static int __init iommu_init_mempool(void) 3499 { 3500 int ret; 3501 ret = iova_cache_get(); 3502 if (ret) 3503 return ret; 3504 3505 ret = iommu_domain_cache_init(); 3506 if (ret) 3507 goto domain_error; 3508 3509 ret = iommu_devinfo_cache_init(); 3510 if (!ret) 3511 return ret; 3512 3513 kmem_cache_destroy(iommu_domain_cache); 3514 domain_error: 3515 iova_cache_put(); 3516 3517 return -ENOMEM; 3518 } 3519 3520 static void __init iommu_exit_mempool(void) 3521 { 3522 kmem_cache_destroy(iommu_devinfo_cache); 3523 kmem_cache_destroy(iommu_domain_cache); 3524 iova_cache_put(); 3525 } 3526 3527 static void __init init_no_remapping_devices(void) 3528 { 3529 struct dmar_drhd_unit *drhd; 3530 struct device *dev; 3531 int i; 3532 3533 for_each_drhd_unit(drhd) { 3534 if (!drhd->include_all) { 3535 for_each_active_dev_scope(drhd->devices, 3536 drhd->devices_cnt, i, dev) 3537 break; 3538 /* ignore DMAR unit if no devices exist */ 3539 if (i == drhd->devices_cnt) 3540 drhd->ignored = 1; 3541 } 3542 } 3543 3544 for_each_active_drhd_unit(drhd) { 3545 if (drhd->include_all) 3546 continue; 3547 3548 for_each_active_dev_scope(drhd->devices, 3549 drhd->devices_cnt, i, 
dev) 3550 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3551 break; 3552 if (i < drhd->devices_cnt) 3553 continue; 3554 3555 /* This IOMMU has *only* gfx devices. Either bypass it or 3556 set the gfx_mapped flag, as appropriate */ 3557 drhd->gfx_dedicated = 1; 3558 if (!dmar_map_gfx) 3559 drhd->ignored = 1; 3560 } 3561 } 3562 3563 #ifdef CONFIG_SUSPEND 3564 static int init_iommu_hw(void) 3565 { 3566 struct dmar_drhd_unit *drhd; 3567 struct intel_iommu *iommu = NULL; 3568 3569 for_each_active_iommu(iommu, drhd) 3570 if (iommu->qi) 3571 dmar_reenable_qi(iommu); 3572 3573 for_each_iommu(iommu, drhd) { 3574 if (drhd->ignored) { 3575 /* 3576 * we always have to disable PMRs or DMA may fail on 3577 * this device 3578 */ 3579 if (force_on) 3580 iommu_disable_protect_mem_regions(iommu); 3581 continue; 3582 } 3583 3584 iommu_flush_write_buffer(iommu); 3585 iommu_set_root_entry(iommu); 3586 iommu_enable_translation(iommu); 3587 iommu_disable_protect_mem_regions(iommu); 3588 } 3589 3590 return 0; 3591 } 3592 3593 static void iommu_flush_all(void) 3594 { 3595 struct dmar_drhd_unit *drhd; 3596 struct intel_iommu *iommu; 3597 3598 for_each_active_iommu(iommu, drhd) { 3599 iommu->flush.flush_context(iommu, 0, 0, 0, 3600 DMA_CCMD_GLOBAL_INVL); 3601 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3602 DMA_TLB_GLOBAL_FLUSH); 3603 } 3604 } 3605 3606 static int iommu_suspend(void) 3607 { 3608 struct dmar_drhd_unit *drhd; 3609 struct intel_iommu *iommu = NULL; 3610 unsigned long flag; 3611 3612 for_each_active_iommu(iommu, drhd) { 3613 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3614 GFP_KERNEL); 3615 if (!iommu->iommu_state) 3616 goto nomem; 3617 } 3618 3619 iommu_flush_all(); 3620 3621 for_each_active_iommu(iommu, drhd) { 3622 iommu_disable_translation(iommu); 3623 3624 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3625 3626 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3627 readl(iommu->reg + DMAR_FECTL_REG); 3628 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3629 readl(iommu->reg + DMAR_FEDATA_REG); 3630 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3631 readl(iommu->reg + DMAR_FEADDR_REG); 3632 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3633 readl(iommu->reg + DMAR_FEUADDR_REG); 3634 3635 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3636 } 3637 return 0; 3638 3639 nomem: 3640 for_each_active_iommu(iommu, drhd) 3641 kfree(iommu->iommu_state); 3642 3643 return -ENOMEM; 3644 } 3645 3646 static void iommu_resume(void) 3647 { 3648 struct dmar_drhd_unit *drhd; 3649 struct intel_iommu *iommu = NULL; 3650 unsigned long flag; 3651 3652 if (init_iommu_hw()) { 3653 if (force_on) 3654 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3655 else 3656 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3657 return; 3658 } 3659 3660 for_each_active_iommu(iommu, drhd) { 3661 3662 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3663 3664 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3665 iommu->reg + DMAR_FECTL_REG); 3666 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3667 iommu->reg + DMAR_FEDATA_REG); 3668 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3669 iommu->reg + DMAR_FEADDR_REG); 3670 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3671 iommu->reg + DMAR_FEUADDR_REG); 3672 3673 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3674 } 3675 3676 for_each_active_iommu(iommu, drhd) 3677 kfree(iommu->iommu_state); 3678 } 3679 3680 static struct syscore_ops iommu_syscore_ops = { 3681 .resume = iommu_resume, 3682 .suspend = iommu_suspend, 3683 }; 3684 3685 static void __init 
init_iommu_pm_ops(void) 3686 { 3687 register_syscore_ops(&iommu_syscore_ops); 3688 } 3689 3690 #else 3691 static inline void init_iommu_pm_ops(void) {} 3692 #endif /* CONFIG_PM */ 3693 3694 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3695 { 3696 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3697 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3698 rmrr->end_address <= rmrr->base_address || 3699 arch_rmrr_sanity_check(rmrr)) 3700 return -EINVAL; 3701 3702 return 0; 3703 } 3704 3705 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3706 { 3707 struct acpi_dmar_reserved_memory *rmrr; 3708 struct dmar_rmrr_unit *rmrru; 3709 3710 rmrr = (struct acpi_dmar_reserved_memory *)header; 3711 if (rmrr_sanity_check(rmrr)) { 3712 pr_warn(FW_BUG 3713 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3714 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3715 rmrr->base_address, rmrr->end_address, 3716 dmi_get_system_info(DMI_BIOS_VENDOR), 3717 dmi_get_system_info(DMI_BIOS_VERSION), 3718 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3719 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3720 } 3721 3722 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3723 if (!rmrru) 3724 goto out; 3725 3726 rmrru->hdr = header; 3727 3728 rmrru->base_address = rmrr->base_address; 3729 rmrru->end_address = rmrr->end_address; 3730 3731 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3732 ((void *)rmrr) + rmrr->header.length, 3733 &rmrru->devices_cnt); 3734 if (rmrru->devices_cnt && rmrru->devices == NULL) 3735 goto free_rmrru; 3736 3737 list_add(&rmrru->list, &dmar_rmrr_units); 3738 3739 return 0; 3740 free_rmrru: 3741 kfree(rmrru); 3742 out: 3743 return -ENOMEM; 3744 } 3745 3746 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3747 { 3748 struct dmar_atsr_unit *atsru; 3749 struct acpi_dmar_atsr *tmp; 3750 3751 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3752 dmar_rcu_check()) { 3753 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3754 if (atsr->segment != tmp->segment) 3755 continue; 3756 if (atsr->header.length != tmp->header.length) 3757 continue; 3758 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3759 return atsru; 3760 } 3761 3762 return NULL; 3763 } 3764 3765 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3766 { 3767 struct acpi_dmar_atsr *atsr; 3768 struct dmar_atsr_unit *atsru; 3769 3770 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3771 return 0; 3772 3773 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3774 atsru = dmar_find_atsr(atsr); 3775 if (atsru) 3776 return 0; 3777 3778 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3779 if (!atsru) 3780 return -ENOMEM; 3781 3782 /* 3783 * If memory is allocated from slab by ACPI _DSM method, we need to 3784 * copy the memory content because the memory buffer will be freed 3785 * on return. 
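	 *
	 * Editor's note on the layout used below: kzalloc(sizeof(*atsru) +
	 * hdr->length) reserves the unit and its private copy of the ACPI
	 * table in a single allocation; atsru->hdr = (void *)(atsru + 1)
	 * points just past the struct, and the memcpy() fills that trailing
	 * copy so the data outlives the caller-owned buffer.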
3786 */ 3787 atsru->hdr = (void *)(atsru + 1); 3788 memcpy(atsru->hdr, hdr, hdr->length); 3789 atsru->include_all = atsr->flags & 0x1; 3790 if (!atsru->include_all) { 3791 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3792 (void *)atsr + atsr->header.length, 3793 &atsru->devices_cnt); 3794 if (atsru->devices_cnt && atsru->devices == NULL) { 3795 kfree(atsru); 3796 return -ENOMEM; 3797 } 3798 } 3799 3800 list_add_rcu(&atsru->list, &dmar_atsr_units); 3801 3802 return 0; 3803 } 3804 3805 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3806 { 3807 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3808 kfree(atsru); 3809 } 3810 3811 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3812 { 3813 struct acpi_dmar_atsr *atsr; 3814 struct dmar_atsr_unit *atsru; 3815 3816 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3817 atsru = dmar_find_atsr(atsr); 3818 if (atsru) { 3819 list_del_rcu(&atsru->list); 3820 synchronize_rcu(); 3821 intel_iommu_free_atsr(atsru); 3822 } 3823 3824 return 0; 3825 } 3826 3827 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3828 { 3829 int i; 3830 struct device *dev; 3831 struct acpi_dmar_atsr *atsr; 3832 struct dmar_atsr_unit *atsru; 3833 3834 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3835 atsru = dmar_find_atsr(atsr); 3836 if (!atsru) 3837 return 0; 3838 3839 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3840 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3841 i, dev) 3842 return -EBUSY; 3843 } 3844 3845 return 0; 3846 } 3847 3848 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3849 { 3850 struct dmar_satc_unit *satcu; 3851 struct acpi_dmar_satc *tmp; 3852 3853 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3854 dmar_rcu_check()) { 3855 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3856 if (satc->segment != tmp->segment) 3857 continue; 3858 if (satc->header.length != tmp->header.length) 3859 continue; 3860 if (memcmp(satc, tmp, satc->header.length) == 0) 3861 return satcu; 3862 } 3863 3864 return NULL; 3865 } 3866 3867 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3868 { 3869 struct acpi_dmar_satc *satc; 3870 struct dmar_satc_unit *satcu; 3871 3872 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3873 return 0; 3874 3875 satc = container_of(hdr, struct acpi_dmar_satc, header); 3876 satcu = dmar_find_satc(satc); 3877 if (satcu) 3878 return 0; 3879 3880 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3881 if (!satcu) 3882 return -ENOMEM; 3883 3884 satcu->hdr = (void *)(satcu + 1); 3885 memcpy(satcu->hdr, hdr, hdr->length); 3886 satcu->atc_required = satc->flags & 0x1; 3887 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3888 (void *)satc + satc->header.length, 3889 &satcu->devices_cnt); 3890 if (satcu->devices_cnt && !satcu->devices) { 3891 kfree(satcu); 3892 return -ENOMEM; 3893 } 3894 list_add_rcu(&satcu->list, &dmar_satc_units); 3895 3896 return 0; 3897 } 3898 3899 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3900 { 3901 int sp, ret; 3902 struct intel_iommu *iommu = dmaru->iommu; 3903 3904 if (g_iommus[iommu->seq_id]) 3905 return 0; 3906 3907 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3908 if (ret) 3909 goto out; 3910 3911 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3912 pr_warn("%s: Doesn't support hardware pass through.\n", 3913 iommu->name); 3914 return -ENXIO; 3915 } 3916 if (!ecap_sc_support(iommu->ecap) && 3917 
domain_update_iommu_snooping(iommu)) { 3918 pr_warn("%s: Doesn't support snooping.\n", 3919 iommu->name); 3920 return -ENXIO; 3921 } 3922 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3923 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3924 pr_warn("%s: Doesn't support large page.\n", 3925 iommu->name); 3926 return -ENXIO; 3927 } 3928 3929 /* 3930 * Disable translation if already enabled prior to OS handover. 3931 */ 3932 if (iommu->gcmd & DMA_GCMD_TE) 3933 iommu_disable_translation(iommu); 3934 3935 g_iommus[iommu->seq_id] = iommu; 3936 ret = iommu_init_domains(iommu); 3937 if (ret == 0) 3938 ret = iommu_alloc_root_entry(iommu); 3939 if (ret) 3940 goto out; 3941 3942 intel_svm_check(iommu); 3943 3944 if (dmaru->ignored) { 3945 /* 3946 * we always have to disable PMRs or DMA may fail on this device 3947 */ 3948 if (force_on) 3949 iommu_disable_protect_mem_regions(iommu); 3950 return 0; 3951 } 3952 3953 intel_iommu_init_qi(iommu); 3954 iommu_flush_write_buffer(iommu); 3955 3956 #ifdef CONFIG_INTEL_IOMMU_SVM 3957 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3958 ret = intel_svm_enable_prq(iommu); 3959 if (ret) 3960 goto disable_iommu; 3961 } 3962 #endif 3963 ret = dmar_set_interrupt(iommu); 3964 if (ret) 3965 goto disable_iommu; 3966 3967 iommu_set_root_entry(iommu); 3968 iommu_enable_translation(iommu); 3969 3970 iommu_disable_protect_mem_regions(iommu); 3971 return 0; 3972 3973 disable_iommu: 3974 disable_dmar_iommu(iommu); 3975 out: 3976 free_dmar_iommu(iommu); 3977 return ret; 3978 } 3979 3980 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3981 { 3982 int ret = 0; 3983 struct intel_iommu *iommu = dmaru->iommu; 3984 3985 if (!intel_iommu_enabled) 3986 return 0; 3987 if (iommu == NULL) 3988 return -EINVAL; 3989 3990 if (insert) { 3991 ret = intel_iommu_add(dmaru); 3992 } else { 3993 disable_dmar_iommu(iommu); 3994 free_dmar_iommu(iommu); 3995 } 3996 3997 return ret; 3998 } 3999 4000 static void intel_iommu_free_dmars(void) 4001 { 4002 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4003 struct dmar_atsr_unit *atsru, *atsr_n; 4004 struct dmar_satc_unit *satcu, *satc_n; 4005 4006 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4007 list_del(&rmrru->list); 4008 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4009 kfree(rmrru); 4010 } 4011 4012 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4013 list_del(&atsru->list); 4014 intel_iommu_free_atsr(atsru); 4015 } 4016 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 4017 list_del(&satcu->list); 4018 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 4019 kfree(satcu); 4020 } 4021 } 4022 4023 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4024 { 4025 int i, ret = 1; 4026 struct pci_bus *bus; 4027 struct pci_dev *bridge = NULL; 4028 struct device *tmp; 4029 struct acpi_dmar_atsr *atsr; 4030 struct dmar_atsr_unit *atsru; 4031 4032 dev = pci_physfn(dev); 4033 for (bus = dev->bus; bus; bus = bus->parent) { 4034 bridge = bus->self; 4035 /* If it's an integrated device, allow ATS */ 4036 if (!bridge) 4037 return 1; 4038 /* Connected via non-PCIe: no ATS */ 4039 if (!pci_is_pcie(bridge) || 4040 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4041 return 0; 4042 /* If we found the root port, look it up in the ATSR */ 4043 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4044 break; 4045 } 4046 4047 rcu_read_lock(); 4048 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4049 atsr = container_of(atsru->hdr, struct 
acpi_dmar_atsr, header); 4050 if (atsr->segment != pci_domain_nr(dev->bus)) 4051 continue; 4052 4053 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4054 if (tmp == &bridge->dev) 4055 goto out; 4056 4057 if (atsru->include_all) 4058 goto out; 4059 } 4060 ret = 0; 4061 out: 4062 rcu_read_unlock(); 4063 4064 return ret; 4065 } 4066 4067 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4068 { 4069 int ret; 4070 struct dmar_rmrr_unit *rmrru; 4071 struct dmar_atsr_unit *atsru; 4072 struct dmar_satc_unit *satcu; 4073 struct acpi_dmar_atsr *atsr; 4074 struct acpi_dmar_reserved_memory *rmrr; 4075 struct acpi_dmar_satc *satc; 4076 4077 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4078 return 0; 4079 4080 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4081 rmrr = container_of(rmrru->hdr, 4082 struct acpi_dmar_reserved_memory, header); 4083 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4084 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4085 ((void *)rmrr) + rmrr->header.length, 4086 rmrr->segment, rmrru->devices, 4087 rmrru->devices_cnt); 4088 if (ret < 0) 4089 return ret; 4090 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4091 dmar_remove_dev_scope(info, rmrr->segment, 4092 rmrru->devices, rmrru->devices_cnt); 4093 } 4094 } 4095 4096 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4097 if (atsru->include_all) 4098 continue; 4099 4100 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4101 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4102 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4103 (void *)atsr + atsr->header.length, 4104 atsr->segment, atsru->devices, 4105 atsru->devices_cnt); 4106 if (ret > 0) 4107 break; 4108 else if (ret < 0) 4109 return ret; 4110 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4111 if (dmar_remove_dev_scope(info, atsr->segment, 4112 atsru->devices, atsru->devices_cnt)) 4113 break; 4114 } 4115 } 4116 list_for_each_entry(satcu, &dmar_satc_units, list) { 4117 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 4118 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4119 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 4120 (void *)satc + satc->header.length, 4121 satc->segment, satcu->devices, 4122 satcu->devices_cnt); 4123 if (ret > 0) 4124 break; 4125 else if (ret < 0) 4126 return ret; 4127 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4128 if (dmar_remove_dev_scope(info, satc->segment, 4129 satcu->devices, satcu->devices_cnt)) 4130 break; 4131 } 4132 } 4133 4134 return 0; 4135 } 4136 4137 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4138 unsigned long val, void *v) 4139 { 4140 struct memory_notify *mhp = v; 4141 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4142 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4143 mhp->nr_pages - 1); 4144 4145 switch (val) { 4146 case MEM_GOING_ONLINE: 4147 if (iommu_domain_identity_map(si_domain, 4148 start_vpfn, last_vpfn)) { 4149 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4150 start_vpfn, last_vpfn); 4151 return NOTIFY_BAD; 4152 } 4153 break; 4154 4155 case MEM_OFFLINE: 4156 case MEM_CANCEL_ONLINE: 4157 { 4158 struct dmar_drhd_unit *drhd; 4159 struct intel_iommu *iommu; 4160 LIST_HEAD(freelist); 4161 4162 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); 4163 4164 rcu_read_lock(); 4165 for_each_active_iommu(iommu, drhd) 4166 iommu_flush_iotlb_psi(iommu, si_domain, 4167 start_vpfn, mhp->nr_pages, 4168 list_empty(&freelist), 0); 4169 rcu_read_unlock(); 4170 
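		/*
		 * Editor's note: the page-table pages gathered in freelist
		 * are handed back to the allocator only after the IOTLB of
		 * every active IOMMU has been flushed above, so hardware can
		 * never walk a page-table page that has already been freed.
		 */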
put_pages_list(&freelist); 4171 } 4172 break; 4173 } 4174 4175 return NOTIFY_OK; 4176 } 4177 4178 static struct notifier_block intel_iommu_memory_nb = { 4179 .notifier_call = intel_iommu_memory_notifier, 4180 .priority = 0 4181 }; 4182 4183 static void intel_disable_iommus(void) 4184 { 4185 struct intel_iommu *iommu = NULL; 4186 struct dmar_drhd_unit *drhd; 4187 4188 for_each_iommu(iommu, drhd) 4189 iommu_disable_translation(iommu); 4190 } 4191 4192 void intel_iommu_shutdown(void) 4193 { 4194 struct dmar_drhd_unit *drhd; 4195 struct intel_iommu *iommu = NULL; 4196 4197 if (no_iommu || dmar_disabled) 4198 return; 4199 4200 down_write(&dmar_global_lock); 4201 4202 /* Disable PMRs explicitly here. */ 4203 for_each_iommu(iommu, drhd) 4204 iommu_disable_protect_mem_regions(iommu); 4205 4206 /* Make sure the IOMMUs are switched off */ 4207 intel_disable_iommus(); 4208 4209 up_write(&dmar_global_lock); 4210 } 4211 4212 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4213 { 4214 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4215 4216 return container_of(iommu_dev, struct intel_iommu, iommu); 4217 } 4218 4219 static ssize_t version_show(struct device *dev, 4220 struct device_attribute *attr, char *buf) 4221 { 4222 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4223 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4224 return sprintf(buf, "%d:%d\n", 4225 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4226 } 4227 static DEVICE_ATTR_RO(version); 4228 4229 static ssize_t address_show(struct device *dev, 4230 struct device_attribute *attr, char *buf) 4231 { 4232 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4233 return sprintf(buf, "%llx\n", iommu->reg_phys); 4234 } 4235 static DEVICE_ATTR_RO(address); 4236 4237 static ssize_t cap_show(struct device *dev, 4238 struct device_attribute *attr, char *buf) 4239 { 4240 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4241 return sprintf(buf, "%llx\n", iommu->cap); 4242 } 4243 static DEVICE_ATTR_RO(cap); 4244 4245 static ssize_t ecap_show(struct device *dev, 4246 struct device_attribute *attr, char *buf) 4247 { 4248 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4249 return sprintf(buf, "%llx\n", iommu->ecap); 4250 } 4251 static DEVICE_ATTR_RO(ecap); 4252 4253 static ssize_t domains_supported_show(struct device *dev, 4254 struct device_attribute *attr, char *buf) 4255 { 4256 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4257 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4258 } 4259 static DEVICE_ATTR_RO(domains_supported); 4260 4261 static ssize_t domains_used_show(struct device *dev, 4262 struct device_attribute *attr, char *buf) 4263 { 4264 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4265 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4266 cap_ndoms(iommu->cap))); 4267 } 4268 static DEVICE_ATTR_RO(domains_used); 4269 4270 static struct attribute *intel_iommu_attrs[] = { 4271 &dev_attr_version.attr, 4272 &dev_attr_address.attr, 4273 &dev_attr_cap.attr, 4274 &dev_attr_ecap.attr, 4275 &dev_attr_domains_supported.attr, 4276 &dev_attr_domains_used.attr, 4277 NULL, 4278 }; 4279 4280 static struct attribute_group intel_iommu_group = { 4281 .name = "intel-iommu", 4282 .attrs = intel_iommu_attrs, 4283 }; 4284 4285 const struct attribute_group *intel_iommu_groups[] = { 4286 &intel_iommu_group, 4287 NULL, 4288 }; 4289 4290 static inline bool has_external_pci(void) 4291 { 4292 struct pci_dev *pdev = NULL; 4293 4294 for_each_pci_dev(pdev) 4295 if (pdev->external_facing) 4296 return 
true; 4297 4298 return false; 4299 } 4300 4301 static int __init platform_optin_force_iommu(void) 4302 { 4303 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4304 return 0; 4305 4306 if (no_iommu || dmar_disabled) 4307 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4308 4309 /* 4310 * If Intel-IOMMU is disabled by default, we will apply identity 4311 * map for all devices except those marked as being untrusted. 4312 */ 4313 if (dmar_disabled) 4314 iommu_set_default_passthrough(false); 4315 4316 dmar_disabled = 0; 4317 no_iommu = 0; 4318 4319 return 1; 4320 } 4321 4322 static int __init probe_acpi_namespace_devices(void) 4323 { 4324 struct dmar_drhd_unit *drhd; 4325 /* To avoid a -Wunused-but-set-variable warning. */ 4326 struct intel_iommu *iommu __maybe_unused; 4327 struct device *dev; 4328 int i, ret = 0; 4329 4330 for_each_active_iommu(iommu, drhd) { 4331 for_each_active_dev_scope(drhd->devices, 4332 drhd->devices_cnt, i, dev) { 4333 struct acpi_device_physical_node *pn; 4334 struct iommu_group *group; 4335 struct acpi_device *adev; 4336 4337 if (dev->bus != &acpi_bus_type) 4338 continue; 4339 4340 adev = to_acpi_device(dev); 4341 mutex_lock(&adev->physical_node_lock); 4342 list_for_each_entry(pn, 4343 &adev->physical_node_list, node) { 4344 group = iommu_group_get(pn->dev); 4345 if (group) { 4346 iommu_group_put(group); 4347 continue; 4348 } 4349 4350 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4351 ret = iommu_probe_device(pn->dev); 4352 if (ret) 4353 break; 4354 } 4355 mutex_unlock(&adev->physical_node_lock); 4356 4357 if (ret) 4358 return ret; 4359 } 4360 } 4361 4362 return 0; 4363 } 4364 4365 int __init intel_iommu_init(void) 4366 { 4367 int ret = -ENODEV; 4368 struct dmar_drhd_unit *drhd; 4369 struct intel_iommu *iommu; 4370 4371 /* 4372 * Intel IOMMU is required for a TXT/tboot launch or platform 4373 * opt in, so enforce that. 4374 */ 4375 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4376 platform_optin_force_iommu(); 4377 4378 if (iommu_init_mempool()) { 4379 if (force_on) 4380 panic("tboot: Failed to initialize iommu memory\n"); 4381 return -ENOMEM; 4382 } 4383 4384 down_write(&dmar_global_lock); 4385 if (dmar_table_init()) { 4386 if (force_on) 4387 panic("tboot: Failed to initialize DMAR table\n"); 4388 goto out_free_dmar; 4389 } 4390 4391 if (dmar_dev_scope_init() < 0) { 4392 if (force_on) 4393 panic("tboot: Failed to initialize DMAR device scope\n"); 4394 goto out_free_dmar; 4395 } 4396 4397 up_write(&dmar_global_lock); 4398 4399 /* 4400 * The bus notifier takes the dmar_global_lock, so lockdep will 4401 * complain later when we register it under the lock. 4402 */ 4403 dmar_register_bus_notifier(); 4404 4405 down_write(&dmar_global_lock); 4406 4407 if (!no_iommu) 4408 intel_iommu_debugfs_init(); 4409 4410 if (no_iommu || dmar_disabled) { 4411 /* 4412 * We exit the function here to ensure IOMMU's remapping and 4413 * mempool aren't setup, which means that the IOMMU's PMRs 4414 * won't be disabled via the call to init_dmars(). So disable 4415 * it explicitly here. The PMRs were setup by tboot prior to 4416 * calling SENTER, but the kernel is expected to reset/tear 4417 * down the PMRs. 
4418 */ 4419 if (intel_iommu_tboot_noforce) { 4420 for_each_iommu(iommu, drhd) 4421 iommu_disable_protect_mem_regions(iommu); 4422 } 4423 4424 /* 4425 * Make sure the IOMMUs are switched off, even when we 4426 * boot into a kexec kernel and the previous kernel left 4427 * them enabled 4428 */ 4429 intel_disable_iommus(); 4430 goto out_free_dmar; 4431 } 4432 4433 if (list_empty(&dmar_rmrr_units)) 4434 pr_info("No RMRR found\n"); 4435 4436 if (list_empty(&dmar_atsr_units)) 4437 pr_info("No ATSR found\n"); 4438 4439 if (list_empty(&dmar_satc_units)) 4440 pr_info("No SATC found\n"); 4441 4442 if (dmar_map_gfx) 4443 intel_iommu_gfx_mapped = 1; 4444 4445 init_no_remapping_devices(); 4446 4447 ret = init_dmars(); 4448 if (ret) { 4449 if (force_on) 4450 panic("tboot: Failed to initialize DMARs\n"); 4451 pr_err("Initialization failed\n"); 4452 goto out_free_dmar; 4453 } 4454 up_write(&dmar_global_lock); 4455 4456 init_iommu_pm_ops(); 4457 4458 down_read(&dmar_global_lock); 4459 for_each_active_iommu(iommu, drhd) { 4460 /* 4461 * The flush queue implementation does not perform 4462 * page-selective invalidations that are required for efficient 4463 * TLB flushes in virtual environments. The benefit of batching 4464 * is likely to be much lower than the overhead of synchronizing 4465 * the virtual and physical IOMMU page-tables. 4466 */ 4467 if (cap_caching_mode(iommu->cap)) { 4468 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4469 iommu_set_dma_strict(); 4470 } 4471 iommu_device_sysfs_add(&iommu->iommu, NULL, 4472 intel_iommu_groups, 4473 "%s", iommu->name); 4474 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4475 } 4476 up_read(&dmar_global_lock); 4477 4478 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4479 if (si_domain && !hw_pass_through) 4480 register_memory_notifier(&intel_iommu_memory_nb); 4481 4482 down_read(&dmar_global_lock); 4483 if (probe_acpi_namespace_devices()) 4484 pr_warn("ACPI name space devices didn't probe correctly\n"); 4485 4486 /* Finally, we enable the DMA remapping hardware. */ 4487 for_each_iommu(iommu, drhd) { 4488 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4489 iommu_enable_translation(iommu); 4490 4491 iommu_disable_protect_mem_regions(iommu); 4492 } 4493 up_read(&dmar_global_lock); 4494 4495 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4496 4497 intel_iommu_enabled = 1; 4498 4499 return 0; 4500 4501 out_free_dmar: 4502 intel_iommu_free_dmars(); 4503 up_write(&dmar_global_lock); 4504 iommu_exit_mempool(); 4505 return ret; 4506 } 4507 4508 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4509 { 4510 struct device_domain_info *info = opaque; 4511 4512 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4513 return 0; 4514 } 4515 4516 /* 4517 * NB - intel-iommu lacks any sort of reference counting for the users of 4518 * dependent devices. If multiple endpoints have intersecting dependent 4519 * devices, unbinding the driver from any one of them will possibly leave 4520 * the others unable to operate. 
4521 */ 4522 static void domain_context_clear(struct device_domain_info *info) 4523 { 4524 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4525 return; 4526 4527 pci_for_each_dma_alias(to_pci_dev(info->dev), 4528 &domain_context_clear_one_cb, info); 4529 } 4530 4531 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4532 { 4533 struct dmar_domain *domain; 4534 struct intel_iommu *iommu; 4535 unsigned long flags; 4536 4537 assert_spin_locked(&device_domain_lock); 4538 4539 if (WARN_ON(!info)) 4540 return; 4541 4542 iommu = info->iommu; 4543 domain = info->domain; 4544 4545 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4546 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4547 intel_pasid_tear_down_entry(iommu, info->dev, 4548 PASID_RID2PASID, false); 4549 4550 iommu_disable_dev_iotlb(info); 4551 domain_context_clear(info); 4552 intel_pasid_free_table(info->dev); 4553 } 4554 4555 unlink_domain_info(info); 4556 4557 spin_lock_irqsave(&iommu->lock, flags); 4558 domain_detach_iommu(domain, iommu); 4559 spin_unlock_irqrestore(&iommu->lock, flags); 4560 4561 free_devinfo_mem(info); 4562 } 4563 4564 static void dmar_remove_one_dev_info(struct device *dev) 4565 { 4566 struct device_domain_info *info; 4567 unsigned long flags; 4568 4569 spin_lock_irqsave(&device_domain_lock, flags); 4570 info = get_domain_info(dev); 4571 if (info) 4572 __dmar_remove_one_dev_info(info); 4573 spin_unlock_irqrestore(&device_domain_lock, flags); 4574 } 4575 4576 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4577 { 4578 int adjust_width; 4579 4580 /* calculate AGAW */ 4581 domain->gaw = guest_width; 4582 adjust_width = guestwidth_to_adjustwidth(guest_width); 4583 domain->agaw = width_to_agaw(adjust_width); 4584 4585 domain->iommu_coherency = false; 4586 domain->iommu_snooping = false; 4587 domain->iommu_superpage = 0; 4588 domain->max_addr = 0; 4589 4590 /* always allocate the top pgd */ 4591 domain->pgd = alloc_pgtable_page(domain->nid); 4592 if (!domain->pgd) 4593 return -ENOMEM; 4594 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4595 return 0; 4596 } 4597 4598 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4599 { 4600 struct dmar_domain *dmar_domain; 4601 struct iommu_domain *domain; 4602 4603 switch (type) { 4604 case IOMMU_DOMAIN_DMA: 4605 case IOMMU_DOMAIN_DMA_FQ: 4606 case IOMMU_DOMAIN_UNMANAGED: 4607 dmar_domain = alloc_domain(type); 4608 if (!dmar_domain) { 4609 pr_err("Can't allocate dmar_domain\n"); 4610 return NULL; 4611 } 4612 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4613 pr_err("Domain initialization failed\n"); 4614 domain_exit(dmar_domain); 4615 return NULL; 4616 } 4617 4618 domain = &dmar_domain->domain; 4619 domain->geometry.aperture_start = 0; 4620 domain->geometry.aperture_end = 4621 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4622 domain->geometry.force_aperture = true; 4623 4624 return domain; 4625 case IOMMU_DOMAIN_IDENTITY: 4626 return &si_domain->domain; 4627 default: 4628 return NULL; 4629 } 4630 4631 return NULL; 4632 } 4633 4634 static void intel_iommu_domain_free(struct iommu_domain *domain) 4635 { 4636 if (domain != &si_domain->domain) 4637 domain_exit(to_dmar_domain(domain)); 4638 } 4639 4640 /* 4641 * Check whether a @domain could be attached to the @dev through the 4642 * aux-domain attach/detach APIs. 
4643 */ 4644 static inline bool 4645 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4646 { 4647 struct device_domain_info *info = get_domain_info(dev); 4648 4649 return info && info->auxd_enabled && 4650 domain->type == IOMMU_DOMAIN_UNMANAGED; 4651 } 4652 4653 static inline struct subdev_domain_info * 4654 lookup_subdev_info(struct dmar_domain *domain, struct device *dev) 4655 { 4656 struct subdev_domain_info *sinfo; 4657 4658 if (!list_empty(&domain->subdevices)) { 4659 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 4660 if (sinfo->pdev == dev) 4661 return sinfo; 4662 } 4663 } 4664 4665 return NULL; 4666 } 4667 4668 static int auxiliary_link_device(struct dmar_domain *domain, 4669 struct device *dev) 4670 { 4671 struct device_domain_info *info = get_domain_info(dev); 4672 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4673 4674 assert_spin_locked(&device_domain_lock); 4675 if (WARN_ON(!info)) 4676 return -EINVAL; 4677 4678 if (!sinfo) { 4679 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); 4680 if (!sinfo) 4681 return -ENOMEM; 4682 sinfo->domain = domain; 4683 sinfo->pdev = dev; 4684 list_add(&sinfo->link_phys, &info->subdevices); 4685 list_add(&sinfo->link_domain, &domain->subdevices); 4686 } 4687 4688 return ++sinfo->users; 4689 } 4690 4691 static int auxiliary_unlink_device(struct dmar_domain *domain, 4692 struct device *dev) 4693 { 4694 struct device_domain_info *info = get_domain_info(dev); 4695 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4696 int ret; 4697 4698 assert_spin_locked(&device_domain_lock); 4699 if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) 4700 return -EINVAL; 4701 4702 ret = --sinfo->users; 4703 if (!ret) { 4704 list_del(&sinfo->link_phys); 4705 list_del(&sinfo->link_domain); 4706 kfree(sinfo); 4707 } 4708 4709 return ret; 4710 } 4711 4712 static int aux_domain_add_dev(struct dmar_domain *domain, 4713 struct device *dev) 4714 { 4715 int ret; 4716 unsigned long flags; 4717 struct intel_iommu *iommu; 4718 4719 iommu = device_to_iommu(dev, NULL, NULL); 4720 if (!iommu) 4721 return -ENODEV; 4722 4723 if (domain->default_pasid <= 0) { 4724 u32 pasid; 4725 4726 /* No private data needed for the default pasid */ 4727 pasid = ioasid_alloc(NULL, PASID_MIN, 4728 pci_max_pasids(to_pci_dev(dev)) - 1, 4729 NULL); 4730 if (pasid == INVALID_IOASID) { 4731 pr_err("Can't allocate default pasid\n"); 4732 return -ENODEV; 4733 } 4734 domain->default_pasid = pasid; 4735 } 4736 4737 spin_lock_irqsave(&device_domain_lock, flags); 4738 ret = auxiliary_link_device(domain, dev); 4739 if (ret <= 0) 4740 goto link_failed; 4741 4742 /* 4743 * Subdevices from the same physical device can be attached to the 4744 * same domain. For such cases, only the first subdevice attachment 4745 * needs to go through the full steps in this function. So if ret > 4746 * 1, just goto out. 4747 */ 4748 if (ret > 1) 4749 goto out; 4750 4751 /* 4752 * iommu->lock must be held to attach domain to iommu and setup the 4753 * pasid entry for second level translation. 
4754 */ 4755 spin_lock(&iommu->lock); 4756 ret = domain_attach_iommu(domain, iommu); 4757 if (ret) 4758 goto attach_failed; 4759 4760 /* Setup the PASID entry for mediated devices: */ 4761 if (domain_use_first_level(domain)) 4762 ret = domain_setup_first_level(iommu, domain, dev, 4763 domain->default_pasid); 4764 else 4765 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4766 domain->default_pasid); 4767 if (ret) 4768 goto table_failed; 4769 4770 spin_unlock(&iommu->lock); 4771 out: 4772 spin_unlock_irqrestore(&device_domain_lock, flags); 4773 4774 return 0; 4775 4776 table_failed: 4777 domain_detach_iommu(domain, iommu); 4778 attach_failed: 4779 spin_unlock(&iommu->lock); 4780 auxiliary_unlink_device(domain, dev); 4781 link_failed: 4782 spin_unlock_irqrestore(&device_domain_lock, flags); 4783 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4784 ioasid_put(domain->default_pasid); 4785 4786 return ret; 4787 } 4788 4789 static void aux_domain_remove_dev(struct dmar_domain *domain, 4790 struct device *dev) 4791 { 4792 struct device_domain_info *info; 4793 struct intel_iommu *iommu; 4794 unsigned long flags; 4795 4796 if (!is_aux_domain(dev, &domain->domain)) 4797 return; 4798 4799 spin_lock_irqsave(&device_domain_lock, flags); 4800 info = get_domain_info(dev); 4801 iommu = info->iommu; 4802 4803 if (!auxiliary_unlink_device(domain, dev)) { 4804 spin_lock(&iommu->lock); 4805 intel_pasid_tear_down_entry(iommu, dev, 4806 domain->default_pasid, false); 4807 domain_detach_iommu(domain, iommu); 4808 spin_unlock(&iommu->lock); 4809 } 4810 4811 spin_unlock_irqrestore(&device_domain_lock, flags); 4812 4813 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4814 ioasid_put(domain->default_pasid); 4815 } 4816 4817 static int prepare_domain_attach_device(struct iommu_domain *domain, 4818 struct device *dev) 4819 { 4820 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4821 struct intel_iommu *iommu; 4822 int addr_width; 4823 4824 iommu = device_to_iommu(dev, NULL, NULL); 4825 if (!iommu) 4826 return -ENODEV; 4827 4828 if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && 4829 !ecap_nest(iommu->ecap)) { 4830 dev_err(dev, "%s: iommu not support nested translation\n", 4831 iommu->name); 4832 return -EINVAL; 4833 } 4834 4835 /* check if this iommu agaw is sufficient for max mapped address */ 4836 addr_width = agaw_to_width(iommu->agaw); 4837 if (addr_width > cap_mgaw(iommu->cap)) 4838 addr_width = cap_mgaw(iommu->cap); 4839 4840 if (dmar_domain->max_addr > (1LL << addr_width)) { 4841 dev_err(dev, "%s: iommu width (%d) is not " 4842 "sufficient for the mapped address (%llx)\n", 4843 __func__, addr_width, dmar_domain->max_addr); 4844 return -EFAULT; 4845 } 4846 dmar_domain->gaw = addr_width; 4847 4848 /* 4849 * Knock out extra levels of page tables if necessary 4850 */ 4851 while (iommu->agaw < dmar_domain->agaw) { 4852 struct dma_pte *pte; 4853 4854 pte = dmar_domain->pgd; 4855 if (dma_pte_present(pte)) { 4856 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4857 free_pgtable_page(pte); 4858 } 4859 dmar_domain->agaw--; 4860 } 4861 4862 return 0; 4863 } 4864 4865 static int intel_iommu_attach_device(struct iommu_domain *domain, 4866 struct device *dev) 4867 { 4868 int ret; 4869 4870 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4871 device_is_rmrr_locked(dev)) { 4872 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4873 return -EPERM; 4874 } 4875 4876 if (is_aux_domain(dev, domain)) 4877 return -EPERM; 4878 4879 /* normally dev is not mapped */ 4880 if (unlikely(domain_context_mapped(dev))) { 4881 struct dmar_domain *old_domain; 4882 4883 old_domain = find_domain(dev); 4884 if (old_domain) 4885 dmar_remove_one_dev_info(dev); 4886 } 4887 4888 ret = prepare_domain_attach_device(domain, dev); 4889 if (ret) 4890 return ret; 4891 4892 return domain_add_dev_info(to_dmar_domain(domain), dev); 4893 } 4894 4895 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4896 struct device *dev) 4897 { 4898 int ret; 4899 4900 if (!is_aux_domain(dev, domain)) 4901 return -EPERM; 4902 4903 ret = prepare_domain_attach_device(domain, dev); 4904 if (ret) 4905 return ret; 4906 4907 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4908 } 4909 4910 static void intel_iommu_detach_device(struct iommu_domain *domain, 4911 struct device *dev) 4912 { 4913 dmar_remove_one_dev_info(dev); 4914 } 4915 4916 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4917 struct device *dev) 4918 { 4919 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4920 } 4921 4922 #ifdef CONFIG_INTEL_IOMMU_SVM 4923 /* 4924 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4925 * VT-d granularity. Invalidation is typically included in the unmap operation 4926 * as a result of DMA or VFIO unmap. However, for assigned devices guest 4927 * owns the first level page tables. Invalidations of translation caches in the 4928 * guest are trapped and passed down to the host. 4929 * 4930 * vIOMMU in the guest will only expose first level page tables, therefore 4931 * we do not support IOTLB granularity for request without PASID (second level). 4932 * 4933 * For example, to find the VT-d granularity encoding for IOTLB 4934 * type and page selective granularity within PASID: 4935 * X: indexed by iommu cache type 4936 * Y: indexed by enum iommu_inv_granularity 4937 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4938 */ 4939 4940 static const int 4941 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4942 /* 4943 * PASID based IOTLB invalidation: PASID selective (per PASID), 4944 * page selective (address granularity) 4945 */ 4946 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4947 /* PASID based dev TLBs */ 4948 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4949 /* PASID cache */ 4950 {-EINVAL, -EINVAL, -EINVAL} 4951 }; 4952 4953 static inline int to_vtd_granularity(int type, int granu) 4954 { 4955 return inv_type_granu_table[type][granu]; 4956 } 4957 4958 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4959 { 4960 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 4961 4962 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 4963 * IOMMU cache invalidate API passes granu_size in bytes, and number of 4964 * granu size in contiguous memory. 
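 *
 * Worked example (editorial addition): granu_size = 4KB and
 * nr_granules = 512 describe 512 contiguous 4K pages, so nr_pages = 512
 * and order_base_2(512) = 9, i.e. the 2MB encoding mentioned above.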
4965 */ 4966 return order_base_2(nr_pages); 4967 } 4968 4969 static int 4970 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 4971 struct iommu_cache_invalidate_info *inv_info) 4972 { 4973 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4974 struct device_domain_info *info; 4975 struct intel_iommu *iommu; 4976 unsigned long flags; 4977 int cache_type; 4978 u8 bus, devfn; 4979 u16 did, sid; 4980 int ret = 0; 4981 u64 size = 0; 4982 4983 if (!inv_info || !dmar_domain) 4984 return -EINVAL; 4985 4986 if (!dev || !dev_is_pci(dev)) 4987 return -ENODEV; 4988 4989 iommu = device_to_iommu(dev, &bus, &devfn); 4990 if (!iommu) 4991 return -ENODEV; 4992 4993 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 4994 return -EINVAL; 4995 4996 spin_lock_irqsave(&device_domain_lock, flags); 4997 spin_lock(&iommu->lock); 4998 info = get_domain_info(dev); 4999 if (!info) { 5000 ret = -EINVAL; 5001 goto out_unlock; 5002 } 5003 did = dmar_domain->iommu_did[iommu->seq_id]; 5004 sid = PCI_DEVID(bus, devfn); 5005 5006 /* Size is only valid in address selective invalidation */ 5007 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5008 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 5009 inv_info->granu.addr_info.nb_granules); 5010 5011 for_each_set_bit(cache_type, 5012 (unsigned long *)&inv_info->cache, 5013 IOMMU_CACHE_INV_TYPE_NR) { 5014 int granu = 0; 5015 u64 pasid = 0; 5016 u64 addr = 0; 5017 5018 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5019 if (granu == -EINVAL) { 5020 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5021 cache_type, inv_info->granularity); 5022 break; 5023 } 5024 5025 /* 5026 * PASID is stored in different locations based on the 5027 * granularity. 5028 */ 5029 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5030 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5031 pasid = inv_info->granu.pasid_info.pasid; 5032 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5033 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5034 pasid = inv_info->granu.addr_info.pasid; 5035 5036 switch (BIT(cache_type)) { 5037 case IOMMU_CACHE_INV_TYPE_IOTLB: 5038 /* HW will ignore LSB bits based on address mask */ 5039 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5040 size && 5041 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5042 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5043 inv_info->granu.addr_info.addr, size); 5044 } 5045 5046 /* 5047 * If granu is PASID-selective, address is ignored. 5048 * We use npages = -1 to indicate that. 5049 */ 5050 qi_flush_piotlb(iommu, did, pasid, 5051 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 5052 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5053 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5054 5055 if (!info->ats_enabled) 5056 break; 5057 /* 5058 * Always flush device IOTLB if ATS is enabled. vIOMMU 5059 * in the guest may assume IOTLB flush is inclusive, 5060 * which is more efficient. 5061 */ 5062 fallthrough; 5063 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5064 /* 5065 * PASID based device TLB invalidation does not support 5066 * IOMMU_INV_GRANU_PASID granularity but only supports 5067 * IOMMU_INV_GRANU_ADDR. 5068 * The equivalent of that is we set the size to be the 5069 * entire range of 64 bit. User only provides PASID info 5070 * without address info. So we set addr to 0. 
5071 */ 5072 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5073 size = 64 - VTD_PAGE_SHIFT; 5074 addr = 0; 5075 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5076 addr = inv_info->granu.addr_info.addr; 5077 } 5078 5079 if (info->ats_enabled) 5080 qi_flush_dev_iotlb_pasid(iommu, sid, 5081 info->pfsid, pasid, 5082 info->ats_qdep, addr, 5083 size); 5084 else 5085 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5086 break; 5087 default: 5088 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5089 cache_type); 5090 ret = -EINVAL; 5091 } 5092 } 5093 out_unlock: 5094 spin_unlock(&iommu->lock); 5095 spin_unlock_irqrestore(&device_domain_lock, flags); 5096 5097 return ret; 5098 } 5099 #endif 5100 5101 static int intel_iommu_map(struct iommu_domain *domain, 5102 unsigned long iova, phys_addr_t hpa, 5103 size_t size, int iommu_prot, gfp_t gfp) 5104 { 5105 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5106 u64 max_addr; 5107 int prot = 0; 5108 5109 if (iommu_prot & IOMMU_READ) 5110 prot |= DMA_PTE_READ; 5111 if (iommu_prot & IOMMU_WRITE) 5112 prot |= DMA_PTE_WRITE; 5113 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5114 prot |= DMA_PTE_SNP; 5115 5116 max_addr = iova + size; 5117 if (dmar_domain->max_addr < max_addr) { 5118 u64 end; 5119 5120 /* check if minimum agaw is sufficient for mapped address */ 5121 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5122 if (end < max_addr) { 5123 pr_err("%s: iommu width (%d) is not " 5124 "sufficient for the mapped address (%llx)\n", 5125 __func__, dmar_domain->gaw, max_addr); 5126 return -EFAULT; 5127 } 5128 dmar_domain->max_addr = max_addr; 5129 } 5130 /* Round up size to next multiple of PAGE_SIZE, if it and 5131 the low bits of hpa would take us onto the next page */ 5132 size = aligned_nrpages(hpa, size); 5133 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5134 hpa >> VTD_PAGE_SHIFT, size, prot); 5135 } 5136 5137 static int intel_iommu_map_pages(struct iommu_domain *domain, 5138 unsigned long iova, phys_addr_t paddr, 5139 size_t pgsize, size_t pgcount, 5140 int prot, gfp_t gfp, size_t *mapped) 5141 { 5142 unsigned long pgshift = __ffs(pgsize); 5143 size_t size = pgcount << pgshift; 5144 int ret; 5145 5146 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 5147 return -EINVAL; 5148 5149 if (!IS_ALIGNED(iova | paddr, pgsize)) 5150 return -EINVAL; 5151 5152 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 5153 if (!ret && mapped) 5154 *mapped = size; 5155 5156 return ret; 5157 } 5158 5159 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5160 unsigned long iova, size_t size, 5161 struct iommu_iotlb_gather *gather) 5162 { 5163 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5164 unsigned long start_pfn, last_pfn; 5165 int level = 0; 5166 5167 /* Cope with horrid API which requires us to unmap more than the 5168 size argument if it happens to be a large-page mapping. 
*/ 5169 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5170 5171 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5172 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5173 5174 start_pfn = iova >> VTD_PAGE_SHIFT; 5175 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5176 5177 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); 5178 5179 if (dmar_domain->max_addr == iova + size) 5180 dmar_domain->max_addr = iova; 5181 5182 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5183 5184 return size; 5185 } 5186 5187 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 5188 unsigned long iova, 5189 size_t pgsize, size_t pgcount, 5190 struct iommu_iotlb_gather *gather) 5191 { 5192 unsigned long pgshift = __ffs(pgsize); 5193 size_t size = pgcount << pgshift; 5194 5195 return intel_iommu_unmap(domain, iova, size, gather); 5196 } 5197 5198 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5199 struct iommu_iotlb_gather *gather) 5200 { 5201 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5202 unsigned long iova_pfn = IOVA_PFN(gather->start); 5203 size_t size = gather->end - gather->start; 5204 unsigned long start_pfn; 5205 unsigned long nrpages; 5206 int iommu_id; 5207 5208 nrpages = aligned_nrpages(gather->start, size); 5209 start_pfn = mm_to_dma_pfn(iova_pfn); 5210 5211 for_each_domain_iommu(iommu_id, dmar_domain) 5212 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5213 start_pfn, nrpages, 5214 list_empty(&gather->freelist), 0); 5215 5216 put_pages_list(&gather->freelist); 5217 } 5218 5219 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5220 dma_addr_t iova) 5221 { 5222 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5223 struct dma_pte *pte; 5224 int level = 0; 5225 u64 phys = 0; 5226 5227 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5228 if (pte && dma_pte_present(pte)) 5229 phys = dma_pte_addr(pte) + 5230 (iova & (BIT_MASK(level_to_offset_bits(level) + 5231 VTD_PAGE_SHIFT) - 1)); 5232 5233 return phys; 5234 } 5235 5236 static bool intel_iommu_capable(enum iommu_cap cap) 5237 { 5238 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5239 return domain_update_iommu_snooping(NULL); 5240 if (cap == IOMMU_CAP_INTR_REMAP) 5241 return irq_remapping_enabled == 1; 5242 5243 return false; 5244 } 5245 5246 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5247 { 5248 struct intel_iommu *iommu; 5249 5250 iommu = device_to_iommu(dev, NULL, NULL); 5251 if (!iommu) 5252 return ERR_PTR(-ENODEV); 5253 5254 if (translation_pre_enabled(iommu)) 5255 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5256 5257 return &iommu->iommu; 5258 } 5259 5260 static void intel_iommu_release_device(struct device *dev) 5261 { 5262 struct intel_iommu *iommu; 5263 5264 iommu = device_to_iommu(dev, NULL, NULL); 5265 if (!iommu) 5266 return; 5267 5268 dmar_remove_one_dev_info(dev); 5269 5270 set_dma_ops(dev, NULL); 5271 } 5272 5273 static void intel_iommu_probe_finalize(struct device *dev) 5274 { 5275 set_dma_ops(dev, NULL); 5276 iommu_setup_dma_ops(dev, 0, U64_MAX); 5277 } 5278 5279 static void intel_iommu_get_resv_regions(struct device *device, 5280 struct list_head *head) 5281 { 5282 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5283 struct iommu_resv_region *reg; 5284 struct dmar_rmrr_unit *rmrr; 5285 struct device *i_dev; 5286 int i; 5287 5288 down_read(&dmar_global_lock); 5289 for_each_rmrr_units(rmrr) { 5290 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 
5291 i, i_dev) {
5292 struct iommu_resv_region *resv;
5293 enum iommu_resv_type type;
5294 size_t length;
5295
5296 if (i_dev != device &&
5297 !is_downstream_to_pci_bridge(device, i_dev))
5298 continue;
5299
5300 length = rmrr->end_address - rmrr->base_address + 1;
5301
5302 type = device_rmrr_is_relaxable(device) ?
5303 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5304
5305 resv = iommu_alloc_resv_region(rmrr->base_address,
5306 length, prot, type);
5307 if (!resv)
5308 break;
5309
5310 list_add_tail(&resv->list, head);
5311 }
5312 }
5313 up_read(&dmar_global_lock);
5314
5315 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5316 if (dev_is_pci(device)) {
5317 struct pci_dev *pdev = to_pci_dev(device);
5318
5319 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5320 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5321 IOMMU_RESV_DIRECT_RELAXABLE);
5322 if (reg)
5323 list_add_tail(&reg->list, head);
5324 }
5325 }
5326 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5327
5328 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5329 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5330 0, IOMMU_RESV_MSI);
5331 if (!reg)
5332 return;
5333 list_add_tail(&reg->list, head);
5334 }
5335
5336 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5337 {
5338 struct device_domain_info *info;
5339 struct context_entry *context;
5340 struct dmar_domain *domain;
5341 unsigned long flags;
5342 u64 ctx_lo;
5343 int ret;
5344
5345 domain = find_domain(dev);
5346 if (!domain)
5347 return -EINVAL;
5348
5349 spin_lock_irqsave(&device_domain_lock, flags);
5350 spin_lock(&iommu->lock);
5351
5352 ret = -EINVAL;
5353 info = get_domain_info(dev);
5354 if (!info || !info->pasid_supported)
5355 goto out;
5356
5357 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5358 if (WARN_ON(!context))
5359 goto out;
5360
5361 ctx_lo = context[0].lo;
5362
5363 if (!(ctx_lo & CONTEXT_PASIDE)) {
5364 ctx_lo |= CONTEXT_PASIDE;
5365 context[0].lo = ctx_lo;
5366 wmb();
5367 iommu->flush.flush_context(iommu,
5368 domain->iommu_did[iommu->seq_id],
5369 PCI_DEVID(info->bus, info->devfn),
5370 DMA_CCMD_MASK_NOBIT,
5371 DMA_CCMD_DEVICE_INVL);
5372 }
5373
5374 /* Enable PASID support in the device, if it wasn't already */
5375 if (!info->pasid_enabled)
5376 iommu_enable_dev_iotlb(info);
5377
5378 ret = 0;
5379
5380 out:
5381 spin_unlock(&iommu->lock);
5382 spin_unlock_irqrestore(&device_domain_lock, flags);
5383
5384 return ret;
5385 }
5386
5387 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5388 {
5389 if (dev_is_pci(dev))
5390 return pci_device_group(dev);
5391 return generic_device_group(dev);
5392 }
5393
5394 static int intel_iommu_enable_auxd(struct device *dev)
5395 {
5396 struct device_domain_info *info;
5397 struct intel_iommu *iommu;
5398 unsigned long flags;
5399 int ret;
5400
5401 iommu = device_to_iommu(dev, NULL, NULL);
5402 if (!iommu || dmar_disabled)
5403 return -EINVAL;
5404
5405 if (!sm_supported(iommu) || !pasid_supported(iommu))
5406 return -EINVAL;
5407
5408 ret = intel_iommu_enable_pasid(iommu, dev);
5409 if (ret)
5410 return -ENODEV;
5411
5412 spin_lock_irqsave(&device_domain_lock, flags);
5413 info = get_domain_info(dev);
5414 info->auxd_enabled = 1;
5415 spin_unlock_irqrestore(&device_domain_lock, flags);
5416
5417 return 0;
5418 }
5419
5420 static int intel_iommu_disable_auxd(struct device *dev)
5421 {
5422 struct device_domain_info *info;
5423 unsigned long flags;
5424
5425 spin_lock_irqsave(&device_domain_lock, flags);
5426 info = get_domain_info(dev);
5427 if
(!WARN_ON(!info)) 5428 info->auxd_enabled = 0; 5429 spin_unlock_irqrestore(&device_domain_lock, flags); 5430 5431 return 0; 5432 } 5433 5434 static int intel_iommu_enable_sva(struct device *dev) 5435 { 5436 struct device_domain_info *info = get_domain_info(dev); 5437 struct intel_iommu *iommu; 5438 int ret; 5439 5440 if (!info || dmar_disabled) 5441 return -EINVAL; 5442 5443 iommu = info->iommu; 5444 if (!iommu) 5445 return -EINVAL; 5446 5447 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 5448 return -ENODEV; 5449 5450 if (intel_iommu_enable_pasid(iommu, dev)) 5451 return -ENODEV; 5452 5453 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 5454 return -EINVAL; 5455 5456 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 5457 if (!ret) 5458 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 5459 5460 return ret; 5461 } 5462 5463 static int intel_iommu_disable_sva(struct device *dev) 5464 { 5465 struct device_domain_info *info = get_domain_info(dev); 5466 struct intel_iommu *iommu = info->iommu; 5467 int ret; 5468 5469 ret = iommu_unregister_device_fault_handler(dev); 5470 if (!ret) 5471 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 5472 5473 return ret; 5474 } 5475 5476 static int intel_iommu_enable_iopf(struct device *dev) 5477 { 5478 struct device_domain_info *info = get_domain_info(dev); 5479 5480 if (info && info->pri_supported) 5481 return 0; 5482 5483 return -ENODEV; 5484 } 5485 5486 static int 5487 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5488 { 5489 switch (feat) { 5490 case IOMMU_DEV_FEAT_AUX: 5491 return intel_iommu_enable_auxd(dev); 5492 5493 case IOMMU_DEV_FEAT_IOPF: 5494 return intel_iommu_enable_iopf(dev); 5495 5496 case IOMMU_DEV_FEAT_SVA: 5497 return intel_iommu_enable_sva(dev); 5498 5499 default: 5500 return -ENODEV; 5501 } 5502 } 5503 5504 static int 5505 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5506 { 5507 switch (feat) { 5508 case IOMMU_DEV_FEAT_AUX: 5509 return intel_iommu_disable_auxd(dev); 5510 5511 case IOMMU_DEV_FEAT_IOPF: 5512 return 0; 5513 5514 case IOMMU_DEV_FEAT_SVA: 5515 return intel_iommu_disable_sva(dev); 5516 5517 default: 5518 return -ENODEV; 5519 } 5520 } 5521 5522 static bool 5523 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5524 { 5525 struct device_domain_info *info = get_domain_info(dev); 5526 5527 if (feat == IOMMU_DEV_FEAT_AUX) 5528 return scalable_mode_support() && info && info->auxd_enabled; 5529 5530 return false; 5531 } 5532 5533 static int 5534 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5535 { 5536 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5537 5538 return dmar_domain->default_pasid > 0 ? 
5539 dmar_domain->default_pasid : -EINVAL; 5540 } 5541 5542 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5543 struct device *dev) 5544 { 5545 return attach_deferred(dev); 5546 } 5547 5548 static int 5549 intel_iommu_enable_nesting(struct iommu_domain *domain) 5550 { 5551 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5552 unsigned long flags; 5553 int ret = -ENODEV; 5554 5555 spin_lock_irqsave(&device_domain_lock, flags); 5556 if (list_empty(&dmar_domain->devices)) { 5557 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5558 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5559 ret = 0; 5560 } 5561 spin_unlock_irqrestore(&device_domain_lock, flags); 5562 5563 return ret; 5564 } 5565 5566 /* 5567 * Check that the device does not live on an external facing PCI port that is 5568 * marked as untrusted. Such devices should not be able to apply quirks and 5569 * thus not be able to bypass the IOMMU restrictions. 5570 */ 5571 static bool risky_device(struct pci_dev *pdev) 5572 { 5573 if (pdev->untrusted) { 5574 pci_info(pdev, 5575 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5576 pdev->vendor, pdev->device); 5577 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5578 return true; 5579 } 5580 return false; 5581 } 5582 5583 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 5584 unsigned long iova, size_t size) 5585 { 5586 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5587 unsigned long pages = aligned_nrpages(iova, size); 5588 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 5589 struct intel_iommu *iommu; 5590 int iommu_id; 5591 5592 for_each_domain_iommu(iommu_id, dmar_domain) { 5593 iommu = g_iommus[iommu_id]; 5594 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 5595 } 5596 } 5597 5598 const struct iommu_ops intel_iommu_ops = { 5599 .capable = intel_iommu_capable, 5600 .domain_alloc = intel_iommu_domain_alloc, 5601 .domain_free = intel_iommu_domain_free, 5602 .enable_nesting = intel_iommu_enable_nesting, 5603 .attach_dev = intel_iommu_attach_device, 5604 .detach_dev = intel_iommu_detach_device, 5605 .aux_attach_dev = intel_iommu_aux_attach_device, 5606 .aux_detach_dev = intel_iommu_aux_detach_device, 5607 .aux_get_pasid = intel_iommu_aux_get_pasid, 5608 .map_pages = intel_iommu_map_pages, 5609 .unmap_pages = intel_iommu_unmap_pages, 5610 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 5611 .flush_iotlb_all = intel_flush_iotlb_all, 5612 .iotlb_sync = intel_iommu_tlb_sync, 5613 .iova_to_phys = intel_iommu_iova_to_phys, 5614 .probe_device = intel_iommu_probe_device, 5615 .probe_finalize = intel_iommu_probe_finalize, 5616 .release_device = intel_iommu_release_device, 5617 .get_resv_regions = intel_iommu_get_resv_regions, 5618 .put_resv_regions = generic_iommu_put_resv_regions, 5619 .device_group = intel_iommu_device_group, 5620 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5621 .dev_enable_feat = intel_iommu_dev_enable_feat, 5622 .dev_disable_feat = intel_iommu_dev_disable_feat, 5623 .is_attach_deferred = intel_iommu_is_attach_deferred, 5624 .def_domain_type = device_def_domain_type, 5625 .pgsize_bitmap = SZ_4K, 5626 #ifdef CONFIG_INTEL_IOMMU_SVM 5627 .cache_invalidate = intel_iommu_sva_invalidate, 5628 .sva_bind_gpasid = intel_svm_bind_gpasid, 5629 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5630 .sva_bind = intel_svm_bind, 5631 .sva_unbind = intel_svm_unbind, 5632 .sva_get_pasid = intel_svm_get_pasid, 5633 .page_response = intel_svm_page_response, 5634 #endif 5635 }; 5636 
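/*
 * Editorial sketch, not part of the original driver: nothing calls the
 * callbacks above directly. Consumers go through the generic IOMMU API,
 * which dispatches to intel_iommu_ops once intel_iommu_init() has bound
 * it to the PCI bus with bus_set_iommu(). Roughly, with pdev/iova/paddr
 * as hypothetical values and error handling omitted:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);	// -> intel_iommu_attach_device()
 *	iommu_map(dom, iova, paddr, SZ_4K,
 *		  IOMMU_READ | IOMMU_WRITE);	// -> intel_iommu_map_pages()
 *	iommu_unmap(dom, iova, SZ_4K);		// -> intel_iommu_unmap_pages()
 *	iommu_detach_device(dom, &pdev->dev);	// -> intel_iommu_detach_device()
 *	iommu_domain_free(dom);			// -> intel_iommu_domain_free()
 */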
5637 static void quirk_iommu_igfx(struct pci_dev *dev) 5638 { 5639 if (risky_device(dev)) 5640 return; 5641 5642 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5643 dmar_map_gfx = 0; 5644 } 5645 5646 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5654 5655 /* Broadwell igfx malfunctions with dmar */ 5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5680 5681 static void quirk_iommu_rwbf(struct pci_dev *dev) 5682 { 5683 if (risky_device(dev)) 5684 return; 5685 5686 /* 5687 * Mobile 4 Series Chipset neglects to set RWBF capability, 5688 * but needs it. Same seems to hold for the desktop versions. 
5689 */ 5690 pci_info(dev, "Forcing write-buffer flush capability\n"); 5691 rwbf_quirk = 1; 5692 } 5693 5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5701 5702 #define GGC 0x52 5703 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5704 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5705 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5706 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5707 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5708 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5709 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5710 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5711 5712 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5713 { 5714 unsigned short ggc; 5715 5716 if (risky_device(dev)) 5717 return; 5718 5719 if (pci_read_config_word(dev, GGC, &ggc)) 5720 return; 5721 5722 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5723 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5724 dmar_map_gfx = 0; 5725 } else if (dmar_map_gfx) { 5726 /* we have to ensure the gfx device is idle before we flush */ 5727 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5728 iommu_set_dma_strict(); 5729 } 5730 } 5731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5735 5736 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5737 { 5738 unsigned short ver; 5739 5740 if (!IS_GFX_DEVICE(dev)) 5741 return; 5742 5743 ver = (dev->device >> 8) & 0xff; 5744 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5745 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5746 ver != 0x9a) 5747 return; 5748 5749 if (risky_device(dev)) 5750 return; 5751 5752 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5753 iommu_skip_te_disable = 1; 5754 } 5755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5756 5757 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5758 ISOCH DMAR unit for the Azalia sound device, but not give it any 5759 TLB entries, which causes it to deadlock. Check for that. We do 5760 this in a function called from init_dmars(), instead of in a PCI 5761 quirk, because we don't want to print the obnoxious "BIOS broken" 5762 message if VT-d is actually disabled. 5763 */ 5764 static void __init check_tylersburg_isoch(void) 5765 { 5766 struct pci_dev *pdev; 5767 uint32_t vtisochctrl; 5768 5769 /* If there's no Azalia in the system anyway, forget it. */ 5770 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5771 if (!pdev) 5772 return; 5773 5774 if (risky_device(pdev)) { 5775 pci_dev_put(pdev); 5776 return; 5777 } 5778 5779 pci_dev_put(pdev); 5780 5781 /* System Management Registers. Might be hidden, in which case 5782 we can't do the sanity check. But that's OK, because the 5783 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5784 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5785 if (!pdev) 5786 return; 5787 5788 if (risky_device(pdev)) { 5789 pci_dev_put(pdev); 5790 return; 5791 } 5792 5793 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5794 pci_dev_put(pdev); 5795 return; 5796 } 5797 5798 pci_dev_put(pdev); 5799 5800 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5801 if (vtisochctrl & 1) 5802 return; 5803 5804 /* Drop all bits other than the number of TLB entries */ 5805 vtisochctrl &= 0x1c; 5806 5807 /* If we have the recommended number of TLB entries (16), fine. */ 5808 if (vtisochctrl == 0x10) 5809 return; 5810 5811 /* Zero TLB entries? You get to ride the short bus to school. */ 5812 if (!vtisochctrl) { 5813 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5814 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5815 dmi_get_system_info(DMI_BIOS_VENDOR), 5816 dmi_get_system_info(DMI_BIOS_VERSION), 5817 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5818 iommu_identity_mapping |= IDENTMAP_AZALIA; 5819 return; 5820 } 5821 5822 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5823 vtisochctrl); 5824 } 5825