// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/intel-svm.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "../irq_remapping.h"
#include "../iommu-sva-lib.h"
#include "pasid.h"
#include "cap_audit.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
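
/*
 * Worked example (editorial note, not from the original source): with a
 * 4KiB VT-d page size (VTD_PAGE_SHIFT == 12), __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) is 256TiB - 1. For the
 * default 57-bit width, __DOMAIN_MAX_PFN(57) is (1ULL << 45) - 1; on a
 * 32-bit build DOMAIN_MAX_PFN() clamps that to ULONG_MAX so PFNs still
 * fit in an unsigned long.
 */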

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
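
/*
 * Worked example for the helpers above (editorial note, not from the
 * original source): a 4-level page table corresponds to agaw 2, so
 * agaw_to_level(2) == 4 and agaw_to_width(2) == 48; conversely
 * width_to_agaw(48) == 2 and width_to_agaw(39) == 1 (a 3-level table).
 * With 4KiB pages on both sides (PAGE_SHIFT == VTD_PAGE_SHIFT == 12),
 * mm_to_dma_pfn() and dma_to_mm_pfn() are identity conversions; they
 * only shift when the CPU page size is larger than the VT-d page size.
 */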

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;
static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn);

/*
 * set to 1 to panic the kernel if VT-d can't be successfully enabled
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
		__context_present(context) :
		__context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
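
/*
 * Summary of the legacy context-entry bit fields manipulated by the
 * accessors above (editorial note derived from those accessors, not from
 * the original source): lo[0] is the present bit, lo[1] fault-processing
 * disable, lo[3:2] the translation type, lo[11] PASID enable and
 * lo[63:12] the second-level page-table root; hi[2:0] holds the address
 * width (AGAW), hi[23:8] the domain id, and hi[3] marks an entry copied
 * from a previous kernel (kdump).
 */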
280 */ 281 static struct dmar_domain *si_domain; 282 static int hw_pass_through = 1; 283 284 #define for_each_domain_iommu(idx, domain) \ 285 for (idx = 0; idx < g_num_of_iommus; idx++) \ 286 if (domain->iommu_refcnt[idx]) 287 288 struct dmar_rmrr_unit { 289 struct list_head list; /* list of rmrr units */ 290 struct acpi_dmar_header *hdr; /* ACPI header */ 291 u64 base_address; /* reserved base address*/ 292 u64 end_address; /* reserved end address */ 293 struct dmar_dev_scope *devices; /* target devices */ 294 int devices_cnt; /* target device count */ 295 }; 296 297 struct dmar_atsr_unit { 298 struct list_head list; /* list of ATSR units */ 299 struct acpi_dmar_header *hdr; /* ACPI header */ 300 struct dmar_dev_scope *devices; /* target devices */ 301 int devices_cnt; /* target device count */ 302 u8 include_all:1; /* include all ports */ 303 }; 304 305 struct dmar_satc_unit { 306 struct list_head list; /* list of SATC units */ 307 struct acpi_dmar_header *hdr; /* ACPI header */ 308 struct dmar_dev_scope *devices; /* target devices */ 309 struct intel_iommu *iommu; /* the corresponding iommu */ 310 int devices_cnt; /* target device count */ 311 u8 atc_required:1; /* ATS is required */ 312 }; 313 314 static LIST_HEAD(dmar_atsr_units); 315 static LIST_HEAD(dmar_rmrr_units); 316 static LIST_HEAD(dmar_satc_units); 317 318 #define for_each_rmrr_units(rmrr) \ 319 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 320 321 /* bitmap for indexing intel_iommus */ 322 static int g_num_of_iommus; 323 324 static void domain_exit(struct dmar_domain *domain); 325 static void domain_remove_dev_info(struct dmar_domain *domain); 326 static void dmar_remove_one_dev_info(struct device *dev); 327 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 328 static int intel_iommu_attach_device(struct iommu_domain *domain, 329 struct device *dev); 330 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 331 dma_addr_t iova); 332 333 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 334 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 335 336 int intel_iommu_enabled = 0; 337 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 338 339 static int dmar_map_gfx = 1; 340 static int intel_iommu_superpage = 1; 341 static int iommu_identity_mapping; 342 static int iommu_skip_te_disable; 343 344 #define IDENTMAP_GFX 2 345 #define IDENTMAP_AZALIA 4 346 347 int intel_iommu_gfx_mapped; 348 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 349 350 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) 351 struct device_domain_info *get_domain_info(struct device *dev) 352 { 353 struct device_domain_info *info; 354 355 if (!dev) 356 return NULL; 357 358 info = dev_iommu_priv_get(dev); 359 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) 360 return NULL; 361 362 return info; 363 } 364 365 DEFINE_SPINLOCK(device_domain_lock); 366 static LIST_HEAD(device_domain_list); 367 368 /* 369 * Iterate over elements in device_domain_list and call the specified 370 * callback @fn against each element. 
371 */ 372 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 373 void *data), void *data) 374 { 375 int ret = 0; 376 unsigned long flags; 377 struct device_domain_info *info; 378 379 spin_lock_irqsave(&device_domain_lock, flags); 380 list_for_each_entry(info, &device_domain_list, global) { 381 ret = fn(info, data); 382 if (ret) { 383 spin_unlock_irqrestore(&device_domain_lock, flags); 384 return ret; 385 } 386 } 387 spin_unlock_irqrestore(&device_domain_lock, flags); 388 389 return 0; 390 } 391 392 const struct iommu_ops intel_iommu_ops; 393 394 static bool translation_pre_enabled(struct intel_iommu *iommu) 395 { 396 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 397 } 398 399 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 400 { 401 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 402 } 403 404 static void init_translation_status(struct intel_iommu *iommu) 405 { 406 u32 gsts; 407 408 gsts = readl(iommu->reg + DMAR_GSTS_REG); 409 if (gsts & DMA_GSTS_TES) 410 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 411 } 412 413 static int __init intel_iommu_setup(char *str) 414 { 415 if (!str) 416 return -EINVAL; 417 418 while (*str) { 419 if (!strncmp(str, "on", 2)) { 420 dmar_disabled = 0; 421 pr_info("IOMMU enabled\n"); 422 } else if (!strncmp(str, "off", 3)) { 423 dmar_disabled = 1; 424 no_platform_optin = 1; 425 pr_info("IOMMU disabled\n"); 426 } else if (!strncmp(str, "igfx_off", 8)) { 427 dmar_map_gfx = 0; 428 pr_info("Disable GFX device mapping\n"); 429 } else if (!strncmp(str, "forcedac", 8)) { 430 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 431 iommu_dma_forcedac = true; 432 } else if (!strncmp(str, "strict", 6)) { 433 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 434 iommu_set_dma_strict(); 435 } else if (!strncmp(str, "sp_off", 6)) { 436 pr_info("Disable supported super page\n"); 437 intel_iommu_superpage = 0; 438 } else if (!strncmp(str, "sm_on", 5)) { 439 pr_info("Enable scalable mode if hardware supports\n"); 440 intel_iommu_sm = 1; 441 } else if (!strncmp(str, "sm_off", 6)) { 442 pr_info("Scalable mode is disallowed\n"); 443 intel_iommu_sm = 0; 444 } else if (!strncmp(str, "tboot_noforce", 13)) { 445 pr_info("Intel-IOMMU: not forcing on after tboot. 

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
					unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
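
/*
 * Worked example (editorial note): cap_sagaw() returns a bitmap of
 * supported adjusted guest address widths. If only 4-level tables are
 * supported, the bitmap has bit 2 set; starting from
 * width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH) == width_to_agaw(57) == 3,
 * the loop in __iommu_calculate_agaw() steps down until it finds bit 2,
 * so iommu_calculate_agaw() returns 2 (a 48-bit, 4-level domain).
 */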

/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = true;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = false;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = false;
			break;
		}
	}
	rcu_read_unlock();
}

static bool domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			/*
			 * If the hardware is operating in the scalable mode,
			 * the snooping control is always supported since we
			 * always set PASID-table-entry.PGSNP bit if the domain
			 * is managed outside (UNMANAGED).
			 */
			if (!sm_supported(iommu) &&
			    !ecap_sc_support(iommu->ecap)) {
				ret = false;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage)
		return 0;

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;

	assert_spin_locked(&device_domain_lock);

	if (list_empty(&domain->devices))
		return NUMA_NO_NODE;

	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev)
			continue;

		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't a perfect answer in such a situation, so we select a
		 * first come, first served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}

static void domain_update_iotlb(struct dmar_domain *domain);

/* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
{
	unsigned long bitmap = 0;

	/*
	 * 1-level super page supports page size of 2MiB, 2-level super page
	 * supports page size of both 2MiB and 1GiB.
	 */
	if (domain->iommu_superpage == 1)
		bitmap |= SZ_2M;
	else if (domain->iommu_superpage == 2)
		bitmap |= SZ_2M | SZ_1G;

	return bitmap;
}
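
/*
 * Worked example (editorial note): domain_update_iommu_superpage()
 * returns fls() of the common superpage mask, so a mask of 0x3 (2MiB and
 * 1GiB supported everywhere) yields iommu_superpage == 2 and the helper
 * above adds SZ_2M | SZ_1G to the page-size bitmap; a mask of 0x1 yields
 * 1 and only SZ_2M is added, while 0 leaves 4KiB pages only.
 */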
729 */ 730 if (domain->iommu_superpage == 1) 731 bitmap |= SZ_2M; 732 else if (domain->iommu_superpage == 2) 733 bitmap |= SZ_2M | SZ_1G; 734 735 return bitmap; 736 } 737 738 /* Some capabilities may be different across iommus */ 739 static void domain_update_iommu_cap(struct dmar_domain *domain) 740 { 741 domain_update_iommu_coherency(domain); 742 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 743 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 744 745 /* 746 * If RHSA is missing, we should default to the device numa domain 747 * as fall back. 748 */ 749 if (domain->nid == NUMA_NO_NODE) 750 domain->nid = domain_update_device_node(domain); 751 752 /* 753 * First-level translation restricts the input-address to a 754 * canonical address (i.e., address bits 63:N have the same 755 * value as address bit [N-1], where N is 48-bits with 4-level 756 * paging and 57-bits with 5-level paging). Hence, skip bit 757 * [N-1]. 758 */ 759 if (domain_use_first_level(domain)) 760 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 761 else 762 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 763 764 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 765 domain_update_iotlb(domain); 766 } 767 768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 769 u8 devfn, int alloc) 770 { 771 struct root_entry *root = &iommu->root_entry[bus]; 772 struct context_entry *context; 773 u64 *entry; 774 775 entry = &root->lo; 776 if (sm_supported(iommu)) { 777 if (devfn >= 0x80) { 778 devfn -= 0x80; 779 entry = &root->hi; 780 } 781 devfn *= 2; 782 } 783 if (*entry & 1) 784 context = phys_to_virt(*entry & VTD_PAGE_MASK); 785 else { 786 unsigned long phy_addr; 787 if (!alloc) 788 return NULL; 789 790 context = alloc_pgtable_page(iommu->node); 791 if (!context) 792 return NULL; 793 794 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 795 phy_addr = virt_to_phys((void *)context); 796 *entry = phy_addr | 1; 797 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 798 } 799 return &context[devfn]; 800 } 801 802 static bool attach_deferred(struct device *dev) 803 { 804 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 805 } 806 807 /** 808 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 809 * sub-hierarchy of a candidate PCI-PCI bridge 810 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 811 * @bridge: the candidate PCI-PCI bridge 812 * 813 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 814 */ 815 static bool 816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 817 { 818 struct pci_dev *pdev, *pbridge; 819 820 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 821 return false; 822 823 pdev = to_pci_dev(dev); 824 pbridge = to_pci_dev(bridge); 825 826 if (pbridge->subordinate && 827 pbridge->subordinate->number <= pdev->bus->number && 828 pbridge->subordinate->busn_res.end >= pdev->bus->number) 829 return true; 830 831 return false; 832 } 833 834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 835 { 836 struct dmar_drhd_unit *drhd; 837 u32 vtbar; 838 int rc; 839 840 /* We know that this device on this chipset has its own IOMMU. 841 * If we find it under a different IOMMU, then the BIOS is lying 842 * to us. Hope that the IOMMU for this device is actually 843 * disabled, and it needs no translation... 

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);

	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

#ifdef CONFIG_DMAR_DEBUG
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
{
	struct device_domain_info *info;
	struct dma_pte *parent, *pte;
	struct dmar_domain *domain;
	int offset, level;

	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
	if (!info || !info->domain) {
		pr_info("device [%02x:%02x.%d] not probed\n",
			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
		return;
	}

	domain = info->domain;
	level = agaw_to_level(domain->agaw);
	parent = domain->pgd;
	if (!parent) {
		pr_info("no page table setup\n");
		return;
	}

	while (1) {
		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
			pr_info("PTE not present at level %d\n", level);
			break;
		}

		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);

		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}
}

void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
			  unsigned long long addr, u32 pasid)
{
	struct pasid_dir_entry *dir, *pde;
	struct pasid_entry *entries, *pte;
	struct context_entry *ctx_entry;
	struct root_entry *rt_entry;
	u8 devfn = source_id & 0xff;
	u8 bus = source_id >> 8;
	int i, dir_index, index;

	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);

	/* root entry dump */
	rt_entry = &iommu->root_entry[bus];
	if (!rt_entry) {
		pr_info("root table entry is not present\n");
		return;
	}

	if (sm_supported(iommu))
		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
			rt_entry->hi, rt_entry->lo);
	else
		pr_info("root entry: 0x%016llx", rt_entry->lo);

	/* context entry dump */
	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
	if (!ctx_entry) {
		pr_info("context table entry is not present\n");
		return;
	}

	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
		ctx_entry->hi, ctx_entry->lo);

	/* legacy mode does not require PASID entries */
	if (!sm_supported(iommu))
		goto pgtable_walk;

	/* get the pointer to pasid directory entry */
	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
	if (!dir) {
		pr_info("pasid directory entry is not present\n");
		return;
	}
	/* For request-without-pasid, get the pasid from context entry */
	if (intel_iommu_sm && pasid == INVALID_IOASID)
		pasid = PASID_RID2PASID;

	dir_index = pasid >> PASID_PDE_SHIFT;
	pde = &dir[dir_index];
	pr_info("pasid dir entry: 0x%016llx\n", pde->val);

	/* get the pointer to the pasid table entry */
	entries = get_pasid_table_from_pde(pde);
	if (!entries) {
		pr_info("pasid table entry is not present\n");
		return;
	}
	index = pasid & PASID_PTE_MASK;
	pte = &entries[index];
	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);

pgtable_walk:
	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
}
#endif
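
/*
 * Example (editorial note): the source_id decoded in dmar_fault_dump_ptes()
 * above follows the PCI requester-id layout, bus in bits 15:8 and devfn in
 * bits 7:0, so a fault reported with source_id 0x1a08 is attributed to
 * device 1a:01.0 (bus 0x1a, device 1, function 0).
 */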

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain)) {
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
				if (iommu_is_dma_domain(&domain->domain))
					pteval |= DMA_FL_PTE_ACCESS;
			}
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}

/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}

static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}

/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn,
				 struct page *freelist)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn,
				       freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}

void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines whether we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines whether we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link)
		if (info->ats_enabled) {
			has_iotlb_device = true;
			break;
		}

	if (!has_iotlb_device) {
		struct subdev_domain_info *sinfo;

		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
			info = get_domain_info(sinfo->pdev);
			if (info && info->ats_enabled) {
				has_iotlb_device = true;
				break;
			}
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}

static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	unsigned long flags;
	struct device_domain_info *info;
	struct subdev_domain_info *sinfo;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
		info = get_domain_info(sinfo->pdev);
		__iommu_flush_dev_iotlb(info, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}

/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	int idx;

	for_each_domain_iommu(idx, dmar_domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = dmar_domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(dmar_domain))
			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		      readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
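
/*
 * Worked example (editorial note): iommu_flush_iotlb_psi() converts the
 * page count into an address-mask order, mask = ilog2(roundup-to-power-
 * of-two(pages)); flushing 9 pages therefore uses mask 4, i.e. a 16-page
 * (64KiB with 4KiB pages) aligned invalidation, and falls back to a
 * domain-selective flush when mask exceeds cap_max_amask_val() or PSI is
 * not supported.
 */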

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids)
		return -ENOMEM;

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
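
/*
 * Example (editorial note): with cap_ndoms() == 256, domain-id 0 is
 * reserved above (caching mode tags invalid translations with it) and,
 * in scalable mode, one more id (FLPT_DEFAULT_DID) is reserved for
 * first-level and pass-through PASID entries, leaving 254 ids for
 * ordinary domains on that unit.
 */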

static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
	if (vccap_pasid(iommu->vccap))
		ioasid_unregister_allocator(&iommu->pasid_allocator);

#endif
}

/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(unsigned int type)
{
	/* Only SL is available in legacy mode */
	if (!scalable_mode_support())
		return false;

	/* Only one level (either FL or SL) is available, just use it */
	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
		return intel_cap_flts_sanity();

	/* Both levels are available, decide it based on domain type */
	return type != IOMMU_DOMAIN_UNMANAGED;
}

static struct dmar_domain *alloc_domain(unsigned int type)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	if (first_level_by_default(type))
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);
	INIT_LIST_HEAD(&domain->subdevices);

	return domain;
}

/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}
*domain, 2059 struct intel_iommu *iommu) 2060 { 2061 int num; 2062 2063 assert_spin_locked(&device_domain_lock); 2064 assert_spin_locked(&iommu->lock); 2065 2066 domain->iommu_refcnt[iommu->seq_id] -= 1; 2067 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 2068 num = domain->iommu_did[iommu->seq_id]; 2069 clear_bit(num, iommu->domain_ids); 2070 set_iommu_domain(iommu, num, NULL); 2071 2072 domain_update_iommu_cap(domain); 2073 domain->iommu_did[iommu->seq_id] = 0; 2074 } 2075 } 2076 2077 static inline int guestwidth_to_adjustwidth(int gaw) 2078 { 2079 int agaw; 2080 int r = (gaw - 12) % 9; 2081 2082 if (r == 0) 2083 agaw = gaw; 2084 else 2085 agaw = gaw + 9 - r; 2086 if (agaw > 64) 2087 agaw = 64; 2088 return agaw; 2089 } 2090 2091 static void domain_exit(struct dmar_domain *domain) 2092 { 2093 2094 /* Remove associated devices and clear attached or cached domains */ 2095 domain_remove_dev_info(domain); 2096 2097 if (domain->pgd) { 2098 struct page *freelist; 2099 2100 freelist = domain_unmap(domain, 0, 2101 DOMAIN_MAX_PFN(domain->gaw), NULL); 2102 dma_free_pagelist(freelist); 2103 } 2104 2105 free_domain_mem(domain); 2106 } 2107 2108 /* 2109 * Get the PASID directory size for scalable mode context entry. 2110 * Value of X in the PDTS field of a scalable mode context entry 2111 * indicates PASID directory with 2^(X + 7) entries. 2112 */ 2113 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 2114 { 2115 int pds, max_pde; 2116 2117 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2118 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 2119 if (pds < 7) 2120 return 0; 2121 2122 return pds - 7; 2123 } 2124 2125 /* 2126 * Set the RID_PASID field of a scalable mode context entry. The 2127 * IOMMU hardware will use the PASID value set in this field for 2128 * DMA translations of DMA requests without PASID. 2129 */ 2130 static inline void 2131 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2132 { 2133 context->hi |= pasid & ((1 << 20) - 1); 2134 } 2135 2136 /* 2137 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2138 * entry. 2139 */ 2140 static inline void context_set_sm_dte(struct context_entry *context) 2141 { 2142 context->lo |= (1 << 2); 2143 } 2144 2145 /* 2146 * Set the PRE(Page Request Enable) field of a scalable mode context 2147 * entry. 2148 */ 2149 static inline void context_set_sm_pre(struct context_entry *context) 2150 { 2151 context->lo |= (1 << 4); 2152 } 2153 2154 /* Convert value to context PASID directory size field coding. 
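 *
 * Illustrative example (assumed values, for clarity only): with
 * max_pasid = 0x10000 and 64 PASIDs per directory entry
 * (i.e. PASID_PDE_SHIFT == 6), the directory needs 1024 PDEs, so
 * context_get_sm_pds() returns 3 (2^(3 + 7) = 1024) and context_pdts(3)
 * encodes that value into bits 11:9 of the low context qword.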
*/ 2155 #define context_pdts(pds) (((pds) & 0x7) << 9) 2156 2157 static int domain_context_mapping_one(struct dmar_domain *domain, 2158 struct intel_iommu *iommu, 2159 struct pasid_table *table, 2160 u8 bus, u8 devfn) 2161 { 2162 u16 did = domain->iommu_did[iommu->seq_id]; 2163 int translation = CONTEXT_TT_MULTI_LEVEL; 2164 struct device_domain_info *info = NULL; 2165 struct context_entry *context; 2166 unsigned long flags; 2167 int ret; 2168 2169 WARN_ON(did == 0); 2170 2171 if (hw_pass_through && domain_type_is_si(domain)) 2172 translation = CONTEXT_TT_PASS_THROUGH; 2173 2174 pr_debug("Set context mapping for %02x:%02x.%d\n", 2175 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2176 2177 BUG_ON(!domain->pgd); 2178 2179 spin_lock_irqsave(&device_domain_lock, flags); 2180 spin_lock(&iommu->lock); 2181 2182 ret = -ENOMEM; 2183 context = iommu_context_addr(iommu, bus, devfn, 1); 2184 if (!context) 2185 goto out_unlock; 2186 2187 ret = 0; 2188 if (context_present(context)) 2189 goto out_unlock; 2190 2191 /* 2192 * For kdump cases, old valid entries may be cached due to the 2193 * in-flight DMA and copied pgtable, but there is no unmapping 2194 * behaviour for them, thus we need an explicit cache flush for 2195 * the newly-mapped device. For kdump, at this point, the device 2196 * is supposed to finish reset at its driver probe stage, so no 2197 * in-flight DMA will exist, and we don't need to worry anymore 2198 * hereafter. 2199 */ 2200 if (context_copied(context)) { 2201 u16 did_old = context_domain_id(context); 2202 2203 if (did_old < cap_ndoms(iommu->cap)) { 2204 iommu->flush.flush_context(iommu, did_old, 2205 (((u16)bus) << 8) | devfn, 2206 DMA_CCMD_MASK_NOBIT, 2207 DMA_CCMD_DEVICE_INVL); 2208 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2209 DMA_TLB_DSI_FLUSH); 2210 } 2211 } 2212 2213 context_clear_entry(context); 2214 2215 if (sm_supported(iommu)) { 2216 unsigned long pds; 2217 2218 WARN_ON(!table); 2219 2220 /* Setup the PASID DIR pointer: */ 2221 pds = context_get_sm_pds(table); 2222 context->lo = (u64)virt_to_phys(table->table) | 2223 context_pdts(pds); 2224 2225 /* Setup the RID_PASID field: */ 2226 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2227 2228 /* 2229 * Setup the Device-TLB enable bit and Page request 2230 * Enable bit: 2231 */ 2232 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2233 if (info && info->ats_supported) 2234 context_set_sm_dte(context); 2235 if (info && info->pri_supported) 2236 context_set_sm_pre(context); 2237 } else { 2238 struct dma_pte *pgd = domain->pgd; 2239 int agaw; 2240 2241 context_set_domain_id(context, did); 2242 2243 if (translation != CONTEXT_TT_PASS_THROUGH) { 2244 /* 2245 * Skip top levels of page tables for iommu which has 2246 * less agaw than default. Unnecessary for PT mode. 2247 */ 2248 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2249 ret = -ENOMEM; 2250 pgd = phys_to_virt(dma_pte_addr(pgd)); 2251 if (!dma_pte_present(pgd)) 2252 goto out_unlock; 2253 } 2254 2255 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2256 if (info && info->ats_supported) 2257 translation = CONTEXT_TT_DEV_IOTLB; 2258 else 2259 translation = CONTEXT_TT_MULTI_LEVEL; 2260 2261 context_set_address_root(context, virt_to_phys(pgd)); 2262 context_set_address_width(context, agaw); 2263 } else { 2264 /* 2265 * In pass through mode, AW must be programmed to 2266 * indicate the largest AGAW value supported by 2267 * hardware. And ASR is ignored by hardware. 
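 * (For example, a unit whose largest supported second-level format is
 * a 4-level table reports msagaw == 2, so the AW field below is
 * programmed to 2, i.e. a 48-bit address width.)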
2268 */ 2269 context_set_address_width(context, iommu->msagaw); 2270 } 2271 2272 context_set_translation_type(context, translation); 2273 } 2274 2275 context_set_fault_enable(context); 2276 context_set_present(context); 2277 if (!ecap_coherent(iommu->ecap)) 2278 clflush_cache_range(context, sizeof(*context)); 2279 2280 /* 2281 * It's a non-present to present mapping. If hardware doesn't cache 2282 * non-present entry we only need to flush the write-buffer. If the 2283 * _does_ cache non-present entries, then it does so in the special 2284 * domain #0, which we have to flush: 2285 */ 2286 if (cap_caching_mode(iommu->cap)) { 2287 iommu->flush.flush_context(iommu, 0, 2288 (((u16)bus) << 8) | devfn, 2289 DMA_CCMD_MASK_NOBIT, 2290 DMA_CCMD_DEVICE_INVL); 2291 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2292 } else { 2293 iommu_flush_write_buffer(iommu); 2294 } 2295 iommu_enable_dev_iotlb(info); 2296 2297 ret = 0; 2298 2299 out_unlock: 2300 spin_unlock(&iommu->lock); 2301 spin_unlock_irqrestore(&device_domain_lock, flags); 2302 2303 return ret; 2304 } 2305 2306 struct domain_context_mapping_data { 2307 struct dmar_domain *domain; 2308 struct intel_iommu *iommu; 2309 struct pasid_table *table; 2310 }; 2311 2312 static int domain_context_mapping_cb(struct pci_dev *pdev, 2313 u16 alias, void *opaque) 2314 { 2315 struct domain_context_mapping_data *data = opaque; 2316 2317 return domain_context_mapping_one(data->domain, data->iommu, 2318 data->table, PCI_BUS_NUM(alias), 2319 alias & 0xff); 2320 } 2321 2322 static int 2323 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2324 { 2325 struct domain_context_mapping_data data; 2326 struct pasid_table *table; 2327 struct intel_iommu *iommu; 2328 u8 bus, devfn; 2329 2330 iommu = device_to_iommu(dev, &bus, &devfn); 2331 if (!iommu) 2332 return -ENODEV; 2333 2334 table = intel_pasid_get_table(dev); 2335 2336 if (!dev_is_pci(dev)) 2337 return domain_context_mapping_one(domain, iommu, table, 2338 bus, devfn); 2339 2340 data.domain = domain; 2341 data.iommu = iommu; 2342 data.table = table; 2343 2344 return pci_for_each_dma_alias(to_pci_dev(dev), 2345 &domain_context_mapping_cb, &data); 2346 } 2347 2348 static int domain_context_mapped_cb(struct pci_dev *pdev, 2349 u16 alias, void *opaque) 2350 { 2351 struct intel_iommu *iommu = opaque; 2352 2353 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2354 } 2355 2356 static int domain_context_mapped(struct device *dev) 2357 { 2358 struct intel_iommu *iommu; 2359 u8 bus, devfn; 2360 2361 iommu = device_to_iommu(dev, &bus, &devfn); 2362 if (!iommu) 2363 return -ENODEV; 2364 2365 if (!dev_is_pci(dev)) 2366 return device_context_mapped(iommu, bus, devfn); 2367 2368 return !pci_for_each_dma_alias(to_pci_dev(dev), 2369 domain_context_mapped_cb, iommu); 2370 } 2371 2372 /* Returns a number of VTD pages, but aligned to MM page size */ 2373 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2374 size_t size) 2375 { 2376 host_addr &= ~PAGE_MASK; 2377 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2378 } 2379 2380 /* Return largest possible superpage level for a given mapping */ 2381 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2382 unsigned long iov_pfn, 2383 unsigned long phy_pfn, 2384 unsigned long pages) 2385 { 2386 int support, level = 1; 2387 unsigned long pfnmerge; 2388 2389 support = domain->iommu_superpage; 2390 2391 /* To use a large page, the virtual *and* physical addresses 2392 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2393 of them will mean we have to use smaller pages. So just 2394 merge them and check both at once. */ 2395 pfnmerge = iov_pfn | phy_pfn; 2396 2397 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2398 pages >>= VTD_STRIDE_SHIFT; 2399 if (!pages) 2400 break; 2401 pfnmerge >>= VTD_STRIDE_SHIFT; 2402 level++; 2403 support--; 2404 } 2405 return level; 2406 } 2407 2408 /* 2409 * Ensure that old small page tables are removed to make room for superpage(s). 2410 * We're going to add new large pages, so make sure we don't remove their parent 2411 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2412 */ 2413 static void switch_to_super_page(struct dmar_domain *domain, 2414 unsigned long start_pfn, 2415 unsigned long end_pfn, int level) 2416 { 2417 unsigned long lvl_pages = lvl_to_nr_pages(level); 2418 struct dma_pte *pte = NULL; 2419 int i; 2420 2421 while (start_pfn <= end_pfn) { 2422 if (!pte) 2423 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2424 2425 if (dma_pte_present(pte)) { 2426 dma_pte_free_pagetable(domain, start_pfn, 2427 start_pfn + lvl_pages - 1, 2428 level + 1); 2429 2430 for_each_domain_iommu(i, domain) 2431 iommu_flush_iotlb_psi(g_iommus[i], domain, 2432 start_pfn, lvl_pages, 2433 0, 0); 2434 } 2435 2436 pte++; 2437 start_pfn += lvl_pages; 2438 if (first_pte_in_page(pte)) 2439 pte = NULL; 2440 } 2441 } 2442 2443 static int 2444 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2445 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2446 { 2447 struct dma_pte *first_pte = NULL, *pte = NULL; 2448 unsigned int largepage_lvl = 0; 2449 unsigned long lvl_pages = 0; 2450 phys_addr_t pteval; 2451 u64 attr; 2452 2453 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2454 2455 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2456 return -EINVAL; 2457 2458 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2459 attr |= DMA_FL_PTE_PRESENT; 2460 if (domain_use_first_level(domain)) { 2461 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2462 if (prot & DMA_PTE_WRITE) 2463 attr |= DMA_FL_PTE_DIRTY; 2464 } 2465 2466 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2467 2468 while (nr_pages > 0) { 2469 uint64_t tmp; 2470 2471 if (!pte) { 2472 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2473 phys_pfn, nr_pages); 2474 2475 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2476 if (!pte) 2477 return -ENOMEM; 2478 first_pte = pte; 2479 2480 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2481 2482 /* It is large page*/ 2483 if (largepage_lvl > 1) { 2484 unsigned long end_pfn; 2485 unsigned long pages_to_remove; 2486 2487 pteval |= DMA_PTE_LARGE_PAGE; 2488 pages_to_remove = min_t(unsigned long, nr_pages, 2489 nr_pte_to_next_page(pte) * lvl_pages); 2490 end_pfn = iov_pfn + pages_to_remove - 1; 2491 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2492 } else { 2493 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2494 } 2495 2496 } 2497 /* We don't need lock here, nobody else 2498 * touches the iova range 2499 */ 2500 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2501 if (tmp) { 2502 static int dumps = 5; 2503 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2504 iov_pfn, tmp, (unsigned long long)pteval); 2505 if (dumps) { 2506 dumps--; 2507 debug_dma_dump_mappings(NULL); 2508 } 2509 WARN_ON(1); 2510 } 2511 2512 nr_pages -= lvl_pages; 2513 iov_pfn += lvl_pages; 2514 phys_pfn += lvl_pages; 2515 pteval += lvl_pages * 
VTD_PAGE_SIZE; 2516 2517 /* If the next PTE would be the first in a new page, then we 2518 * need to flush the cache on the entries we've just written. 2519 * And then we'll need to recalculate 'pte', so clear it and 2520 * let it get set again in the if (!pte) block above. 2521 * 2522 * If we're done (!nr_pages) we need to flush the cache too. 2523 * 2524 * Also if we've been setting superpages, we may need to 2525 * recalculate 'pte' and switch back to smaller pages for the 2526 * end of the mapping, if the trailing size is not enough to 2527 * use another superpage (i.e. nr_pages < lvl_pages). 2528 */ 2529 pte++; 2530 if (!nr_pages || first_pte_in_page(pte) || 2531 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2532 domain_flush_cache(domain, first_pte, 2533 (void *)pte - (void *)first_pte); 2534 pte = NULL; 2535 } 2536 } 2537 2538 return 0; 2539 } 2540 2541 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2542 { 2543 struct intel_iommu *iommu = info->iommu; 2544 struct context_entry *context; 2545 unsigned long flags; 2546 u16 did_old; 2547 2548 if (!iommu) 2549 return; 2550 2551 spin_lock_irqsave(&iommu->lock, flags); 2552 context = iommu_context_addr(iommu, bus, devfn, 0); 2553 if (!context) { 2554 spin_unlock_irqrestore(&iommu->lock, flags); 2555 return; 2556 } 2557 2558 if (sm_supported(iommu)) { 2559 if (hw_pass_through && domain_type_is_si(info->domain)) 2560 did_old = FLPT_DEFAULT_DID; 2561 else 2562 did_old = info->domain->iommu_did[iommu->seq_id]; 2563 } else { 2564 did_old = context_domain_id(context); 2565 } 2566 2567 context_clear_entry(context); 2568 __iommu_flush_cache(iommu, context, sizeof(*context)); 2569 spin_unlock_irqrestore(&iommu->lock, flags); 2570 iommu->flush.flush_context(iommu, 2571 did_old, 2572 (((u16)bus) << 8) | devfn, 2573 DMA_CCMD_MASK_NOBIT, 2574 DMA_CCMD_DEVICE_INVL); 2575 2576 if (sm_supported(iommu)) 2577 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2578 2579 iommu->flush.flush_iotlb(iommu, 2580 did_old, 2581 0, 2582 0, 2583 DMA_TLB_DSI_FLUSH); 2584 2585 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2586 } 2587 2588 static inline void unlink_domain_info(struct device_domain_info *info) 2589 { 2590 assert_spin_locked(&device_domain_lock); 2591 list_del(&info->link); 2592 list_del(&info->global); 2593 if (info->dev) 2594 dev_iommu_priv_set(info->dev, NULL); 2595 } 2596 2597 static void domain_remove_dev_info(struct dmar_domain *domain) 2598 { 2599 struct device_domain_info *info, *tmp; 2600 unsigned long flags; 2601 2602 spin_lock_irqsave(&device_domain_lock, flags); 2603 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2604 __dmar_remove_one_dev_info(info); 2605 spin_unlock_irqrestore(&device_domain_lock, flags); 2606 } 2607 2608 struct dmar_domain *find_domain(struct device *dev) 2609 { 2610 struct device_domain_info *info; 2611 2612 if (unlikely(!dev || !dev->iommu)) 2613 return NULL; 2614 2615 if (unlikely(attach_deferred(dev))) 2616 return NULL; 2617 2618 /* No lock here, assumes no domain exit in normal case */ 2619 info = get_domain_info(dev); 2620 if (likely(info)) 2621 return info->domain; 2622 2623 return NULL; 2624 } 2625 2626 static inline struct device_domain_info * 2627 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2628 { 2629 struct device_domain_info *info; 2630 2631 list_for_each_entry(info, &device_domain_list, global) 2632 if (info->segment == segment && info->bus == bus && 2633 info->devfn == devfn) 2634 return info; 2635 2636 return NULL; 
2637 } 2638 2639 static int domain_setup_first_level(struct intel_iommu *iommu, 2640 struct dmar_domain *domain, 2641 struct device *dev, 2642 u32 pasid) 2643 { 2644 struct dma_pte *pgd = domain->pgd; 2645 int agaw, level; 2646 int flags = 0; 2647 2648 /* 2649 * Skip top levels of page tables for iommu which has 2650 * less agaw than default. Unnecessary for PT mode. 2651 */ 2652 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2653 pgd = phys_to_virt(dma_pte_addr(pgd)); 2654 if (!dma_pte_present(pgd)) 2655 return -ENOMEM; 2656 } 2657 2658 level = agaw_to_level(agaw); 2659 if (level != 4 && level != 5) 2660 return -EINVAL; 2661 2662 if (pasid != PASID_RID2PASID) 2663 flags |= PASID_FLAG_SUPERVISOR_MODE; 2664 if (level == 5) 2665 flags |= PASID_FLAG_FL5LP; 2666 2667 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2668 flags |= PASID_FLAG_PAGE_SNOOP; 2669 2670 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2671 domain->iommu_did[iommu->seq_id], 2672 flags); 2673 } 2674 2675 static bool dev_is_real_dma_subdevice(struct device *dev) 2676 { 2677 return dev && dev_is_pci(dev) && 2678 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2679 } 2680 2681 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2682 int bus, int devfn, 2683 struct device *dev, 2684 struct dmar_domain *domain) 2685 { 2686 struct dmar_domain *found = NULL; 2687 struct device_domain_info *info; 2688 unsigned long flags; 2689 int ret; 2690 2691 info = alloc_devinfo_mem(); 2692 if (!info) 2693 return NULL; 2694 2695 if (!dev_is_real_dma_subdevice(dev)) { 2696 info->bus = bus; 2697 info->devfn = devfn; 2698 info->segment = iommu->segment; 2699 } else { 2700 struct pci_dev *pdev = to_pci_dev(dev); 2701 2702 info->bus = pdev->bus->number; 2703 info->devfn = pdev->devfn; 2704 info->segment = pci_domain_nr(pdev->bus); 2705 } 2706 2707 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2708 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2709 info->ats_qdep = 0; 2710 info->dev = dev; 2711 info->domain = domain; 2712 info->iommu = iommu; 2713 info->pasid_table = NULL; 2714 info->auxd_enabled = 0; 2715 INIT_LIST_HEAD(&info->subdevices); 2716 2717 if (dev && dev_is_pci(dev)) { 2718 struct pci_dev *pdev = to_pci_dev(info->dev); 2719 2720 if (ecap_dev_iotlb_support(iommu->ecap) && 2721 pci_ats_supported(pdev) && 2722 dmar_find_matched_atsr_unit(pdev)) 2723 info->ats_supported = 1; 2724 2725 if (sm_supported(iommu)) { 2726 if (pasid_supported(iommu)) { 2727 int features = pci_pasid_features(pdev); 2728 if (features >= 0) 2729 info->pasid_supported = features | 1; 2730 } 2731 2732 if (info->ats_supported && ecap_prs(iommu->ecap) && 2733 pci_pri_supported(pdev)) 2734 info->pri_supported = 1; 2735 } 2736 } 2737 2738 spin_lock_irqsave(&device_domain_lock, flags); 2739 if (dev) 2740 found = find_domain(dev); 2741 2742 if (!found) { 2743 struct device_domain_info *info2; 2744 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2745 info->devfn); 2746 if (info2) { 2747 found = info2->domain; 2748 info2->dev = dev; 2749 } 2750 } 2751 2752 if (found) { 2753 spin_unlock_irqrestore(&device_domain_lock, flags); 2754 free_devinfo_mem(info); 2755 /* Caller must free the original domain */ 2756 return found; 2757 } 2758 2759 spin_lock(&iommu->lock); 2760 ret = domain_attach_iommu(domain, iommu); 2761 spin_unlock(&iommu->lock); 2762 2763 if (ret) { 2764 spin_unlock_irqrestore(&device_domain_lock, flags); 2765 free_devinfo_mem(info); 2766 
return NULL; 2767 } 2768 2769 list_add(&info->link, &domain->devices); 2770 list_add(&info->global, &device_domain_list); 2771 if (dev) 2772 dev_iommu_priv_set(dev, info); 2773 spin_unlock_irqrestore(&device_domain_lock, flags); 2774 2775 /* PASID table is mandatory for a PCI device in scalable mode. */ 2776 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2777 ret = intel_pasid_alloc_table(dev); 2778 if (ret) { 2779 dev_err(dev, "PASID table allocation failed\n"); 2780 dmar_remove_one_dev_info(dev); 2781 return NULL; 2782 } 2783 2784 /* Setup the PASID entry for requests without PASID: */ 2785 spin_lock_irqsave(&iommu->lock, flags); 2786 if (hw_pass_through && domain_type_is_si(domain)) 2787 ret = intel_pasid_setup_pass_through(iommu, domain, 2788 dev, PASID_RID2PASID); 2789 else if (domain_use_first_level(domain)) 2790 ret = domain_setup_first_level(iommu, domain, dev, 2791 PASID_RID2PASID); 2792 else 2793 ret = intel_pasid_setup_second_level(iommu, domain, 2794 dev, PASID_RID2PASID); 2795 spin_unlock_irqrestore(&iommu->lock, flags); 2796 if (ret) { 2797 dev_err(dev, "Setup RID2PASID failed\n"); 2798 dmar_remove_one_dev_info(dev); 2799 return NULL; 2800 } 2801 } 2802 2803 if (dev && domain_context_mapping(domain, dev)) { 2804 dev_err(dev, "Domain context map failed\n"); 2805 dmar_remove_one_dev_info(dev); 2806 return NULL; 2807 } 2808 2809 return domain; 2810 } 2811 2812 static int iommu_domain_identity_map(struct dmar_domain *domain, 2813 unsigned long first_vpfn, 2814 unsigned long last_vpfn) 2815 { 2816 /* 2817 * RMRR range might have overlap with physical memory range, 2818 * clear it first 2819 */ 2820 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2821 2822 return __domain_mapping(domain, first_vpfn, 2823 first_vpfn, last_vpfn - first_vpfn + 1, 2824 DMA_PTE_READ|DMA_PTE_WRITE); 2825 } 2826 2827 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2828 2829 static int __init si_domain_init(int hw) 2830 { 2831 struct dmar_rmrr_unit *rmrr; 2832 struct device *dev; 2833 int i, nid, ret; 2834 2835 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2836 if (!si_domain) 2837 return -EFAULT; 2838 2839 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2840 domain_exit(si_domain); 2841 return -EFAULT; 2842 } 2843 2844 if (hw) 2845 return 0; 2846 2847 for_each_online_node(nid) { 2848 unsigned long start_pfn, end_pfn; 2849 int i; 2850 2851 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2852 ret = iommu_domain_identity_map(si_domain, 2853 mm_to_dma_pfn(start_pfn), 2854 mm_to_dma_pfn(end_pfn)); 2855 if (ret) 2856 return ret; 2857 } 2858 } 2859 2860 /* 2861 * Identity map the RMRRs so that devices with RMRRs could also use 2862 * the si_domain. 
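 * Each RMRR is mapped 1:1 (IOVA == physical address): the byte range
 * [base_address, end_address] is converted to DMA pfns via PAGE_SHIFT
 * and mm_to_dma_pfn() before being handed to
 * iommu_domain_identity_map(), just like the memblock ranges above.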
2863 */ 2864 for_each_rmrr_units(rmrr) { 2865 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2866 i, dev) { 2867 unsigned long long start = rmrr->base_address; 2868 unsigned long long end = rmrr->end_address; 2869 2870 if (WARN_ON(end < start || 2871 end >> agaw_to_width(si_domain->agaw))) 2872 continue; 2873 2874 ret = iommu_domain_identity_map(si_domain, 2875 mm_to_dma_pfn(start >> PAGE_SHIFT), 2876 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2877 if (ret) 2878 return ret; 2879 } 2880 } 2881 2882 return 0; 2883 } 2884 2885 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2886 { 2887 struct dmar_domain *ndomain; 2888 struct intel_iommu *iommu; 2889 u8 bus, devfn; 2890 2891 iommu = device_to_iommu(dev, &bus, &devfn); 2892 if (!iommu) 2893 return -ENODEV; 2894 2895 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2896 if (ndomain != domain) 2897 return -EBUSY; 2898 2899 return 0; 2900 } 2901 2902 static bool device_has_rmrr(struct device *dev) 2903 { 2904 struct dmar_rmrr_unit *rmrr; 2905 struct device *tmp; 2906 int i; 2907 2908 rcu_read_lock(); 2909 for_each_rmrr_units(rmrr) { 2910 /* 2911 * Return TRUE if this RMRR contains the device that 2912 * is passed in. 2913 */ 2914 for_each_active_dev_scope(rmrr->devices, 2915 rmrr->devices_cnt, i, tmp) 2916 if (tmp == dev || 2917 is_downstream_to_pci_bridge(dev, tmp)) { 2918 rcu_read_unlock(); 2919 return true; 2920 } 2921 } 2922 rcu_read_unlock(); 2923 return false; 2924 } 2925 2926 /** 2927 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2928 * is relaxable (ie. is allowed to be not enforced under some conditions) 2929 * @dev: device handle 2930 * 2931 * We assume that PCI USB devices with RMRRs have them largely 2932 * for historical reasons and that the RMRR space is not actively used post 2933 * boot. This exclusion may change if vendors begin to abuse it. 2934 * 2935 * The same exception is made for graphics devices, with the requirement that 2936 * any use of the RMRR regions will be torn down before assigning the device 2937 * to a guest. 2938 * 2939 * Return: true if the RMRR is relaxable, false otherwise 2940 */ 2941 static bool device_rmrr_is_relaxable(struct device *dev) 2942 { 2943 struct pci_dev *pdev; 2944 2945 if (!dev_is_pci(dev)) 2946 return false; 2947 2948 pdev = to_pci_dev(dev); 2949 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2950 return true; 2951 else 2952 return false; 2953 } 2954 2955 /* 2956 * There are a couple cases where we need to restrict the functionality of 2957 * devices associated with RMRRs. The first is when evaluating a device for 2958 * identity mapping because problems exist when devices are moved in and out 2959 * of domains and their respective RMRR information is lost. This means that 2960 * a device with associated RMRRs will never be in a "passthrough" domain. 2961 * The second is use of the device through the IOMMU API. This interface 2962 * expects to have full control of the IOVA space for the device. We cannot 2963 * satisfy both the requirement that RMRR access is maintained and have an 2964 * unencumbered IOVA space. We also have no ability to quiesce the device's 2965 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2966 * We therefore prevent devices associated with an RMRR from participating in 2967 * the IOMMU API, which eliminates them from device assignment. 2968 * 2969 * In both cases, devices which have relaxable RMRRs are not concerned by this 2970 * restriction. 
See device_rmrr_is_relaxable comment. 2971 */ 2972 static bool device_is_rmrr_locked(struct device *dev) 2973 { 2974 if (!device_has_rmrr(dev)) 2975 return false; 2976 2977 if (device_rmrr_is_relaxable(dev)) 2978 return false; 2979 2980 return true; 2981 } 2982 2983 /* 2984 * Return the required default domain type for a specific device. 2985 * 2986 * @dev: the device in query 2987 * @startup: true if this is during early boot 2988 * 2989 * Returns: 2990 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2991 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2992 * - 0: both identity and dynamic domains work for this device 2993 */ 2994 static int device_def_domain_type(struct device *dev) 2995 { 2996 if (dev_is_pci(dev)) { 2997 struct pci_dev *pdev = to_pci_dev(dev); 2998 2999 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 3000 return IOMMU_DOMAIN_IDENTITY; 3001 3002 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 3003 return IOMMU_DOMAIN_IDENTITY; 3004 } 3005 3006 return 0; 3007 } 3008 3009 static void intel_iommu_init_qi(struct intel_iommu *iommu) 3010 { 3011 /* 3012 * Start from the sane iommu hardware state. 3013 * If the queued invalidation is already initialized by us 3014 * (for example, while enabling interrupt-remapping) then 3015 * we got the things already rolling from a sane state. 3016 */ 3017 if (!iommu->qi) { 3018 /* 3019 * Clear any previous faults. 3020 */ 3021 dmar_fault(-1, iommu); 3022 /* 3023 * Disable queued invalidation if supported and already enabled 3024 * before OS handover. 3025 */ 3026 dmar_disable_qi(iommu); 3027 } 3028 3029 if (dmar_enable_qi(iommu)) { 3030 /* 3031 * Queued Invalidate not enabled, use Register Based Invalidate 3032 */ 3033 iommu->flush.flush_context = __iommu_flush_context; 3034 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 3035 pr_info("%s: Using Register based invalidation\n", 3036 iommu->name); 3037 } else { 3038 iommu->flush.flush_context = qi_flush_context; 3039 iommu->flush.flush_iotlb = qi_flush_iotlb; 3040 pr_info("%s: Using Queued invalidation\n", iommu->name); 3041 } 3042 } 3043 3044 static int copy_context_table(struct intel_iommu *iommu, 3045 struct root_entry *old_re, 3046 struct context_entry **tbl, 3047 int bus, bool ext) 3048 { 3049 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 3050 struct context_entry *new_ce = NULL, ce; 3051 struct context_entry *old_ce = NULL; 3052 struct root_entry re; 3053 phys_addr_t old_ce_phys; 3054 3055 tbl_idx = ext ? bus * 2 : bus; 3056 memcpy(&re, old_re, sizeof(re)); 3057 3058 for (devfn = 0; devfn < 256; devfn++) { 3059 /* First calculate the correct index */ 3060 idx = (ext ? 
devfn * 2 : devfn) % 256; 3061 3062 if (idx == 0) { 3063 /* First save what we may have and clean up */ 3064 if (new_ce) { 3065 tbl[tbl_idx] = new_ce; 3066 __iommu_flush_cache(iommu, new_ce, 3067 VTD_PAGE_SIZE); 3068 pos = 1; 3069 } 3070 3071 if (old_ce) 3072 memunmap(old_ce); 3073 3074 ret = 0; 3075 if (devfn < 0x80) 3076 old_ce_phys = root_entry_lctp(&re); 3077 else 3078 old_ce_phys = root_entry_uctp(&re); 3079 3080 if (!old_ce_phys) { 3081 if (ext && devfn == 0) { 3082 /* No LCTP, try UCTP */ 3083 devfn = 0x7f; 3084 continue; 3085 } else { 3086 goto out; 3087 } 3088 } 3089 3090 ret = -ENOMEM; 3091 old_ce = memremap(old_ce_phys, PAGE_SIZE, 3092 MEMREMAP_WB); 3093 if (!old_ce) 3094 goto out; 3095 3096 new_ce = alloc_pgtable_page(iommu->node); 3097 if (!new_ce) 3098 goto out_unmap; 3099 3100 ret = 0; 3101 } 3102 3103 /* Now copy the context entry */ 3104 memcpy(&ce, old_ce + idx, sizeof(ce)); 3105 3106 if (!__context_present(&ce)) 3107 continue; 3108 3109 did = context_domain_id(&ce); 3110 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3111 set_bit(did, iommu->domain_ids); 3112 3113 /* 3114 * We need a marker for copied context entries. This 3115 * marker needs to work for the old format as well as 3116 * for extended context entries. 3117 * 3118 * Bit 67 of the context entry is used. In the old 3119 * format this bit is available to software, in the 3120 * extended format it is the PGE bit, but PGE is ignored 3121 * by HW if PASIDs are disabled (and thus still 3122 * available). 3123 * 3124 * So disable PASIDs first and then mark the entry 3125 * copied. This means that we don't copy PASID 3126 * translations from the old kernel, but this is fine as 3127 * faults there are not fatal. 3128 */ 3129 context_clear_pasid_enable(&ce); 3130 context_set_copied(&ce); 3131 3132 new_ce[idx] = ce; 3133 } 3134 3135 tbl[tbl_idx + pos] = new_ce; 3136 3137 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3138 3139 out_unmap: 3140 memunmap(old_ce); 3141 3142 out: 3143 return ret; 3144 } 3145 3146 static int copy_translation_tables(struct intel_iommu *iommu) 3147 { 3148 struct context_entry **ctxt_tbls; 3149 struct root_entry *old_rt; 3150 phys_addr_t old_rt_phys; 3151 int ctxt_table_entries; 3152 unsigned long flags; 3153 u64 rtaddr_reg; 3154 int bus, ret; 3155 bool new_ext, ext; 3156 3157 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3158 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3159 new_ext = !!ecap_ecs(iommu->ecap); 3160 3161 /* 3162 * The RTT bit can only be changed when translation is disabled, 3163 * but disabling translation means to open a window for data 3164 * corruption. So bail out and don't copy anything if we would 3165 * have to change the bit. 3166 */ 3167 if (new_ext != ext) 3168 return -EINVAL; 3169 3170 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3171 if (!old_rt_phys) 3172 return -EINVAL; 3173 3174 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3175 if (!old_rt) 3176 return -ENOMEM; 3177 3178 /* This is too big for the stack - allocate it from slab */ 3179 ctxt_table_entries = ext ? 
512 : 256; 3180 ret = -ENOMEM; 3181 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3182 if (!ctxt_tbls) 3183 goto out_unmap; 3184 3185 for (bus = 0; bus < 256; bus++) { 3186 ret = copy_context_table(iommu, &old_rt[bus], 3187 ctxt_tbls, bus, ext); 3188 if (ret) { 3189 pr_err("%s: Failed to copy context table for bus %d\n", 3190 iommu->name, bus); 3191 continue; 3192 } 3193 } 3194 3195 spin_lock_irqsave(&iommu->lock, flags); 3196 3197 /* Context tables are copied, now write them to the root_entry table */ 3198 for (bus = 0; bus < 256; bus++) { 3199 int idx = ext ? bus * 2 : bus; 3200 u64 val; 3201 3202 if (ctxt_tbls[idx]) { 3203 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3204 iommu->root_entry[bus].lo = val; 3205 } 3206 3207 if (!ext || !ctxt_tbls[idx + 1]) 3208 continue; 3209 3210 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3211 iommu->root_entry[bus].hi = val; 3212 } 3213 3214 spin_unlock_irqrestore(&iommu->lock, flags); 3215 3216 kfree(ctxt_tbls); 3217 3218 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3219 3220 ret = 0; 3221 3222 out_unmap: 3223 memunmap(old_rt); 3224 3225 return ret; 3226 } 3227 3228 #ifdef CONFIG_INTEL_IOMMU_SVM 3229 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3230 { 3231 struct intel_iommu *iommu = data; 3232 ioasid_t ioasid; 3233 3234 if (!iommu) 3235 return INVALID_IOASID; 3236 /* 3237 * VT-d virtual command interface always uses the full 20 bit 3238 * PASID range. Host can partition guest PASID range based on 3239 * policies but it is out of guest's control. 3240 */ 3241 if (min < PASID_MIN || max > intel_pasid_max_id) 3242 return INVALID_IOASID; 3243 3244 if (vcmd_alloc_pasid(iommu, &ioasid)) 3245 return INVALID_IOASID; 3246 3247 return ioasid; 3248 } 3249 3250 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3251 { 3252 struct intel_iommu *iommu = data; 3253 3254 if (!iommu) 3255 return; 3256 /* 3257 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3258 * We can only free the PASID when all the devices are unbound. 3259 */ 3260 if (ioasid_find(NULL, ioasid, NULL)) { 3261 pr_alert("Cannot free active IOASID %d\n", ioasid); 3262 return; 3263 } 3264 vcmd_free_pasid(iommu, ioasid); 3265 } 3266 3267 static void register_pasid_allocator(struct intel_iommu *iommu) 3268 { 3269 /* 3270 * If we are running in the host, no need for custom allocator 3271 * in that PASIDs are allocated from the host system-wide. 3272 */ 3273 if (!cap_caching_mode(iommu->cap)) 3274 return; 3275 3276 if (!sm_supported(iommu)) { 3277 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3278 return; 3279 } 3280 3281 /* 3282 * Register a custom PASID allocator if we are running in a guest, 3283 * guest PASID must be obtained via virtual command interface. 3284 * There can be multiple vIOMMUs in each guest but only one allocator 3285 * is active. All vIOMMU allocators will eventually be calling the same 3286 * host allocator. 3287 */ 3288 if (!vccap_pasid(iommu->vccap)) 3289 return; 3290 3291 pr_info("Register custom PASID allocator\n"); 3292 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3293 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3294 iommu->pasid_allocator.pdata = (void *)iommu; 3295 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3296 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3297 /* 3298 * Disable scalable mode on this IOMMU if there 3299 * is no custom allocator. 
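 * Without the virtual command allocator a guest has no way to obtain
 * PASIDs from the host, so scalable mode could not be used safely on
 * this unit anyway.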
Mixing SM capable vIOMMU 3300 * and non-SM vIOMMU are not supported. 3301 */ 3302 intel_iommu_sm = 0; 3303 } 3304 } 3305 #endif 3306 3307 static int __init init_dmars(void) 3308 { 3309 struct dmar_drhd_unit *drhd; 3310 struct intel_iommu *iommu; 3311 int ret; 3312 3313 /* 3314 * for each drhd 3315 * allocate root 3316 * initialize and program root entry to not present 3317 * endfor 3318 */ 3319 for_each_drhd_unit(drhd) { 3320 /* 3321 * lock not needed as this is only incremented in the single 3322 * threaded kernel __init code path all other access are read 3323 * only 3324 */ 3325 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3326 g_num_of_iommus++; 3327 continue; 3328 } 3329 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3330 } 3331 3332 /* Preallocate enough resources for IOMMU hot-addition */ 3333 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3334 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3335 3336 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3337 GFP_KERNEL); 3338 if (!g_iommus) { 3339 ret = -ENOMEM; 3340 goto error; 3341 } 3342 3343 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3344 if (ret) 3345 goto free_iommu; 3346 3347 for_each_iommu(iommu, drhd) { 3348 if (drhd->ignored) { 3349 iommu_disable_translation(iommu); 3350 continue; 3351 } 3352 3353 /* 3354 * Find the max pasid size of all IOMMU's in the system. 3355 * We need to ensure the system pasid table is no bigger 3356 * than the smallest supported. 3357 */ 3358 if (pasid_supported(iommu)) { 3359 u32 temp = 2 << ecap_pss(iommu->ecap); 3360 3361 intel_pasid_max_id = min_t(u32, temp, 3362 intel_pasid_max_id); 3363 } 3364 3365 g_iommus[iommu->seq_id] = iommu; 3366 3367 intel_iommu_init_qi(iommu); 3368 3369 ret = iommu_init_domains(iommu); 3370 if (ret) 3371 goto free_iommu; 3372 3373 init_translation_status(iommu); 3374 3375 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3376 iommu_disable_translation(iommu); 3377 clear_translation_pre_enabled(iommu); 3378 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3379 iommu->name); 3380 } 3381 3382 /* 3383 * TBD: 3384 * we could share the same root & context tables 3385 * among all IOMMU's. Need to Split it later. 3386 */ 3387 ret = iommu_alloc_root_entry(iommu); 3388 if (ret) 3389 goto free_iommu; 3390 3391 if (translation_pre_enabled(iommu)) { 3392 pr_info("Translation already enabled - trying to copy translation structures\n"); 3393 3394 ret = copy_translation_tables(iommu); 3395 if (ret) { 3396 /* 3397 * We found the IOMMU with translation 3398 * enabled - but failed to copy over the 3399 * old root-entry table. Try to proceed 3400 * by disabling translation now and 3401 * allocating a clean root-entry table. 3402 * This might cause DMAR faults, but 3403 * probably the dump will still succeed. 3404 */ 3405 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3406 iommu->name); 3407 iommu_disable_translation(iommu); 3408 clear_translation_pre_enabled(iommu); 3409 } else { 3410 pr_info("Copied translation tables from previous kernel for %s\n", 3411 iommu->name); 3412 } 3413 } 3414 3415 if (!ecap_pass_through(iommu->ecap)) 3416 hw_pass_through = 0; 3417 intel_svm_check(iommu); 3418 } 3419 3420 /* 3421 * Now that qi is enabled on all iommus, set the root entry and flush 3422 * caches. This is required on some Intel X58 chipsets, otherwise the 3423 * flush_context function will loop forever and the boot hangs. 
3424 */ 3425 for_each_active_iommu(iommu, drhd) { 3426 iommu_flush_write_buffer(iommu); 3427 #ifdef CONFIG_INTEL_IOMMU_SVM 3428 register_pasid_allocator(iommu); 3429 #endif 3430 iommu_set_root_entry(iommu); 3431 } 3432 3433 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3434 dmar_map_gfx = 0; 3435 #endif 3436 3437 if (!dmar_map_gfx) 3438 iommu_identity_mapping |= IDENTMAP_GFX; 3439 3440 check_tylersburg_isoch(); 3441 3442 ret = si_domain_init(hw_pass_through); 3443 if (ret) 3444 goto free_iommu; 3445 3446 /* 3447 * for each drhd 3448 * enable fault log 3449 * global invalidate context cache 3450 * global invalidate iotlb 3451 * enable translation 3452 */ 3453 for_each_iommu(iommu, drhd) { 3454 if (drhd->ignored) { 3455 /* 3456 * we always have to disable PMRs or DMA may fail on 3457 * this device 3458 */ 3459 if (force_on) 3460 iommu_disable_protect_mem_regions(iommu); 3461 continue; 3462 } 3463 3464 iommu_flush_write_buffer(iommu); 3465 3466 #ifdef CONFIG_INTEL_IOMMU_SVM 3467 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3468 /* 3469 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3470 * could cause possible lock race condition. 3471 */ 3472 up_write(&dmar_global_lock); 3473 ret = intel_svm_enable_prq(iommu); 3474 down_write(&dmar_global_lock); 3475 if (ret) 3476 goto free_iommu; 3477 } 3478 #endif 3479 ret = dmar_set_interrupt(iommu); 3480 if (ret) 3481 goto free_iommu; 3482 } 3483 3484 return 0; 3485 3486 free_iommu: 3487 for_each_active_iommu(iommu, drhd) { 3488 disable_dmar_iommu(iommu); 3489 free_dmar_iommu(iommu); 3490 } 3491 3492 kfree(g_iommus); 3493 3494 error: 3495 return ret; 3496 } 3497 3498 static inline int iommu_domain_cache_init(void) 3499 { 3500 int ret = 0; 3501 3502 iommu_domain_cache = kmem_cache_create("iommu_domain", 3503 sizeof(struct dmar_domain), 3504 0, 3505 SLAB_HWCACHE_ALIGN, 3506 3507 NULL); 3508 if (!iommu_domain_cache) { 3509 pr_err("Couldn't create iommu_domain cache\n"); 3510 ret = -ENOMEM; 3511 } 3512 3513 return ret; 3514 } 3515 3516 static inline int iommu_devinfo_cache_init(void) 3517 { 3518 int ret = 0; 3519 3520 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3521 sizeof(struct device_domain_info), 3522 0, 3523 SLAB_HWCACHE_ALIGN, 3524 NULL); 3525 if (!iommu_devinfo_cache) { 3526 pr_err("Couldn't create devinfo cache\n"); 3527 ret = -ENOMEM; 3528 } 3529 3530 return ret; 3531 } 3532 3533 static int __init iommu_init_mempool(void) 3534 { 3535 int ret; 3536 ret = iova_cache_get(); 3537 if (ret) 3538 return ret; 3539 3540 ret = iommu_domain_cache_init(); 3541 if (ret) 3542 goto domain_error; 3543 3544 ret = iommu_devinfo_cache_init(); 3545 if (!ret) 3546 return ret; 3547 3548 kmem_cache_destroy(iommu_domain_cache); 3549 domain_error: 3550 iova_cache_put(); 3551 3552 return -ENOMEM; 3553 } 3554 3555 static void __init iommu_exit_mempool(void) 3556 { 3557 kmem_cache_destroy(iommu_devinfo_cache); 3558 kmem_cache_destroy(iommu_domain_cache); 3559 iova_cache_put(); 3560 } 3561 3562 static void __init init_no_remapping_devices(void) 3563 { 3564 struct dmar_drhd_unit *drhd; 3565 struct device *dev; 3566 int i; 3567 3568 for_each_drhd_unit(drhd) { 3569 if (!drhd->include_all) { 3570 for_each_active_dev_scope(drhd->devices, 3571 drhd->devices_cnt, i, dev) 3572 break; 3573 /* ignore DMAR unit if no devices exist */ 3574 if (i == drhd->devices_cnt) 3575 drhd->ignored = 1; 3576 } 3577 } 3578 3579 for_each_active_drhd_unit(drhd) { 3580 if (drhd->include_all) 3581 continue; 3582 3583 for_each_active_dev_scope(drhd->devices, 3584 drhd->devices_cnt, i, 
dev) 3585 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3586 break; 3587 if (i < drhd->devices_cnt) 3588 continue; 3589 3590 /* This IOMMU has *only* gfx devices. Either bypass it or 3591 set the gfx_mapped flag, as appropriate */ 3592 drhd->gfx_dedicated = 1; 3593 if (!dmar_map_gfx) 3594 drhd->ignored = 1; 3595 } 3596 } 3597 3598 #ifdef CONFIG_SUSPEND 3599 static int init_iommu_hw(void) 3600 { 3601 struct dmar_drhd_unit *drhd; 3602 struct intel_iommu *iommu = NULL; 3603 3604 for_each_active_iommu(iommu, drhd) 3605 if (iommu->qi) 3606 dmar_reenable_qi(iommu); 3607 3608 for_each_iommu(iommu, drhd) { 3609 if (drhd->ignored) { 3610 /* 3611 * we always have to disable PMRs or DMA may fail on 3612 * this device 3613 */ 3614 if (force_on) 3615 iommu_disable_protect_mem_regions(iommu); 3616 continue; 3617 } 3618 3619 iommu_flush_write_buffer(iommu); 3620 iommu_set_root_entry(iommu); 3621 iommu_enable_translation(iommu); 3622 iommu_disable_protect_mem_regions(iommu); 3623 } 3624 3625 return 0; 3626 } 3627 3628 static void iommu_flush_all(void) 3629 { 3630 struct dmar_drhd_unit *drhd; 3631 struct intel_iommu *iommu; 3632 3633 for_each_active_iommu(iommu, drhd) { 3634 iommu->flush.flush_context(iommu, 0, 0, 0, 3635 DMA_CCMD_GLOBAL_INVL); 3636 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3637 DMA_TLB_GLOBAL_FLUSH); 3638 } 3639 } 3640 3641 static int iommu_suspend(void) 3642 { 3643 struct dmar_drhd_unit *drhd; 3644 struct intel_iommu *iommu = NULL; 3645 unsigned long flag; 3646 3647 for_each_active_iommu(iommu, drhd) { 3648 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3649 GFP_KERNEL); 3650 if (!iommu->iommu_state) 3651 goto nomem; 3652 } 3653 3654 iommu_flush_all(); 3655 3656 for_each_active_iommu(iommu, drhd) { 3657 iommu_disable_translation(iommu); 3658 3659 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3660 3661 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3662 readl(iommu->reg + DMAR_FECTL_REG); 3663 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3664 readl(iommu->reg + DMAR_FEDATA_REG); 3665 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3666 readl(iommu->reg + DMAR_FEADDR_REG); 3667 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3668 readl(iommu->reg + DMAR_FEUADDR_REG); 3669 3670 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3671 } 3672 return 0; 3673 3674 nomem: 3675 for_each_active_iommu(iommu, drhd) 3676 kfree(iommu->iommu_state); 3677 3678 return -ENOMEM; 3679 } 3680 3681 static void iommu_resume(void) 3682 { 3683 struct dmar_drhd_unit *drhd; 3684 struct intel_iommu *iommu = NULL; 3685 unsigned long flag; 3686 3687 if (init_iommu_hw()) { 3688 if (force_on) 3689 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3690 else 3691 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3692 return; 3693 } 3694 3695 for_each_active_iommu(iommu, drhd) { 3696 3697 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3698 3699 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3700 iommu->reg + DMAR_FECTL_REG); 3701 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3702 iommu->reg + DMAR_FEDATA_REG); 3703 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3704 iommu->reg + DMAR_FEADDR_REG); 3705 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3706 iommu->reg + DMAR_FEUADDR_REG); 3707 3708 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3709 } 3710 3711 for_each_active_iommu(iommu, drhd) 3712 kfree(iommu->iommu_state); 3713 } 3714 3715 static struct syscore_ops iommu_syscore_ops = { 3716 .resume = iommu_resume, 3717 .suspend = iommu_suspend, 3718 }; 3719 3720 static void __init 
init_iommu_pm_ops(void) 3721 { 3722 register_syscore_ops(&iommu_syscore_ops); 3723 } 3724 3725 #else 3726 static inline void init_iommu_pm_ops(void) {} 3727 #endif /* CONFIG_PM */ 3728 3729 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3730 { 3731 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3732 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3733 rmrr->end_address <= rmrr->base_address || 3734 arch_rmrr_sanity_check(rmrr)) 3735 return -EINVAL; 3736 3737 return 0; 3738 } 3739 3740 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3741 { 3742 struct acpi_dmar_reserved_memory *rmrr; 3743 struct dmar_rmrr_unit *rmrru; 3744 3745 rmrr = (struct acpi_dmar_reserved_memory *)header; 3746 if (rmrr_sanity_check(rmrr)) { 3747 pr_warn(FW_BUG 3748 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3749 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3750 rmrr->base_address, rmrr->end_address, 3751 dmi_get_system_info(DMI_BIOS_VENDOR), 3752 dmi_get_system_info(DMI_BIOS_VERSION), 3753 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3754 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3755 } 3756 3757 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3758 if (!rmrru) 3759 goto out; 3760 3761 rmrru->hdr = header; 3762 3763 rmrru->base_address = rmrr->base_address; 3764 rmrru->end_address = rmrr->end_address; 3765 3766 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3767 ((void *)rmrr) + rmrr->header.length, 3768 &rmrru->devices_cnt); 3769 if (rmrru->devices_cnt && rmrru->devices == NULL) 3770 goto free_rmrru; 3771 3772 list_add(&rmrru->list, &dmar_rmrr_units); 3773 3774 return 0; 3775 free_rmrru: 3776 kfree(rmrru); 3777 out: 3778 return -ENOMEM; 3779 } 3780 3781 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3782 { 3783 struct dmar_atsr_unit *atsru; 3784 struct acpi_dmar_atsr *tmp; 3785 3786 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3787 dmar_rcu_check()) { 3788 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3789 if (atsr->segment != tmp->segment) 3790 continue; 3791 if (atsr->header.length != tmp->header.length) 3792 continue; 3793 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3794 return atsru; 3795 } 3796 3797 return NULL; 3798 } 3799 3800 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3801 { 3802 struct acpi_dmar_atsr *atsr; 3803 struct dmar_atsr_unit *atsru; 3804 3805 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3806 return 0; 3807 3808 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3809 atsru = dmar_find_atsr(atsr); 3810 if (atsru) 3811 return 0; 3812 3813 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3814 if (!atsru) 3815 return -ENOMEM; 3816 3817 /* 3818 * If memory is allocated from slab by ACPI _DSM method, we need to 3819 * copy the memory content because the memory buffer will be freed 3820 * on return. 
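 * The header is therefore copied into the tail of the dmar_atsr_unit
 * allocation (the memory immediately after *atsru) rather than being
 * referenced in place.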
3821 */ 3822 atsru->hdr = (void *)(atsru + 1); 3823 memcpy(atsru->hdr, hdr, hdr->length); 3824 atsru->include_all = atsr->flags & 0x1; 3825 if (!atsru->include_all) { 3826 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3827 (void *)atsr + atsr->header.length, 3828 &atsru->devices_cnt); 3829 if (atsru->devices_cnt && atsru->devices == NULL) { 3830 kfree(atsru); 3831 return -ENOMEM; 3832 } 3833 } 3834 3835 list_add_rcu(&atsru->list, &dmar_atsr_units); 3836 3837 return 0; 3838 } 3839 3840 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3841 { 3842 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3843 kfree(atsru); 3844 } 3845 3846 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3847 { 3848 struct acpi_dmar_atsr *atsr; 3849 struct dmar_atsr_unit *atsru; 3850 3851 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3852 atsru = dmar_find_atsr(atsr); 3853 if (atsru) { 3854 list_del_rcu(&atsru->list); 3855 synchronize_rcu(); 3856 intel_iommu_free_atsr(atsru); 3857 } 3858 3859 return 0; 3860 } 3861 3862 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3863 { 3864 int i; 3865 struct device *dev; 3866 struct acpi_dmar_atsr *atsr; 3867 struct dmar_atsr_unit *atsru; 3868 3869 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3870 atsru = dmar_find_atsr(atsr); 3871 if (!atsru) 3872 return 0; 3873 3874 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3875 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3876 i, dev) 3877 return -EBUSY; 3878 } 3879 3880 return 0; 3881 } 3882 3883 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3884 { 3885 struct dmar_satc_unit *satcu; 3886 struct acpi_dmar_satc *tmp; 3887 3888 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3889 dmar_rcu_check()) { 3890 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3891 if (satc->segment != tmp->segment) 3892 continue; 3893 if (satc->header.length != tmp->header.length) 3894 continue; 3895 if (memcmp(satc, tmp, satc->header.length) == 0) 3896 return satcu; 3897 } 3898 3899 return NULL; 3900 } 3901 3902 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3903 { 3904 struct acpi_dmar_satc *satc; 3905 struct dmar_satc_unit *satcu; 3906 3907 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3908 return 0; 3909 3910 satc = container_of(hdr, struct acpi_dmar_satc, header); 3911 satcu = dmar_find_satc(satc); 3912 if (satcu) 3913 return 0; 3914 3915 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3916 if (!satcu) 3917 return -ENOMEM; 3918 3919 satcu->hdr = (void *)(satcu + 1); 3920 memcpy(satcu->hdr, hdr, hdr->length); 3921 satcu->atc_required = satc->flags & 0x1; 3922 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3923 (void *)satc + satc->header.length, 3924 &satcu->devices_cnt); 3925 if (satcu->devices_cnt && !satcu->devices) { 3926 kfree(satcu); 3927 return -ENOMEM; 3928 } 3929 list_add_rcu(&satcu->list, &dmar_satc_units); 3930 3931 return 0; 3932 } 3933 3934 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3935 { 3936 int sp, ret; 3937 struct intel_iommu *iommu = dmaru->iommu; 3938 3939 if (g_iommus[iommu->seq_id]) 3940 return 0; 3941 3942 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3943 if (ret) 3944 goto out; 3945 3946 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3947 pr_warn("%s: Doesn't support hardware pass through.\n", 3948 iommu->name); 3949 return -ENXIO; 3950 } 3951 if (!ecap_sc_support(iommu->ecap) && 3952 
domain_update_iommu_snooping(iommu)) { 3953 pr_warn("%s: Doesn't support snooping.\n", 3954 iommu->name); 3955 return -ENXIO; 3956 } 3957 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3958 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3959 pr_warn("%s: Doesn't support large page.\n", 3960 iommu->name); 3961 return -ENXIO; 3962 } 3963 3964 /* 3965 * Disable translation if already enabled prior to OS handover. 3966 */ 3967 if (iommu->gcmd & DMA_GCMD_TE) 3968 iommu_disable_translation(iommu); 3969 3970 g_iommus[iommu->seq_id] = iommu; 3971 ret = iommu_init_domains(iommu); 3972 if (ret == 0) 3973 ret = iommu_alloc_root_entry(iommu); 3974 if (ret) 3975 goto out; 3976 3977 intel_svm_check(iommu); 3978 3979 if (dmaru->ignored) { 3980 /* 3981 * we always have to disable PMRs or DMA may fail on this device 3982 */ 3983 if (force_on) 3984 iommu_disable_protect_mem_regions(iommu); 3985 return 0; 3986 } 3987 3988 intel_iommu_init_qi(iommu); 3989 iommu_flush_write_buffer(iommu); 3990 3991 #ifdef CONFIG_INTEL_IOMMU_SVM 3992 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3993 ret = intel_svm_enable_prq(iommu); 3994 if (ret) 3995 goto disable_iommu; 3996 } 3997 #endif 3998 ret = dmar_set_interrupt(iommu); 3999 if (ret) 4000 goto disable_iommu; 4001 4002 iommu_set_root_entry(iommu); 4003 iommu_enable_translation(iommu); 4004 4005 iommu_disable_protect_mem_regions(iommu); 4006 return 0; 4007 4008 disable_iommu: 4009 disable_dmar_iommu(iommu); 4010 out: 4011 free_dmar_iommu(iommu); 4012 return ret; 4013 } 4014 4015 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 4016 { 4017 int ret = 0; 4018 struct intel_iommu *iommu = dmaru->iommu; 4019 4020 if (!intel_iommu_enabled) 4021 return 0; 4022 if (iommu == NULL) 4023 return -EINVAL; 4024 4025 if (insert) { 4026 ret = intel_iommu_add(dmaru); 4027 } else { 4028 disable_dmar_iommu(iommu); 4029 free_dmar_iommu(iommu); 4030 } 4031 4032 return ret; 4033 } 4034 4035 static void intel_iommu_free_dmars(void) 4036 { 4037 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4038 struct dmar_atsr_unit *atsru, *atsr_n; 4039 struct dmar_satc_unit *satcu, *satc_n; 4040 4041 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4042 list_del(&rmrru->list); 4043 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4044 kfree(rmrru); 4045 } 4046 4047 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4048 list_del(&atsru->list); 4049 intel_iommu_free_atsr(atsru); 4050 } 4051 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 4052 list_del(&satcu->list); 4053 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 4054 kfree(satcu); 4055 } 4056 } 4057 4058 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4059 { 4060 int i, ret = 1; 4061 struct pci_bus *bus; 4062 struct pci_dev *bridge = NULL; 4063 struct device *tmp; 4064 struct acpi_dmar_atsr *atsr; 4065 struct dmar_atsr_unit *atsru; 4066 4067 dev = pci_physfn(dev); 4068 for (bus = dev->bus; bus; bus = bus->parent) { 4069 bridge = bus->self; 4070 /* If it's an integrated device, allow ATS */ 4071 if (!bridge) 4072 return 1; 4073 /* Connected via non-PCIe: no ATS */ 4074 if (!pci_is_pcie(bridge) || 4075 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4076 return 0; 4077 /* If we found the root port, look it up in the ATSR */ 4078 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4079 break; 4080 } 4081 4082 rcu_read_lock(); 4083 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4084 atsr = container_of(atsru->hdr, struct 
acpi_dmar_atsr, header); 4085 if (atsr->segment != pci_domain_nr(dev->bus)) 4086 continue; 4087 4088 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4089 if (tmp == &bridge->dev) 4090 goto out; 4091 4092 if (atsru->include_all) 4093 goto out; 4094 } 4095 ret = 0; 4096 out: 4097 rcu_read_unlock(); 4098 4099 return ret; 4100 } 4101 4102 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4103 { 4104 int ret; 4105 struct dmar_rmrr_unit *rmrru; 4106 struct dmar_atsr_unit *atsru; 4107 struct dmar_satc_unit *satcu; 4108 struct acpi_dmar_atsr *atsr; 4109 struct acpi_dmar_reserved_memory *rmrr; 4110 struct acpi_dmar_satc *satc; 4111 4112 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4113 return 0; 4114 4115 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4116 rmrr = container_of(rmrru->hdr, 4117 struct acpi_dmar_reserved_memory, header); 4118 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4119 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4120 ((void *)rmrr) + rmrr->header.length, 4121 rmrr->segment, rmrru->devices, 4122 rmrru->devices_cnt); 4123 if (ret < 0) 4124 return ret; 4125 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4126 dmar_remove_dev_scope(info, rmrr->segment, 4127 rmrru->devices, rmrru->devices_cnt); 4128 } 4129 } 4130 4131 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4132 if (atsru->include_all) 4133 continue; 4134 4135 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4136 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4137 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4138 (void *)atsr + atsr->header.length, 4139 atsr->segment, atsru->devices, 4140 atsru->devices_cnt); 4141 if (ret > 0) 4142 break; 4143 else if (ret < 0) 4144 return ret; 4145 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4146 if (dmar_remove_dev_scope(info, atsr->segment, 4147 atsru->devices, atsru->devices_cnt)) 4148 break; 4149 } 4150 } 4151 list_for_each_entry(satcu, &dmar_satc_units, list) { 4152 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 4153 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4154 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 4155 (void *)satc + satc->header.length, 4156 satc->segment, satcu->devices, 4157 satcu->devices_cnt); 4158 if (ret > 0) 4159 break; 4160 else if (ret < 0) 4161 return ret; 4162 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4163 if (dmar_remove_dev_scope(info, satc->segment, 4164 satcu->devices, satcu->devices_cnt)) 4165 break; 4166 } 4167 } 4168 4169 return 0; 4170 } 4171 4172 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4173 unsigned long val, void *v) 4174 { 4175 struct memory_notify *mhp = v; 4176 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4177 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4178 mhp->nr_pages - 1); 4179 4180 switch (val) { 4181 case MEM_GOING_ONLINE: 4182 if (iommu_domain_identity_map(si_domain, 4183 start_vpfn, last_vpfn)) { 4184 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4185 start_vpfn, last_vpfn); 4186 return NOTIFY_BAD; 4187 } 4188 break; 4189 4190 case MEM_OFFLINE: 4191 case MEM_CANCEL_ONLINE: 4192 { 4193 struct dmar_drhd_unit *drhd; 4194 struct intel_iommu *iommu; 4195 struct page *freelist; 4196 4197 freelist = domain_unmap(si_domain, 4198 start_vpfn, last_vpfn, 4199 NULL); 4200 4201 rcu_read_lock(); 4202 for_each_active_iommu(iommu, drhd) 4203 iommu_flush_iotlb_psi(iommu, si_domain, 4204 start_vpfn, mhp->nr_pages, 4205 !freelist, 0); 4206 
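/*
 * The !freelist argument acts as the invalidation hint (IH): when
 * domain_unmap() freed no page-table pages, only leaf IOTLB entries
 * changed and the paging-structure caches need not be invalidated.
 */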
rcu_read_unlock(); 4207 dma_free_pagelist(freelist); 4208 } 4209 break; 4210 } 4211 4212 return NOTIFY_OK; 4213 } 4214 4215 static struct notifier_block intel_iommu_memory_nb = { 4216 .notifier_call = intel_iommu_memory_notifier, 4217 .priority = 0 4218 }; 4219 4220 static void intel_disable_iommus(void) 4221 { 4222 struct intel_iommu *iommu = NULL; 4223 struct dmar_drhd_unit *drhd; 4224 4225 for_each_iommu(iommu, drhd) 4226 iommu_disable_translation(iommu); 4227 } 4228 4229 void intel_iommu_shutdown(void) 4230 { 4231 struct dmar_drhd_unit *drhd; 4232 struct intel_iommu *iommu = NULL; 4233 4234 if (no_iommu || dmar_disabled) 4235 return; 4236 4237 down_write(&dmar_global_lock); 4238 4239 /* Disable PMRs explicitly here. */ 4240 for_each_iommu(iommu, drhd) 4241 iommu_disable_protect_mem_regions(iommu); 4242 4243 /* Make sure the IOMMUs are switched off */ 4244 intel_disable_iommus(); 4245 4246 up_write(&dmar_global_lock); 4247 } 4248 4249 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4250 { 4251 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4252 4253 return container_of(iommu_dev, struct intel_iommu, iommu); 4254 } 4255 4256 static ssize_t version_show(struct device *dev, 4257 struct device_attribute *attr, char *buf) 4258 { 4259 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4260 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4261 return sprintf(buf, "%d:%d\n", 4262 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4263 } 4264 static DEVICE_ATTR_RO(version); 4265 4266 static ssize_t address_show(struct device *dev, 4267 struct device_attribute *attr, char *buf) 4268 { 4269 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4270 return sprintf(buf, "%llx\n", iommu->reg_phys); 4271 } 4272 static DEVICE_ATTR_RO(address); 4273 4274 static ssize_t cap_show(struct device *dev, 4275 struct device_attribute *attr, char *buf) 4276 { 4277 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4278 return sprintf(buf, "%llx\n", iommu->cap); 4279 } 4280 static DEVICE_ATTR_RO(cap); 4281 4282 static ssize_t ecap_show(struct device *dev, 4283 struct device_attribute *attr, char *buf) 4284 { 4285 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4286 return sprintf(buf, "%llx\n", iommu->ecap); 4287 } 4288 static DEVICE_ATTR_RO(ecap); 4289 4290 static ssize_t domains_supported_show(struct device *dev, 4291 struct device_attribute *attr, char *buf) 4292 { 4293 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4294 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4295 } 4296 static DEVICE_ATTR_RO(domains_supported); 4297 4298 static ssize_t domains_used_show(struct device *dev, 4299 struct device_attribute *attr, char *buf) 4300 { 4301 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4302 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4303 cap_ndoms(iommu->cap))); 4304 } 4305 static DEVICE_ATTR_RO(domains_used); 4306 4307 static struct attribute *intel_iommu_attrs[] = { 4308 &dev_attr_version.attr, 4309 &dev_attr_address.attr, 4310 &dev_attr_cap.attr, 4311 &dev_attr_ecap.attr, 4312 &dev_attr_domains_supported.attr, 4313 &dev_attr_domains_used.attr, 4314 NULL, 4315 }; 4316 4317 static struct attribute_group intel_iommu_group = { 4318 .name = "intel-iommu", 4319 .attrs = intel_iommu_attrs, 4320 }; 4321 4322 const struct attribute_group *intel_iommu_groups[] = { 4323 &intel_iommu_group, 4324 NULL, 4325 }; 4326 4327 static inline bool has_external_pci(void) 4328 { 4329 struct pci_dev *pdev = NULL; 4330 4331 for_each_pci_dev(pdev) 4332 if 
(pdev->external_facing) 4333 return true; 4334 4335 return false; 4336 } 4337 4338 static int __init platform_optin_force_iommu(void) 4339 { 4340 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4341 return 0; 4342 4343 if (no_iommu || dmar_disabled) 4344 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4345 4346 /* 4347 * If Intel-IOMMU is disabled by default, we will apply identity 4348 * map for all devices except those marked as being untrusted. 4349 */ 4350 if (dmar_disabled) 4351 iommu_set_default_passthrough(false); 4352 4353 dmar_disabled = 0; 4354 no_iommu = 0; 4355 4356 return 1; 4357 } 4358 4359 static int __init probe_acpi_namespace_devices(void) 4360 { 4361 struct dmar_drhd_unit *drhd; 4362 /* To avoid a -Wunused-but-set-variable warning. */ 4363 struct intel_iommu *iommu __maybe_unused; 4364 struct device *dev; 4365 int i, ret = 0; 4366 4367 for_each_active_iommu(iommu, drhd) { 4368 for_each_active_dev_scope(drhd->devices, 4369 drhd->devices_cnt, i, dev) { 4370 struct acpi_device_physical_node *pn; 4371 struct iommu_group *group; 4372 struct acpi_device *adev; 4373 4374 if (dev->bus != &acpi_bus_type) 4375 continue; 4376 4377 adev = to_acpi_device(dev); 4378 mutex_lock(&adev->physical_node_lock); 4379 list_for_each_entry(pn, 4380 &adev->physical_node_list, node) { 4381 group = iommu_group_get(pn->dev); 4382 if (group) { 4383 iommu_group_put(group); 4384 continue; 4385 } 4386 4387 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4388 ret = iommu_probe_device(pn->dev); 4389 if (ret) 4390 break; 4391 } 4392 mutex_unlock(&adev->physical_node_lock); 4393 4394 if (ret) 4395 return ret; 4396 } 4397 } 4398 4399 return 0; 4400 } 4401 4402 int __init intel_iommu_init(void) 4403 { 4404 int ret = -ENODEV; 4405 struct dmar_drhd_unit *drhd; 4406 struct intel_iommu *iommu; 4407 4408 /* 4409 * Intel IOMMU is required for a TXT/tboot launch or platform 4410 * opt in, so enforce that. 4411 */ 4412 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4413 platform_optin_force_iommu(); 4414 4415 if (iommu_init_mempool()) { 4416 if (force_on) 4417 panic("tboot: Failed to initialize iommu memory\n"); 4418 return -ENOMEM; 4419 } 4420 4421 down_write(&dmar_global_lock); 4422 if (dmar_table_init()) { 4423 if (force_on) 4424 panic("tboot: Failed to initialize DMAR table\n"); 4425 goto out_free_dmar; 4426 } 4427 4428 if (dmar_dev_scope_init() < 0) { 4429 if (force_on) 4430 panic("tboot: Failed to initialize DMAR device scope\n"); 4431 goto out_free_dmar; 4432 } 4433 4434 up_write(&dmar_global_lock); 4435 4436 /* 4437 * The bus notifier takes the dmar_global_lock, so lockdep will 4438 * complain later when we register it under the lock. 4439 */ 4440 dmar_register_bus_notifier(); 4441 4442 down_write(&dmar_global_lock); 4443 4444 if (!no_iommu) 4445 intel_iommu_debugfs_init(); 4446 4447 if (no_iommu || dmar_disabled) { 4448 /* 4449 * We exit the function here to ensure IOMMU's remapping and 4450 * mempool aren't setup, which means that the IOMMU's PMRs 4451 * won't be disabled via the call to init_dmars(). So disable 4452 * it explicitly here. The PMRs were setup by tboot prior to 4453 * calling SENTER, but the kernel is expected to reset/tear 4454 * down the PMRs. 
4455 */ 4456 if (intel_iommu_tboot_noforce) { 4457 for_each_iommu(iommu, drhd) 4458 iommu_disable_protect_mem_regions(iommu); 4459 } 4460 4461 /* 4462 * Make sure the IOMMUs are switched off, even when we 4463 * boot into a kexec kernel and the previous kernel left 4464 * them enabled 4465 */ 4466 intel_disable_iommus(); 4467 goto out_free_dmar; 4468 } 4469 4470 if (list_empty(&dmar_rmrr_units)) 4471 pr_info("No RMRR found\n"); 4472 4473 if (list_empty(&dmar_atsr_units)) 4474 pr_info("No ATSR found\n"); 4475 4476 if (list_empty(&dmar_satc_units)) 4477 pr_info("No SATC found\n"); 4478 4479 if (dmar_map_gfx) 4480 intel_iommu_gfx_mapped = 1; 4481 4482 init_no_remapping_devices(); 4483 4484 ret = init_dmars(); 4485 if (ret) { 4486 if (force_on) 4487 panic("tboot: Failed to initialize DMARs\n"); 4488 pr_err("Initialization failed\n"); 4489 goto out_free_dmar; 4490 } 4491 up_write(&dmar_global_lock); 4492 4493 init_iommu_pm_ops(); 4494 4495 down_read(&dmar_global_lock); 4496 for_each_active_iommu(iommu, drhd) { 4497 /* 4498 * The flush queue implementation does not perform 4499 * page-selective invalidations that are required for efficient 4500 * TLB flushes in virtual environments. The benefit of batching 4501 * is likely to be much lower than the overhead of synchronizing 4502 * the virtual and physical IOMMU page-tables. 4503 */ 4504 if (cap_caching_mode(iommu->cap)) { 4505 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4506 iommu_set_dma_strict(); 4507 } 4508 iommu_device_sysfs_add(&iommu->iommu, NULL, 4509 intel_iommu_groups, 4510 "%s", iommu->name); 4511 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4512 } 4513 up_read(&dmar_global_lock); 4514 4515 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4516 if (si_domain && !hw_pass_through) 4517 register_memory_notifier(&intel_iommu_memory_nb); 4518 4519 down_read(&dmar_global_lock); 4520 if (probe_acpi_namespace_devices()) 4521 pr_warn("ACPI name space devices didn't probe correctly\n"); 4522 4523 /* Finally, we enable the DMA remapping hardware. */ 4524 for_each_iommu(iommu, drhd) { 4525 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4526 iommu_enable_translation(iommu); 4527 4528 iommu_disable_protect_mem_regions(iommu); 4529 } 4530 up_read(&dmar_global_lock); 4531 4532 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4533 4534 intel_iommu_enabled = 1; 4535 4536 return 0; 4537 4538 out_free_dmar: 4539 intel_iommu_free_dmars(); 4540 up_write(&dmar_global_lock); 4541 iommu_exit_mempool(); 4542 return ret; 4543 } 4544 4545 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4546 { 4547 struct device_domain_info *info = opaque; 4548 4549 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4550 return 0; 4551 } 4552 4553 /* 4554 * NB - intel-iommu lacks any sort of reference counting for the users of 4555 * dependent devices. If multiple endpoints have intersecting dependent 4556 * devices, unbinding the driver from any one of them will possibly leave 4557 * the others unable to operate. 
4558 */ 4559 static void domain_context_clear(struct device_domain_info *info) 4560 { 4561 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4562 return; 4563 4564 pci_for_each_dma_alias(to_pci_dev(info->dev), 4565 &domain_context_clear_one_cb, info); 4566 } 4567 4568 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4569 { 4570 struct dmar_domain *domain; 4571 struct intel_iommu *iommu; 4572 unsigned long flags; 4573 4574 assert_spin_locked(&device_domain_lock); 4575 4576 if (WARN_ON(!info)) 4577 return; 4578 4579 iommu = info->iommu; 4580 domain = info->domain; 4581 4582 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4583 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4584 intel_pasid_tear_down_entry(iommu, info->dev, 4585 PASID_RID2PASID, false); 4586 4587 iommu_disable_dev_iotlb(info); 4588 domain_context_clear(info); 4589 intel_pasid_free_table(info->dev); 4590 } 4591 4592 unlink_domain_info(info); 4593 4594 spin_lock_irqsave(&iommu->lock, flags); 4595 domain_detach_iommu(domain, iommu); 4596 spin_unlock_irqrestore(&iommu->lock, flags); 4597 4598 free_devinfo_mem(info); 4599 } 4600 4601 static void dmar_remove_one_dev_info(struct device *dev) 4602 { 4603 struct device_domain_info *info; 4604 unsigned long flags; 4605 4606 spin_lock_irqsave(&device_domain_lock, flags); 4607 info = get_domain_info(dev); 4608 if (info) 4609 __dmar_remove_one_dev_info(info); 4610 spin_unlock_irqrestore(&device_domain_lock, flags); 4611 } 4612 4613 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4614 { 4615 int adjust_width; 4616 4617 /* calculate AGAW */ 4618 domain->gaw = guest_width; 4619 adjust_width = guestwidth_to_adjustwidth(guest_width); 4620 domain->agaw = width_to_agaw(adjust_width); 4621 4622 domain->iommu_coherency = false; 4623 domain->iommu_snooping = false; 4624 domain->iommu_superpage = 0; 4625 domain->max_addr = 0; 4626 4627 /* always allocate the top pgd */ 4628 domain->pgd = alloc_pgtable_page(domain->nid); 4629 if (!domain->pgd) 4630 return -ENOMEM; 4631 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4632 return 0; 4633 } 4634 4635 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4636 { 4637 struct dmar_domain *dmar_domain; 4638 struct iommu_domain *domain; 4639 4640 switch (type) { 4641 case IOMMU_DOMAIN_DMA: 4642 case IOMMU_DOMAIN_DMA_FQ: 4643 case IOMMU_DOMAIN_UNMANAGED: 4644 dmar_domain = alloc_domain(type); 4645 if (!dmar_domain) { 4646 pr_err("Can't allocate dmar_domain\n"); 4647 return NULL; 4648 } 4649 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4650 pr_err("Domain initialization failed\n"); 4651 domain_exit(dmar_domain); 4652 return NULL; 4653 } 4654 4655 domain = &dmar_domain->domain; 4656 domain->geometry.aperture_start = 0; 4657 domain->geometry.aperture_end = 4658 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4659 domain->geometry.force_aperture = true; 4660 4661 return domain; 4662 case IOMMU_DOMAIN_IDENTITY: 4663 return &si_domain->domain; 4664 default: 4665 return NULL; 4666 } 4667 4668 return NULL; 4669 } 4670 4671 static void intel_iommu_domain_free(struct iommu_domain *domain) 4672 { 4673 if (domain != &si_domain->domain) 4674 domain_exit(to_dmar_domain(domain)); 4675 } 4676 4677 /* 4678 * Check whether a @domain could be attached to the @dev through the 4679 * aux-domain attach/detach APIs. 
4680 */ 4681 static inline bool 4682 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4683 { 4684 struct device_domain_info *info = get_domain_info(dev); 4685 4686 return info && info->auxd_enabled && 4687 domain->type == IOMMU_DOMAIN_UNMANAGED; 4688 } 4689 4690 static inline struct subdev_domain_info * 4691 lookup_subdev_info(struct dmar_domain *domain, struct device *dev) 4692 { 4693 struct subdev_domain_info *sinfo; 4694 4695 if (!list_empty(&domain->subdevices)) { 4696 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 4697 if (sinfo->pdev == dev) 4698 return sinfo; 4699 } 4700 } 4701 4702 return NULL; 4703 } 4704 4705 static int auxiliary_link_device(struct dmar_domain *domain, 4706 struct device *dev) 4707 { 4708 struct device_domain_info *info = get_domain_info(dev); 4709 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4710 4711 assert_spin_locked(&device_domain_lock); 4712 if (WARN_ON(!info)) 4713 return -EINVAL; 4714 4715 if (!sinfo) { 4716 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); 4717 if (!sinfo) 4718 return -ENOMEM; 4719 sinfo->domain = domain; 4720 sinfo->pdev = dev; 4721 list_add(&sinfo->link_phys, &info->subdevices); 4722 list_add(&sinfo->link_domain, &domain->subdevices); 4723 } 4724 4725 return ++sinfo->users; 4726 } 4727 4728 static int auxiliary_unlink_device(struct dmar_domain *domain, 4729 struct device *dev) 4730 { 4731 struct device_domain_info *info = get_domain_info(dev); 4732 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4733 int ret; 4734 4735 assert_spin_locked(&device_domain_lock); 4736 if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) 4737 return -EINVAL; 4738 4739 ret = --sinfo->users; 4740 if (!ret) { 4741 list_del(&sinfo->link_phys); 4742 list_del(&sinfo->link_domain); 4743 kfree(sinfo); 4744 } 4745 4746 return ret; 4747 } 4748 4749 static int aux_domain_add_dev(struct dmar_domain *domain, 4750 struct device *dev) 4751 { 4752 int ret; 4753 unsigned long flags; 4754 struct intel_iommu *iommu; 4755 4756 iommu = device_to_iommu(dev, NULL, NULL); 4757 if (!iommu) 4758 return -ENODEV; 4759 4760 if (domain->default_pasid <= 0) { 4761 u32 pasid; 4762 4763 /* No private data needed for the default pasid */ 4764 pasid = ioasid_alloc(NULL, PASID_MIN, 4765 pci_max_pasids(to_pci_dev(dev)) - 1, 4766 NULL); 4767 if (pasid == INVALID_IOASID) { 4768 pr_err("Can't allocate default pasid\n"); 4769 return -ENODEV; 4770 } 4771 domain->default_pasid = pasid; 4772 } 4773 4774 spin_lock_irqsave(&device_domain_lock, flags); 4775 ret = auxiliary_link_device(domain, dev); 4776 if (ret <= 0) 4777 goto link_failed; 4778 4779 /* 4780 * Subdevices from the same physical device can be attached to the 4781 * same domain. For such cases, only the first subdevice attachment 4782 * needs to go through the full steps in this function. So if ret > 4783 * 1, just goto out. 4784 */ 4785 if (ret > 1) 4786 goto out; 4787 4788 /* 4789 * iommu->lock must be held to attach domain to iommu and setup the 4790 * pasid entry for second level translation. 
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	if (domain_use_first_level(domain))
		ret = domain_setup_first_level(iommu, domain, dev,
					       domain->default_pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, domain, dev,
						     domain->default_pasid);
	if (ret)
		goto table_failed;

	spin_unlock(&iommu->lock);
out:
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	auxiliary_unlink_device(domain, dev);
link_failed:
	spin_unlock_irqrestore(&device_domain_lock, flags);
	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
		ioasid_put(domain->default_pasid);

	return ret;
}

static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	iommu = info->iommu;

	if (!auxiliary_unlink_device(domain, dev)) {
		spin_lock(&iommu->lock);
		intel_pasid_tear_down_entry(iommu, dev,
					    domain->default_pasid, false);
		domain_detach_iommu(domain, iommu);
		spin_unlock(&iommu->lock);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
		ioasid_put(domain->default_pasid);
}

static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
	    !ecap_nest(iommu->ecap)) {
		dev_err(dev, "%s: iommu does not support nested translation\n",
			iommu->name);
		return -EINVAL;
	}

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}

static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	if (is_aux_domain(dev, domain))
		return -EPERM;

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}

static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
					 struct device *dev)
{
	int ret;

	if (!is_aux_domain(dev, domain))
		return -EPERM;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return aux_domain_add_dev(to_dmar_domain(domain), dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}

#ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
 * VT-d granularity. Invalidation is typically included in the unmap operation
 * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
 * owns the first level page tables. Invalidations of translation caches in the
 * guest are trapped and passed down to the host.
 *
 * vIOMMU in the guest will only expose first level page tables, therefore
 * we do not support IOTLB granularity for requests without PASID (second level).
 *
 * For example, to find the VT-d granularity encoding for IOTLB
 * type and page selective granularity within PASID:
 * X: indexed by iommu cache type
 * Y: indexed by enum iommu_inv_granularity
 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
 */

static const int
inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/*
	 * PASID based IOTLB invalidation: PASID selective (per PASID),
	 * page selective (address granularity)
	 */
	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
	/* PASID based dev TLBs */
	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
	/* PASID cache */
	{-EINVAL, -EINVAL, -EINVAL}
};

static inline int to_vtd_granularity(int type, int granu)
{
	return inv_type_granu_table[type][granu];
}

static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
{
	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;

	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
	 * The IOMMU cache invalidate API passes granu_size in bytes, and the
	 * number of granules of that size that lie in contiguous memory.
5002 */ 5003 return order_base_2(nr_pages); 5004 } 5005 5006 static int 5007 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5008 struct iommu_cache_invalidate_info *inv_info) 5009 { 5010 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5011 struct device_domain_info *info; 5012 struct intel_iommu *iommu; 5013 unsigned long flags; 5014 int cache_type; 5015 u8 bus, devfn; 5016 u16 did, sid; 5017 int ret = 0; 5018 u64 size = 0; 5019 5020 if (!inv_info || !dmar_domain) 5021 return -EINVAL; 5022 5023 if (!dev || !dev_is_pci(dev)) 5024 return -ENODEV; 5025 5026 iommu = device_to_iommu(dev, &bus, &devfn); 5027 if (!iommu) 5028 return -ENODEV; 5029 5030 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5031 return -EINVAL; 5032 5033 spin_lock_irqsave(&device_domain_lock, flags); 5034 spin_lock(&iommu->lock); 5035 info = get_domain_info(dev); 5036 if (!info) { 5037 ret = -EINVAL; 5038 goto out_unlock; 5039 } 5040 did = dmar_domain->iommu_did[iommu->seq_id]; 5041 sid = PCI_DEVID(bus, devfn); 5042 5043 /* Size is only valid in address selective invalidation */ 5044 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5045 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 5046 inv_info->granu.addr_info.nb_granules); 5047 5048 for_each_set_bit(cache_type, 5049 (unsigned long *)&inv_info->cache, 5050 IOMMU_CACHE_INV_TYPE_NR) { 5051 int granu = 0; 5052 u64 pasid = 0; 5053 u64 addr = 0; 5054 5055 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5056 if (granu == -EINVAL) { 5057 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5058 cache_type, inv_info->granularity); 5059 break; 5060 } 5061 5062 /* 5063 * PASID is stored in different locations based on the 5064 * granularity. 5065 */ 5066 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5067 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5068 pasid = inv_info->granu.pasid_info.pasid; 5069 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5070 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5071 pasid = inv_info->granu.addr_info.pasid; 5072 5073 switch (BIT(cache_type)) { 5074 case IOMMU_CACHE_INV_TYPE_IOTLB: 5075 /* HW will ignore LSB bits based on address mask */ 5076 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5077 size && 5078 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5079 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5080 inv_info->granu.addr_info.addr, size); 5081 } 5082 5083 /* 5084 * If granu is PASID-selective, address is ignored. 5085 * We use npages = -1 to indicate that. 5086 */ 5087 qi_flush_piotlb(iommu, did, pasid, 5088 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 5089 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5090 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5091 5092 if (!info->ats_enabled) 5093 break; 5094 /* 5095 * Always flush device IOTLB if ATS is enabled. vIOMMU 5096 * in the guest may assume IOTLB flush is inclusive, 5097 * which is more efficient. 5098 */ 5099 fallthrough; 5100 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5101 /* 5102 * PASID based device TLB invalidation does not support 5103 * IOMMU_INV_GRANU_PASID granularity but only supports 5104 * IOMMU_INV_GRANU_ADDR. 5105 * The equivalent of that is we set the size to be the 5106 * entire range of 64 bit. User only provides PASID info 5107 * without address info. So we set addr to 0. 
5108 */ 5109 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5110 size = 64 - VTD_PAGE_SHIFT; 5111 addr = 0; 5112 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5113 addr = inv_info->granu.addr_info.addr; 5114 } 5115 5116 if (info->ats_enabled) 5117 qi_flush_dev_iotlb_pasid(iommu, sid, 5118 info->pfsid, pasid, 5119 info->ats_qdep, addr, 5120 size); 5121 else 5122 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5123 break; 5124 default: 5125 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5126 cache_type); 5127 ret = -EINVAL; 5128 } 5129 } 5130 out_unlock: 5131 spin_unlock(&iommu->lock); 5132 spin_unlock_irqrestore(&device_domain_lock, flags); 5133 5134 return ret; 5135 } 5136 #endif 5137 5138 static int intel_iommu_map(struct iommu_domain *domain, 5139 unsigned long iova, phys_addr_t hpa, 5140 size_t size, int iommu_prot, gfp_t gfp) 5141 { 5142 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5143 u64 max_addr; 5144 int prot = 0; 5145 5146 if (iommu_prot & IOMMU_READ) 5147 prot |= DMA_PTE_READ; 5148 if (iommu_prot & IOMMU_WRITE) 5149 prot |= DMA_PTE_WRITE; 5150 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5151 prot |= DMA_PTE_SNP; 5152 5153 max_addr = iova + size; 5154 if (dmar_domain->max_addr < max_addr) { 5155 u64 end; 5156 5157 /* check if minimum agaw is sufficient for mapped address */ 5158 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5159 if (end < max_addr) { 5160 pr_err("%s: iommu width (%d) is not " 5161 "sufficient for the mapped address (%llx)\n", 5162 __func__, dmar_domain->gaw, max_addr); 5163 return -EFAULT; 5164 } 5165 dmar_domain->max_addr = max_addr; 5166 } 5167 /* Round up size to next multiple of PAGE_SIZE, if it and 5168 the low bits of hpa would take us onto the next page */ 5169 size = aligned_nrpages(hpa, size); 5170 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5171 hpa >> VTD_PAGE_SHIFT, size, prot); 5172 } 5173 5174 static int intel_iommu_map_pages(struct iommu_domain *domain, 5175 unsigned long iova, phys_addr_t paddr, 5176 size_t pgsize, size_t pgcount, 5177 int prot, gfp_t gfp, size_t *mapped) 5178 { 5179 unsigned long pgshift = __ffs(pgsize); 5180 size_t size = pgcount << pgshift; 5181 int ret; 5182 5183 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 5184 return -EINVAL; 5185 5186 if (!IS_ALIGNED(iova | paddr, pgsize)) 5187 return -EINVAL; 5188 5189 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 5190 if (!ret && mapped) 5191 *mapped = size; 5192 5193 return ret; 5194 } 5195 5196 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5197 unsigned long iova, size_t size, 5198 struct iommu_iotlb_gather *gather) 5199 { 5200 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5201 unsigned long start_pfn, last_pfn; 5202 int level = 0; 5203 5204 /* Cope with horrid API which requires us to unmap more than the 5205 size argument if it happens to be a large-page mapping. 
*/ 5206 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5207 5208 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5209 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5210 5211 start_pfn = iova >> VTD_PAGE_SHIFT; 5212 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5213 5214 gather->freelist = domain_unmap(dmar_domain, start_pfn, 5215 last_pfn, gather->freelist); 5216 5217 if (dmar_domain->max_addr == iova + size) 5218 dmar_domain->max_addr = iova; 5219 5220 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5221 5222 return size; 5223 } 5224 5225 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 5226 unsigned long iova, 5227 size_t pgsize, size_t pgcount, 5228 struct iommu_iotlb_gather *gather) 5229 { 5230 unsigned long pgshift = __ffs(pgsize); 5231 size_t size = pgcount << pgshift; 5232 5233 return intel_iommu_unmap(domain, iova, size, gather); 5234 } 5235 5236 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5237 struct iommu_iotlb_gather *gather) 5238 { 5239 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5240 unsigned long iova_pfn = IOVA_PFN(gather->start); 5241 size_t size = gather->end - gather->start; 5242 unsigned long start_pfn; 5243 unsigned long nrpages; 5244 int iommu_id; 5245 5246 nrpages = aligned_nrpages(gather->start, size); 5247 start_pfn = mm_to_dma_pfn(iova_pfn); 5248 5249 for_each_domain_iommu(iommu_id, dmar_domain) 5250 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5251 start_pfn, nrpages, !gather->freelist, 0); 5252 5253 dma_free_pagelist(gather->freelist); 5254 } 5255 5256 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5257 dma_addr_t iova) 5258 { 5259 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5260 struct dma_pte *pte; 5261 int level = 0; 5262 u64 phys = 0; 5263 5264 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5265 if (pte && dma_pte_present(pte)) 5266 phys = dma_pte_addr(pte) + 5267 (iova & (BIT_MASK(level_to_offset_bits(level) + 5268 VTD_PAGE_SHIFT) - 1)); 5269 5270 return phys; 5271 } 5272 5273 static bool intel_iommu_capable(enum iommu_cap cap) 5274 { 5275 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5276 return domain_update_iommu_snooping(NULL); 5277 if (cap == IOMMU_CAP_INTR_REMAP) 5278 return irq_remapping_enabled == 1; 5279 5280 return false; 5281 } 5282 5283 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5284 { 5285 struct intel_iommu *iommu; 5286 5287 iommu = device_to_iommu(dev, NULL, NULL); 5288 if (!iommu) 5289 return ERR_PTR(-ENODEV); 5290 5291 if (translation_pre_enabled(iommu)) 5292 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5293 5294 return &iommu->iommu; 5295 } 5296 5297 static void intel_iommu_release_device(struct device *dev) 5298 { 5299 struct intel_iommu *iommu; 5300 5301 iommu = device_to_iommu(dev, NULL, NULL); 5302 if (!iommu) 5303 return; 5304 5305 dmar_remove_one_dev_info(dev); 5306 5307 set_dma_ops(dev, NULL); 5308 } 5309 5310 static void intel_iommu_probe_finalize(struct device *dev) 5311 { 5312 set_dma_ops(dev, NULL); 5313 iommu_setup_dma_ops(dev, 0, U64_MAX); 5314 } 5315 5316 static void intel_iommu_get_resv_regions(struct device *device, 5317 struct list_head *head) 5318 { 5319 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5320 struct iommu_resv_region *reg; 5321 struct dmar_rmrr_unit *rmrr; 5322 struct device *i_dev; 5323 int i; 5324 5325 down_read(&dmar_global_lock); 5326 for_each_rmrr_units(rmrr) { 5327 for_each_active_dev_scope(rmrr->devices, 
					   rmrr->devices_cnt,
					   i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int ret;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
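	/*
	 * Only the auxd flag is cleared here; any aux domains that are still
	 * attached are torn down separately via intel_iommu_aux_detach_device().
	 */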
if (!WARN_ON(!info)) 5465 info->auxd_enabled = 0; 5466 spin_unlock_irqrestore(&device_domain_lock, flags); 5467 5468 return 0; 5469 } 5470 5471 static int intel_iommu_enable_sva(struct device *dev) 5472 { 5473 struct device_domain_info *info = get_domain_info(dev); 5474 struct intel_iommu *iommu; 5475 int ret; 5476 5477 if (!info || dmar_disabled) 5478 return -EINVAL; 5479 5480 iommu = info->iommu; 5481 if (!iommu) 5482 return -EINVAL; 5483 5484 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 5485 return -ENODEV; 5486 5487 if (intel_iommu_enable_pasid(iommu, dev)) 5488 return -ENODEV; 5489 5490 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 5491 return -EINVAL; 5492 5493 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 5494 if (!ret) 5495 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 5496 5497 return ret; 5498 } 5499 5500 static int intel_iommu_disable_sva(struct device *dev) 5501 { 5502 struct device_domain_info *info = get_domain_info(dev); 5503 struct intel_iommu *iommu = info->iommu; 5504 int ret; 5505 5506 ret = iommu_unregister_device_fault_handler(dev); 5507 if (!ret) 5508 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 5509 5510 return ret; 5511 } 5512 5513 static int intel_iommu_enable_iopf(struct device *dev) 5514 { 5515 struct device_domain_info *info = get_domain_info(dev); 5516 5517 if (info && info->pri_supported) 5518 return 0; 5519 5520 return -ENODEV; 5521 } 5522 5523 static int 5524 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5525 { 5526 switch (feat) { 5527 case IOMMU_DEV_FEAT_AUX: 5528 return intel_iommu_enable_auxd(dev); 5529 5530 case IOMMU_DEV_FEAT_IOPF: 5531 return intel_iommu_enable_iopf(dev); 5532 5533 case IOMMU_DEV_FEAT_SVA: 5534 return intel_iommu_enable_sva(dev); 5535 5536 default: 5537 return -ENODEV; 5538 } 5539 } 5540 5541 static int 5542 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5543 { 5544 switch (feat) { 5545 case IOMMU_DEV_FEAT_AUX: 5546 return intel_iommu_disable_auxd(dev); 5547 5548 case IOMMU_DEV_FEAT_IOPF: 5549 return 0; 5550 5551 case IOMMU_DEV_FEAT_SVA: 5552 return intel_iommu_disable_sva(dev); 5553 5554 default: 5555 return -ENODEV; 5556 } 5557 } 5558 5559 static bool 5560 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5561 { 5562 struct device_domain_info *info = get_domain_info(dev); 5563 5564 if (feat == IOMMU_DEV_FEAT_AUX) 5565 return scalable_mode_support() && info && info->auxd_enabled; 5566 5567 return false; 5568 } 5569 5570 static int 5571 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5572 { 5573 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5574 5575 return dmar_domain->default_pasid > 0 ? 
5576 dmar_domain->default_pasid : -EINVAL; 5577 } 5578 5579 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5580 struct device *dev) 5581 { 5582 return attach_deferred(dev); 5583 } 5584 5585 static int 5586 intel_iommu_enable_nesting(struct iommu_domain *domain) 5587 { 5588 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5589 unsigned long flags; 5590 int ret = -ENODEV; 5591 5592 spin_lock_irqsave(&device_domain_lock, flags); 5593 if (list_empty(&dmar_domain->devices)) { 5594 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5595 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5596 ret = 0; 5597 } 5598 spin_unlock_irqrestore(&device_domain_lock, flags); 5599 5600 return ret; 5601 } 5602 5603 /* 5604 * Check that the device does not live on an external facing PCI port that is 5605 * marked as untrusted. Such devices should not be able to apply quirks and 5606 * thus not be able to bypass the IOMMU restrictions. 5607 */ 5608 static bool risky_device(struct pci_dev *pdev) 5609 { 5610 if (pdev->untrusted) { 5611 pci_info(pdev, 5612 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5613 pdev->vendor, pdev->device); 5614 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5615 return true; 5616 } 5617 return false; 5618 } 5619 5620 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 5621 unsigned long iova, size_t size) 5622 { 5623 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5624 unsigned long pages = aligned_nrpages(iova, size); 5625 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 5626 struct intel_iommu *iommu; 5627 int iommu_id; 5628 5629 for_each_domain_iommu(iommu_id, dmar_domain) { 5630 iommu = g_iommus[iommu_id]; 5631 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 5632 } 5633 } 5634 5635 const struct iommu_ops intel_iommu_ops = { 5636 .capable = intel_iommu_capable, 5637 .domain_alloc = intel_iommu_domain_alloc, 5638 .domain_free = intel_iommu_domain_free, 5639 .enable_nesting = intel_iommu_enable_nesting, 5640 .attach_dev = intel_iommu_attach_device, 5641 .detach_dev = intel_iommu_detach_device, 5642 .aux_attach_dev = intel_iommu_aux_attach_device, 5643 .aux_detach_dev = intel_iommu_aux_detach_device, 5644 .aux_get_pasid = intel_iommu_aux_get_pasid, 5645 .map_pages = intel_iommu_map_pages, 5646 .unmap_pages = intel_iommu_unmap_pages, 5647 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 5648 .flush_iotlb_all = intel_flush_iotlb_all, 5649 .iotlb_sync = intel_iommu_tlb_sync, 5650 .iova_to_phys = intel_iommu_iova_to_phys, 5651 .probe_device = intel_iommu_probe_device, 5652 .probe_finalize = intel_iommu_probe_finalize, 5653 .release_device = intel_iommu_release_device, 5654 .get_resv_regions = intel_iommu_get_resv_regions, 5655 .put_resv_regions = generic_iommu_put_resv_regions, 5656 .device_group = intel_iommu_device_group, 5657 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5658 .dev_enable_feat = intel_iommu_dev_enable_feat, 5659 .dev_disable_feat = intel_iommu_dev_disable_feat, 5660 .is_attach_deferred = intel_iommu_is_attach_deferred, 5661 .def_domain_type = device_def_domain_type, 5662 .pgsize_bitmap = SZ_4K, 5663 #ifdef CONFIG_INTEL_IOMMU_SVM 5664 .cache_invalidate = intel_iommu_sva_invalidate, 5665 .sva_bind_gpasid = intel_svm_bind_gpasid, 5666 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5667 .sva_bind = intel_svm_bind, 5668 .sva_unbind = intel_svm_unbind, 5669 .sva_get_pasid = intel_svm_get_pasid, 5670 .page_response = intel_svm_page_response, 5671 #endif 5672 }; 5673 
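/*
 * Illustrative only (not part of this driver): consumers never invoke the ops
 * above directly, they go through the generic IOMMU API, which dispatches to
 * the callbacks registered here. A minimal sketch, with pdev, iova and paddr
 * as placeholder variables:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (dom && !iommu_attach_device(dom, &pdev->dev)) {
 *		iommu_map(dom, iova, paddr, SZ_4K,
 *			  IOMMU_READ | IOMMU_WRITE);
 *		...
 *		iommu_unmap(dom, iova, SZ_4K);
 *		iommu_detach_device(dom, &pdev->dev);
 *	}
 *	if (dom)
 *		iommu_domain_free(dom);
 *
 * iommu_domain_alloc() ends up in intel_iommu_domain_alloc(),
 * iommu_attach_device() in intel_iommu_attach_device(), and iommu_map() /
 * iommu_unmap() in intel_iommu_map_pages() / intel_iommu_unmap_pages().
 */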
5674 static void quirk_iommu_igfx(struct pci_dev *dev) 5675 { 5676 if (risky_device(dev)) 5677 return; 5678 5679 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5680 dmar_map_gfx = 0; 5681 } 5682 5683 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 5684 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5685 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5691 5692 /* Broadwell igfx malfunctions with dmar */ 5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5717 5718 static void quirk_iommu_rwbf(struct pci_dev *dev) 5719 { 5720 if (risky_device(dev)) 5721 return; 5722 5723 /* 5724 * Mobile 4 Series Chipset neglects to set RWBF capability, 5725 * but needs it. Same seems to hold for the desktop versions. 
5726 */ 5727 pci_info(dev, "Forcing write-buffer flush capability\n"); 5728 rwbf_quirk = 1; 5729 } 5730 5731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5736 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5737 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5738 5739 #define GGC 0x52 5740 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5741 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5742 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5743 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5744 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5745 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5746 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5747 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5748 5749 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5750 { 5751 unsigned short ggc; 5752 5753 if (risky_device(dev)) 5754 return; 5755 5756 if (pci_read_config_word(dev, GGC, &ggc)) 5757 return; 5758 5759 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5760 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5761 dmar_map_gfx = 0; 5762 } else if (dmar_map_gfx) { 5763 /* we have to ensure the gfx device is idle before we flush */ 5764 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5765 iommu_set_dma_strict(); 5766 } 5767 } 5768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5772 5773 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5774 { 5775 unsigned short ver; 5776 5777 if (!IS_GFX_DEVICE(dev)) 5778 return; 5779 5780 ver = (dev->device >> 8) & 0xff; 5781 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5782 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5783 ver != 0x9a) 5784 return; 5785 5786 if (risky_device(dev)) 5787 return; 5788 5789 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5790 iommu_skip_te_disable = 1; 5791 } 5792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5793 5794 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5795 ISOCH DMAR unit for the Azalia sound device, but not give it any 5796 TLB entries, which causes it to deadlock. Check for that. We do 5797 this in a function called from init_dmars(), instead of in a PCI 5798 quirk, because we don't want to print the obnoxious "BIOS broken" 5799 message if VT-d is actually disabled. 5800 */ 5801 static void __init check_tylersburg_isoch(void) 5802 { 5803 struct pci_dev *pdev; 5804 uint32_t vtisochctrl; 5805 5806 /* If there's no Azalia in the system anyway, forget it. */ 5807 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5808 if (!pdev) 5809 return; 5810 5811 if (risky_device(pdev)) { 5812 pci_dev_put(pdev); 5813 return; 5814 } 5815 5816 pci_dev_put(pdev); 5817 5818 /* System Management Registers. Might be hidden, in which case 5819 we can't do the sanity check. But that's OK, because the 5820 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5821 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5822 if (!pdev) 5823 return; 5824 5825 if (risky_device(pdev)) { 5826 pci_dev_put(pdev); 5827 return; 5828 } 5829 5830 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5831 pci_dev_put(pdev); 5832 return; 5833 } 5834 5835 pci_dev_put(pdev); 5836 5837 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5838 if (vtisochctrl & 1) 5839 return; 5840 5841 /* Drop all bits other than the number of TLB entries */ 5842 vtisochctrl &= 0x1c; 5843 5844 /* If we have the recommended number of TLB entries (16), fine. */ 5845 if (vtisochctrl == 0x10) 5846 return; 5847 5848 /* Zero TLB entries? You get to ride the short bus to school. */ 5849 if (!vtisochctrl) { 5850 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5851 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5852 dmi_get_system_info(DMI_BIOS_VENDOR), 5853 dmi_get_system_info(DMI_BIOS_VERSION), 5854 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5855 iommu_identity_mapping |= IDENTMAP_AZALIA; 5856 return; 5857 } 5858 5859 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5860 vtisochctrl); 5861 } 5862