1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/init.h> 17 #include <linux/bitmap.h> 18 #include <linux/debugfs.h> 19 #include <linux/export.h> 20 #include <linux/slab.h> 21 #include <linux/irq.h> 22 #include <linux/interrupt.h> 23 #include <linux/spinlock.h> 24 #include <linux/pci.h> 25 #include <linux/dmar.h> 26 #include <linux/dma-map-ops.h> 27 #include <linux/mempool.h> 28 #include <linux/memory.h> 29 #include <linux/cpu.h> 30 #include <linux/timer.h> 31 #include <linux/io.h> 32 #include <linux/iova.h> 33 #include <linux/iommu.h> 34 #include <linux/dma-iommu.h> 35 #include <linux/intel-iommu.h> 36 #include <linux/intel-svm.h> 37 #include <linux/syscore_ops.h> 38 #include <linux/tboot.h> 39 #include <linux/dmi.h> 40 #include <linux/pci-ats.h> 41 #include <linux/memblock.h> 42 #include <linux/dma-direct.h> 43 #include <linux/crash_dump.h> 44 #include <linux/numa.h> 45 #include <asm/irq_remapping.h> 46 #include <asm/cacheflush.h> 47 #include <asm/iommu.h> 48 49 #include "../irq_remapping.h" 50 #include "../iommu-sva-lib.h" 51 #include "pasid.h" 52 #include "cap_audit.h" 53 54 #define ROOT_SIZE VTD_PAGE_SIZE 55 #define CONTEXT_SIZE VTD_PAGE_SIZE 56 57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 61 62 #define IOAPIC_RANGE_START (0xfee00000) 63 #define IOAPIC_RANGE_END (0xfeefffff) 64 #define IOVA_START_ADDR (0x1000) 65 66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 67 68 #define MAX_AGAW_WIDTH 64 69 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 70 71 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 73 74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 75 to match. That way, we can use 'unsigned long' for PFNs with impunity. 
*/ 76 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 77 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 78 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 79 80 /* IO virtual address start page frame number */ 81 #define IOVA_START_PFN (1) 82 83 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 84 85 /* page table handling */ 86 #define LEVEL_STRIDE (9) 87 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 88 89 static inline int agaw_to_level(int agaw) 90 { 91 return agaw + 2; 92 } 93 94 static inline int agaw_to_width(int agaw) 95 { 96 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 97 } 98 99 static inline int width_to_agaw(int width) 100 { 101 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 102 } 103 104 static inline unsigned int level_to_offset_bits(int level) 105 { 106 return (level - 1) * LEVEL_STRIDE; 107 } 108 109 static inline int pfn_level_offset(u64 pfn, int level) 110 { 111 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 112 } 113 114 static inline u64 level_mask(int level) 115 { 116 return -1ULL << level_to_offset_bits(level); 117 } 118 119 static inline u64 level_size(int level) 120 { 121 return 1ULL << level_to_offset_bits(level); 122 } 123 124 static inline u64 align_to_level(u64 pfn, int level) 125 { 126 return (pfn + level_size(level) - 1) & level_mask(level); 127 } 128 129 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 130 { 131 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 132 } 133 134 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 135 are never going to work. */ 136 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) 137 { 138 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); 139 } 140 141 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 142 { 143 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 144 } 145 static inline unsigned long page_to_dma_pfn(struct page *pg) 146 { 147 return mm_to_dma_pfn(page_to_pfn(pg)); 148 } 149 static inline unsigned long virt_to_dma_pfn(void *p) 150 { 151 return page_to_dma_pfn(virt_to_page(p)); 152 } 153 154 /* global iommu list, set NULL for ignored DMAR units */ 155 static struct intel_iommu **g_iommus; 156 157 static void __init check_tylersburg_isoch(void); 158 static int rwbf_quirk; 159 static inline struct device_domain_info * 160 dmar_search_domain_by_dev_info(int segment, int bus, int devfn); 161 162 /* 163 * set to 1 to panic kernel if can't successfully enable VT-d 164 * (used when kernel is launched w/ TXT) 165 */ 166 static int force_on = 0; 167 static int intel_iommu_tboot_noforce; 168 static int no_platform_optin; 169 170 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 171 172 /* 173 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 174 * if marked present. 175 */ 176 static phys_addr_t root_entry_lctp(struct root_entry *re) 177 { 178 if (!(re->lo & 1)) 179 return 0; 180 181 return re->lo & VTD_PAGE_MASK; 182 } 183 184 /* 185 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 186 * if marked present. 
187 */ 188 static phys_addr_t root_entry_uctp(struct root_entry *re) 189 { 190 if (!(re->hi & 1)) 191 return 0; 192 193 return re->hi & VTD_PAGE_MASK; 194 } 195 196 static inline void context_clear_pasid_enable(struct context_entry *context) 197 { 198 context->lo &= ~(1ULL << 11); 199 } 200 201 static inline bool context_pasid_enabled(struct context_entry *context) 202 { 203 return !!(context->lo & (1ULL << 11)); 204 } 205 206 static inline void context_set_copied(struct context_entry *context) 207 { 208 context->hi |= (1ull << 3); 209 } 210 211 static inline bool context_copied(struct context_entry *context) 212 { 213 return !!(context->hi & (1ULL << 3)); 214 } 215 216 static inline bool __context_present(struct context_entry *context) 217 { 218 return (context->lo & 1); 219 } 220 221 bool context_present(struct context_entry *context) 222 { 223 return context_pasid_enabled(context) ? 224 __context_present(context) : 225 __context_present(context) && !context_copied(context); 226 } 227 228 static inline void context_set_present(struct context_entry *context) 229 { 230 context->lo |= 1; 231 } 232 233 static inline void context_set_fault_enable(struct context_entry *context) 234 { 235 context->lo &= (((u64)-1) << 2) | 1; 236 } 237 238 static inline void context_set_translation_type(struct context_entry *context, 239 unsigned long value) 240 { 241 context->lo &= (((u64)-1) << 4) | 3; 242 context->lo |= (value & 3) << 2; 243 } 244 245 static inline void context_set_address_root(struct context_entry *context, 246 unsigned long value) 247 { 248 context->lo &= ~VTD_PAGE_MASK; 249 context->lo |= value & VTD_PAGE_MASK; 250 } 251 252 static inline void context_set_address_width(struct context_entry *context, 253 unsigned long value) 254 { 255 context->hi |= value & 7; 256 } 257 258 static inline void context_set_domain_id(struct context_entry *context, 259 unsigned long value) 260 { 261 context->hi |= (value & ((1 << 16) - 1)) << 8; 262 } 263 264 static inline int context_domain_id(struct context_entry *c) 265 { 266 return((c->hi >> 8) & 0xffff); 267 } 268 269 static inline void context_clear_entry(struct context_entry *context) 270 { 271 context->lo = 0; 272 context->hi = 0; 273 } 274 275 /* 276 * This domain is a statically identity mapping domain. 277 * 1. This domain creats a static 1:1 mapping to all usable memory. 278 * 2. It maps to each iommu if successful. 279 * 3. Each iommu mapps to this domain if successful. 
280 */ 281 static struct dmar_domain *si_domain; 282 static int hw_pass_through = 1; 283 284 #define for_each_domain_iommu(idx, domain) \ 285 for (idx = 0; idx < g_num_of_iommus; idx++) \ 286 if (domain->iommu_refcnt[idx]) 287 288 struct dmar_rmrr_unit { 289 struct list_head list; /* list of rmrr units */ 290 struct acpi_dmar_header *hdr; /* ACPI header */ 291 u64 base_address; /* reserved base address*/ 292 u64 end_address; /* reserved end address */ 293 struct dmar_dev_scope *devices; /* target devices */ 294 int devices_cnt; /* target device count */ 295 }; 296 297 struct dmar_atsr_unit { 298 struct list_head list; /* list of ATSR units */ 299 struct acpi_dmar_header *hdr; /* ACPI header */ 300 struct dmar_dev_scope *devices; /* target devices */ 301 int devices_cnt; /* target device count */ 302 u8 include_all:1; /* include all ports */ 303 }; 304 305 struct dmar_satc_unit { 306 struct list_head list; /* list of SATC units */ 307 struct acpi_dmar_header *hdr; /* ACPI header */ 308 struct dmar_dev_scope *devices; /* target devices */ 309 struct intel_iommu *iommu; /* the corresponding iommu */ 310 int devices_cnt; /* target device count */ 311 u8 atc_required:1; /* ATS is required */ 312 }; 313 314 static LIST_HEAD(dmar_atsr_units); 315 static LIST_HEAD(dmar_rmrr_units); 316 static LIST_HEAD(dmar_satc_units); 317 318 #define for_each_rmrr_units(rmrr) \ 319 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 320 321 /* bitmap for indexing intel_iommus */ 322 static int g_num_of_iommus; 323 324 static void domain_exit(struct dmar_domain *domain); 325 static void domain_remove_dev_info(struct dmar_domain *domain); 326 static void dmar_remove_one_dev_info(struct device *dev); 327 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 328 static int intel_iommu_attach_device(struct iommu_domain *domain, 329 struct device *dev); 330 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 331 dma_addr_t iova); 332 333 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); 334 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); 335 336 int intel_iommu_enabled = 0; 337 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 338 339 static int dmar_map_gfx = 1; 340 static int intel_iommu_superpage = 1; 341 static int iommu_identity_mapping; 342 static int iommu_skip_te_disable; 343 344 #define IDENTMAP_GFX 2 345 #define IDENTMAP_AZALIA 4 346 347 int intel_iommu_gfx_mapped; 348 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 349 350 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) 351 struct device_domain_info *get_domain_info(struct device *dev) 352 { 353 struct device_domain_info *info; 354 355 if (!dev) 356 return NULL; 357 358 info = dev_iommu_priv_get(dev); 359 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) 360 return NULL; 361 362 return info; 363 } 364 365 DEFINE_SPINLOCK(device_domain_lock); 366 static LIST_HEAD(device_domain_list); 367 368 /* 369 * Iterate over elements in device_domain_list and call the specified 370 * callback @fn against each element. 
371 */ 372 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 373 void *data), void *data) 374 { 375 int ret = 0; 376 unsigned long flags; 377 struct device_domain_info *info; 378 379 spin_lock_irqsave(&device_domain_lock, flags); 380 list_for_each_entry(info, &device_domain_list, global) { 381 ret = fn(info, data); 382 if (ret) { 383 spin_unlock_irqrestore(&device_domain_lock, flags); 384 return ret; 385 } 386 } 387 spin_unlock_irqrestore(&device_domain_lock, flags); 388 389 return 0; 390 } 391 392 const struct iommu_ops intel_iommu_ops; 393 394 static bool translation_pre_enabled(struct intel_iommu *iommu) 395 { 396 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 397 } 398 399 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 400 { 401 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 402 } 403 404 static void init_translation_status(struct intel_iommu *iommu) 405 { 406 u32 gsts; 407 408 gsts = readl(iommu->reg + DMAR_GSTS_REG); 409 if (gsts & DMA_GSTS_TES) 410 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 411 } 412 413 static int __init intel_iommu_setup(char *str) 414 { 415 if (!str) 416 return -EINVAL; 417 418 while (*str) { 419 if (!strncmp(str, "on", 2)) { 420 dmar_disabled = 0; 421 pr_info("IOMMU enabled\n"); 422 } else if (!strncmp(str, "off", 3)) { 423 dmar_disabled = 1; 424 no_platform_optin = 1; 425 pr_info("IOMMU disabled\n"); 426 } else if (!strncmp(str, "igfx_off", 8)) { 427 dmar_map_gfx = 0; 428 pr_info("Disable GFX device mapping\n"); 429 } else if (!strncmp(str, "forcedac", 8)) { 430 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); 431 iommu_dma_forcedac = true; 432 } else if (!strncmp(str, "strict", 6)) { 433 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); 434 iommu_set_dma_strict(); 435 } else if (!strncmp(str, "sp_off", 6)) { 436 pr_info("Disable supported super page\n"); 437 intel_iommu_superpage = 0; 438 } else if (!strncmp(str, "sm_on", 5)) { 439 pr_info("Enable scalable mode if hardware supports\n"); 440 intel_iommu_sm = 1; 441 } else if (!strncmp(str, "sm_off", 6)) { 442 pr_info("Scalable mode is disallowed\n"); 443 intel_iommu_sm = 0; 444 } else if (!strncmp(str, "tboot_noforce", 13)) { 445 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); 446 intel_iommu_tboot_noforce = 1; 447 } else { 448 pr_notice("Unknown option - '%s'\n", str); 449 } 450 451 str += strcspn(str, ","); 452 while (*str == ',') 453 str++; 454 } 455 456 return 1; 457 } 458 __setup("intel_iommu=", intel_iommu_setup); 459 460 static struct kmem_cache *iommu_domain_cache; 461 static struct kmem_cache *iommu_devinfo_cache; 462 463 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did) 464 { 465 struct dmar_domain **domains; 466 int idx = did >> 8; 467 468 domains = iommu->domains[idx]; 469 if (!domains) 470 return NULL; 471 472 return domains[did & 0xff]; 473 } 474 475 static void set_iommu_domain(struct intel_iommu *iommu, u16 did, 476 struct dmar_domain *domain) 477 { 478 struct dmar_domain **domains; 479 int idx = did >> 8; 480 481 if (!iommu->domains[idx]) { 482 size_t size = 256 * sizeof(struct dmar_domain *); 483 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC); 484 } 485 486 domains = iommu->domains[idx]; 487 if (WARN_ON(!domains)) 488 return; 489 else 490 domains[did & 0xff] = domain; 491 } 492 493 void *alloc_pgtable_page(int node) 494 { 495 struct page *page; 496 void *vaddr = NULL; 497 498 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 499 if (page) 500 vaddr = page_address(page); 501 return vaddr; 502 } 503 504 void free_pgtable_page(void *vaddr) 505 { 506 free_page((unsigned long)vaddr); 507 } 508 509 static inline void *alloc_domain_mem(void) 510 { 511 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); 512 } 513 514 static void free_domain_mem(void *vaddr) 515 { 516 kmem_cache_free(iommu_domain_cache, vaddr); 517 } 518 519 static inline void * alloc_devinfo_mem(void) 520 { 521 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); 522 } 523 524 static inline void free_devinfo_mem(void *vaddr) 525 { 526 kmem_cache_free(iommu_devinfo_cache, vaddr); 527 } 528 529 static inline int domain_type_is_si(struct dmar_domain *domain) 530 { 531 return domain->domain.type == IOMMU_DOMAIN_IDENTITY; 532 } 533 534 static inline bool domain_use_first_level(struct dmar_domain *domain) 535 { 536 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 537 } 538 539 static inline int domain_pfn_supported(struct dmar_domain *domain, 540 unsigned long pfn) 541 { 542 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 543 544 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 545 } 546 547 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 548 { 549 unsigned long sagaw; 550 int agaw; 551 552 sagaw = cap_sagaw(iommu->cap); 553 for (agaw = width_to_agaw(max_gaw); 554 agaw >= 0; agaw--) { 555 if (test_bit(agaw, &sagaw)) 556 break; 557 } 558 559 return agaw; 560 } 561 562 /* 563 * Calculate max SAGAW for each iommu. 564 */ 565 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 566 { 567 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 568 } 569 570 /* 571 * calculate agaw for each iommu. 572 * "SAGAW" may be different across iommus, use a default agaw, and 573 * get a supported less agaw for iommus that don't support the default agaw. 574 */ 575 int iommu_calculate_agaw(struct intel_iommu *iommu) 576 { 577 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 578 } 579 580 /* This functionin only returns single iommu in a domain */ 581 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 582 { 583 int iommu_id; 584 585 /* si_domain and vm domain should not get here. 
*/ 586 if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) 587 return NULL; 588 589 for_each_domain_iommu(iommu_id, domain) 590 break; 591 592 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 593 return NULL; 594 595 return g_iommus[iommu_id]; 596 } 597 598 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 599 { 600 return sm_supported(iommu) ? 601 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 602 } 603 604 static void domain_update_iommu_coherency(struct dmar_domain *domain) 605 { 606 struct dmar_drhd_unit *drhd; 607 struct intel_iommu *iommu; 608 bool found = false; 609 int i; 610 611 domain->iommu_coherency = true; 612 613 for_each_domain_iommu(i, domain) { 614 found = true; 615 if (!iommu_paging_structure_coherency(g_iommus[i])) { 616 domain->iommu_coherency = false; 617 break; 618 } 619 } 620 if (found) 621 return; 622 623 /* No hardware attached; use lowest common denominator */ 624 rcu_read_lock(); 625 for_each_active_iommu(iommu, drhd) { 626 if (!iommu_paging_structure_coherency(iommu)) { 627 domain->iommu_coherency = false; 628 break; 629 } 630 } 631 rcu_read_unlock(); 632 } 633 634 static bool domain_update_iommu_snooping(struct intel_iommu *skip) 635 { 636 struct dmar_drhd_unit *drhd; 637 struct intel_iommu *iommu; 638 bool ret = true; 639 640 rcu_read_lock(); 641 for_each_active_iommu(iommu, drhd) { 642 if (iommu != skip) { 643 /* 644 * If the hardware is operating in the scalable mode, 645 * the snooping control is always supported since we 646 * always set PASID-table-entry.PGSNP bit if the domain 647 * is managed outside (UNMANAGED). 648 */ 649 if (!sm_supported(iommu) && 650 !ecap_sc_support(iommu->ecap)) { 651 ret = false; 652 break; 653 } 654 } 655 } 656 rcu_read_unlock(); 657 658 return ret; 659 } 660 661 static int domain_update_iommu_superpage(struct dmar_domain *domain, 662 struct intel_iommu *skip) 663 { 664 struct dmar_drhd_unit *drhd; 665 struct intel_iommu *iommu; 666 int mask = 0x3; 667 668 if (!intel_iommu_superpage) 669 return 0; 670 671 /* set iommu_superpage to the smallest common denominator */ 672 rcu_read_lock(); 673 for_each_active_iommu(iommu, drhd) { 674 if (iommu != skip) { 675 if (domain && domain_use_first_level(domain)) { 676 if (!cap_fl1gp_support(iommu->cap)) 677 mask = 0x1; 678 } else { 679 mask &= cap_super_page_val(iommu->cap); 680 } 681 682 if (!mask) 683 break; 684 } 685 } 686 rcu_read_unlock(); 687 688 return fls(mask); 689 } 690 691 static int domain_update_device_node(struct dmar_domain *domain) 692 { 693 struct device_domain_info *info; 694 int nid = NUMA_NO_NODE; 695 696 assert_spin_locked(&device_domain_lock); 697 698 if (list_empty(&domain->devices)) 699 return NUMA_NO_NODE; 700 701 list_for_each_entry(info, &domain->devices, link) { 702 if (!info->dev) 703 continue; 704 705 /* 706 * There could possibly be multiple device numa nodes as devices 707 * within the same domain may sit behind different IOMMUs. There 708 * isn't perfect answer in such situation, so we select first 709 * come first served policy. 710 */ 711 nid = dev_to_node(info->dev); 712 if (nid != NUMA_NO_NODE) 713 break; 714 } 715 716 return nid; 717 } 718 719 static void domain_update_iotlb(struct dmar_domain *domain); 720 721 /* Return the super pagesize bitmap if supported. */ 722 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) 723 { 724 unsigned long bitmap = 0; 725 726 /* 727 * 1-level super page supports page size of 2MiB, 2-level super page 728 * supports page size of both 2MiB and 1GiB. 
729 */ 730 if (domain->iommu_superpage == 1) 731 bitmap |= SZ_2M; 732 else if (domain->iommu_superpage == 2) 733 bitmap |= SZ_2M | SZ_1G; 734 735 return bitmap; 736 } 737 738 /* Some capabilities may be different across iommus */ 739 static void domain_update_iommu_cap(struct dmar_domain *domain) 740 { 741 domain_update_iommu_coherency(domain); 742 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 743 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 744 745 /* 746 * If RHSA is missing, we should default to the device numa domain 747 * as fall back. 748 */ 749 if (domain->nid == NUMA_NO_NODE) 750 domain->nid = domain_update_device_node(domain); 751 752 /* 753 * First-level translation restricts the input-address to a 754 * canonical address (i.e., address bits 63:N have the same 755 * value as address bit [N-1], where N is 48-bits with 4-level 756 * paging and 57-bits with 5-level paging). Hence, skip bit 757 * [N-1]. 758 */ 759 if (domain_use_first_level(domain)) 760 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 761 else 762 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 763 764 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); 765 domain_update_iotlb(domain); 766 } 767 768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 769 u8 devfn, int alloc) 770 { 771 struct root_entry *root = &iommu->root_entry[bus]; 772 struct context_entry *context; 773 u64 *entry; 774 775 entry = &root->lo; 776 if (sm_supported(iommu)) { 777 if (devfn >= 0x80) { 778 devfn -= 0x80; 779 entry = &root->hi; 780 } 781 devfn *= 2; 782 } 783 if (*entry & 1) 784 context = phys_to_virt(*entry & VTD_PAGE_MASK); 785 else { 786 unsigned long phy_addr; 787 if (!alloc) 788 return NULL; 789 790 context = alloc_pgtable_page(iommu->node); 791 if (!context) 792 return NULL; 793 794 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 795 phy_addr = virt_to_phys((void *)context); 796 *entry = phy_addr | 1; 797 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 798 } 799 return &context[devfn]; 800 } 801 802 static bool attach_deferred(struct device *dev) 803 { 804 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 805 } 806 807 /** 808 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 809 * sub-hierarchy of a candidate PCI-PCI bridge 810 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 811 * @bridge: the candidate PCI-PCI bridge 812 * 813 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 814 */ 815 static bool 816 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 817 { 818 struct pci_dev *pdev, *pbridge; 819 820 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 821 return false; 822 823 pdev = to_pci_dev(dev); 824 pbridge = to_pci_dev(bridge); 825 826 if (pbridge->subordinate && 827 pbridge->subordinate->number <= pdev->bus->number && 828 pbridge->subordinate->busn_res.end >= pdev->bus->number) 829 return true; 830 831 return false; 832 } 833 834 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 835 { 836 struct dmar_drhd_unit *drhd; 837 u32 vtbar; 838 int rc; 839 840 /* We know that this device on this chipset has its own IOMMU. 841 * If we find it under a different IOMMU, then the BIOS is lying 842 * to us. Hope that the IOMMU for this device is actually 843 * disabled, and it needs no translation... 
844 */ 845 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 846 if (rc) { 847 /* "can't" happen */ 848 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 849 return false; 850 } 851 vtbar &= 0xffff0000; 852 853 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 854 drhd = dmar_find_matched_drhd_unit(pdev); 855 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 856 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 857 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 858 return true; 859 } 860 861 return false; 862 } 863 864 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 865 { 866 if (!iommu || iommu->drhd->ignored) 867 return true; 868 869 if (dev_is_pci(dev)) { 870 struct pci_dev *pdev = to_pci_dev(dev); 871 872 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 873 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 874 quirk_ioat_snb_local_iommu(pdev)) 875 return true; 876 } 877 878 return false; 879 } 880 881 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 882 { 883 struct dmar_drhd_unit *drhd = NULL; 884 struct pci_dev *pdev = NULL; 885 struct intel_iommu *iommu; 886 struct device *tmp; 887 u16 segment = 0; 888 int i; 889 890 if (!dev) 891 return NULL; 892 893 if (dev_is_pci(dev)) { 894 struct pci_dev *pf_pdev; 895 896 pdev = pci_real_dma_dev(to_pci_dev(dev)); 897 898 /* VFs aren't listed in scope tables; we need to look up 899 * the PF instead to find the IOMMU. */ 900 pf_pdev = pci_physfn(pdev); 901 dev = &pf_pdev->dev; 902 segment = pci_domain_nr(pdev->bus); 903 } else if (has_acpi_companion(dev)) 904 dev = &ACPI_COMPANION(dev)->dev; 905 906 rcu_read_lock(); 907 for_each_iommu(iommu, drhd) { 908 if (pdev && segment != drhd->segment) 909 continue; 910 911 for_each_active_dev_scope(drhd->devices, 912 drhd->devices_cnt, i, tmp) { 913 if (tmp == dev) { 914 /* For a VF use its original BDF# not that of the PF 915 * which we used for the IOMMU lookup. Strictly speaking 916 * we could do this for all PCI devices; we only need to 917 * get the BDF# from the scope table for ACPI matches. 
*/ 918 if (pdev && pdev->is_virtfn) 919 goto got_pdev; 920 921 if (bus && devfn) { 922 *bus = drhd->devices[i].bus; 923 *devfn = drhd->devices[i].devfn; 924 } 925 goto out; 926 } 927 928 if (is_downstream_to_pci_bridge(dev, tmp)) 929 goto got_pdev; 930 } 931 932 if (pdev && drhd->include_all) { 933 got_pdev: 934 if (bus && devfn) { 935 *bus = pdev->bus->number; 936 *devfn = pdev->devfn; 937 } 938 goto out; 939 } 940 } 941 iommu = NULL; 942 out: 943 if (iommu_is_dummy(iommu, dev)) 944 iommu = NULL; 945 946 rcu_read_unlock(); 947 948 return iommu; 949 } 950 951 static void domain_flush_cache(struct dmar_domain *domain, 952 void *addr, int size) 953 { 954 if (!domain->iommu_coherency) 955 clflush_cache_range(addr, size); 956 } 957 958 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 959 { 960 struct context_entry *context; 961 int ret = 0; 962 unsigned long flags; 963 964 spin_lock_irqsave(&iommu->lock, flags); 965 context = iommu_context_addr(iommu, bus, devfn, 0); 966 if (context) 967 ret = context_present(context); 968 spin_unlock_irqrestore(&iommu->lock, flags); 969 return ret; 970 } 971 972 static void free_context_table(struct intel_iommu *iommu) 973 { 974 int i; 975 unsigned long flags; 976 struct context_entry *context; 977 978 spin_lock_irqsave(&iommu->lock, flags); 979 if (!iommu->root_entry) { 980 goto out; 981 } 982 for (i = 0; i < ROOT_ENTRY_NR; i++) { 983 context = iommu_context_addr(iommu, i, 0, 0); 984 if (context) 985 free_pgtable_page(context); 986 987 if (!sm_supported(iommu)) 988 continue; 989 990 context = iommu_context_addr(iommu, i, 0x80, 0); 991 if (context) 992 free_pgtable_page(context); 993 994 } 995 free_pgtable_page(iommu->root_entry); 996 iommu->root_entry = NULL; 997 out: 998 spin_unlock_irqrestore(&iommu->lock, flags); 999 } 1000 1001 #ifdef CONFIG_DMAR_DEBUG 1002 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn) 1003 { 1004 struct device_domain_info *info; 1005 struct dma_pte *parent, *pte; 1006 struct dmar_domain *domain; 1007 int offset, level; 1008 1009 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); 1010 if (!info || !info->domain) { 1011 pr_info("device [%02x:%02x.%d] not probed\n", 1012 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 1013 return; 1014 } 1015 1016 domain = info->domain; 1017 level = agaw_to_level(domain->agaw); 1018 parent = domain->pgd; 1019 if (!parent) { 1020 pr_info("no page table setup\n"); 1021 return; 1022 } 1023 1024 while (1) { 1025 offset = pfn_level_offset(pfn, level); 1026 pte = &parent[offset]; 1027 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { 1028 pr_info("PTE not present at level %d\n", level); 1029 break; 1030 } 1031 1032 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); 1033 1034 if (level == 1) 1035 break; 1036 1037 parent = phys_to_virt(dma_pte_addr(pte)); 1038 level--; 1039 } 1040 } 1041 1042 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, 1043 unsigned long long addr, u32 pasid) 1044 { 1045 struct pasid_dir_entry *dir, *pde; 1046 struct pasid_entry *entries, *pte; 1047 struct context_entry *ctx_entry; 1048 struct root_entry *rt_entry; 1049 u8 devfn = source_id & 0xff; 1050 u8 bus = source_id >> 8; 1051 int i, dir_index, index; 1052 1053 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); 1054 1055 /* root entry dump */ 1056 rt_entry = &iommu->root_entry[bus]; 1057 if (!rt_entry) { 1058 pr_info("root table entry is not present\n"); 1059 return; 1060 } 1061 1062 if 
(sm_supported(iommu)) 1063 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", 1064 rt_entry->hi, rt_entry->lo); 1065 else 1066 pr_info("root entry: 0x%016llx", rt_entry->lo); 1067 1068 /* context entry dump */ 1069 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); 1070 if (!ctx_entry) { 1071 pr_info("context table entry is not present\n"); 1072 return; 1073 } 1074 1075 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", 1076 ctx_entry->hi, ctx_entry->lo); 1077 1078 /* legacy mode does not require PASID entries */ 1079 if (!sm_supported(iommu)) 1080 goto pgtable_walk; 1081 1082 /* get the pointer to pasid directory entry */ 1083 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); 1084 if (!dir) { 1085 pr_info("pasid directory entry is not present\n"); 1086 return; 1087 } 1088 /* For request-without-pasid, get the pasid from context entry */ 1089 if (intel_iommu_sm && pasid == INVALID_IOASID) 1090 pasid = PASID_RID2PASID; 1091 1092 dir_index = pasid >> PASID_PDE_SHIFT; 1093 pde = &dir[dir_index]; 1094 pr_info("pasid dir entry: 0x%016llx\n", pde->val); 1095 1096 /* get the pointer to the pasid table entry */ 1097 entries = get_pasid_table_from_pde(pde); 1098 if (!entries) { 1099 pr_info("pasid table entry is not present\n"); 1100 return; 1101 } 1102 index = pasid & PASID_PTE_MASK; 1103 pte = &entries[index]; 1104 for (i = 0; i < ARRAY_SIZE(pte->val); i++) 1105 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); 1106 1107 pgtable_walk: 1108 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn); 1109 } 1110 #endif 1111 1112 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 1113 unsigned long pfn, int *target_level) 1114 { 1115 struct dma_pte *parent, *pte; 1116 int level = agaw_to_level(domain->agaw); 1117 int offset; 1118 1119 BUG_ON(!domain->pgd); 1120 1121 if (!domain_pfn_supported(domain, pfn)) 1122 /* Address beyond IOMMU's addressing capabilities. */ 1123 return NULL; 1124 1125 parent = domain->pgd; 1126 1127 while (1) { 1128 void *tmp_page; 1129 1130 offset = pfn_level_offset(pfn, level); 1131 pte = &parent[offset]; 1132 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1133 break; 1134 if (level == *target_level) 1135 break; 1136 1137 if (!dma_pte_present(pte)) { 1138 uint64_t pteval; 1139 1140 tmp_page = alloc_pgtable_page(domain->nid); 1141 1142 if (!tmp_page) 1143 return NULL; 1144 1145 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1146 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1147 if (domain_use_first_level(domain)) { 1148 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1149 if (iommu_is_dma_domain(&domain->domain)) 1150 pteval |= DMA_FL_PTE_ACCESS; 1151 } 1152 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1153 /* Someone else set it while we were thinking; use theirs. 
*/ 1154 free_pgtable_page(tmp_page); 1155 else 1156 domain_flush_cache(domain, pte, sizeof(*pte)); 1157 } 1158 if (level == 1) 1159 break; 1160 1161 parent = phys_to_virt(dma_pte_addr(pte)); 1162 level--; 1163 } 1164 1165 if (!*target_level) 1166 *target_level = level; 1167 1168 return pte; 1169 } 1170 1171 /* return address's pte at specific level */ 1172 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1173 unsigned long pfn, 1174 int level, int *large_page) 1175 { 1176 struct dma_pte *parent, *pte; 1177 int total = agaw_to_level(domain->agaw); 1178 int offset; 1179 1180 parent = domain->pgd; 1181 while (level <= total) { 1182 offset = pfn_level_offset(pfn, total); 1183 pte = &parent[offset]; 1184 if (level == total) 1185 return pte; 1186 1187 if (!dma_pte_present(pte)) { 1188 *large_page = total; 1189 break; 1190 } 1191 1192 if (dma_pte_superpage(pte)) { 1193 *large_page = total; 1194 return pte; 1195 } 1196 1197 parent = phys_to_virt(dma_pte_addr(pte)); 1198 total--; 1199 } 1200 return NULL; 1201 } 1202 1203 /* clear last level pte, a tlb flush should be followed */ 1204 static void dma_pte_clear_range(struct dmar_domain *domain, 1205 unsigned long start_pfn, 1206 unsigned long last_pfn) 1207 { 1208 unsigned int large_page; 1209 struct dma_pte *first_pte, *pte; 1210 1211 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1212 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1213 BUG_ON(start_pfn > last_pfn); 1214 1215 /* we don't need lock here; nobody else touches the iova range */ 1216 do { 1217 large_page = 1; 1218 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1219 if (!pte) { 1220 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1221 continue; 1222 } 1223 do { 1224 dma_clear_pte(pte); 1225 start_pfn += lvl_to_nr_pages(large_page); 1226 pte++; 1227 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1228 1229 domain_flush_cache(domain, first_pte, 1230 (void *)pte - (void *)first_pte); 1231 1232 } while (start_pfn && start_pfn <= last_pfn); 1233 } 1234 1235 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1236 int retain_level, struct dma_pte *pte, 1237 unsigned long pfn, unsigned long start_pfn, 1238 unsigned long last_pfn) 1239 { 1240 pfn = max(start_pfn, pfn); 1241 pte = &pte[pfn_level_offset(pfn, level)]; 1242 1243 do { 1244 unsigned long level_pfn; 1245 struct dma_pte *level_pte; 1246 1247 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1248 goto next; 1249 1250 level_pfn = pfn & level_mask(level); 1251 level_pte = phys_to_virt(dma_pte_addr(pte)); 1252 1253 if (level > 2) { 1254 dma_pte_free_level(domain, level - 1, retain_level, 1255 level_pte, level_pfn, start_pfn, 1256 last_pfn); 1257 } 1258 1259 /* 1260 * Free the page table if we're below the level we want to 1261 * retain and the range covers the entire table. 1262 */ 1263 if (level < retain_level && !(start_pfn > level_pfn || 1264 last_pfn < level_pfn + level_size(level) - 1)) { 1265 dma_clear_pte(pte); 1266 domain_flush_cache(domain, pte, sizeof(*pte)); 1267 free_pgtable_page(level_pte); 1268 } 1269 next: 1270 pfn += level_size(level); 1271 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1272 } 1273 1274 /* 1275 * clear last level (leaf) ptes and free page table pages below the 1276 * level we wish to keep intact. 
1277 */ 1278 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1279 unsigned long start_pfn, 1280 unsigned long last_pfn, 1281 int retain_level) 1282 { 1283 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1284 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1285 BUG_ON(start_pfn > last_pfn); 1286 1287 dma_pte_clear_range(domain, start_pfn, last_pfn); 1288 1289 /* We don't need lock here; nobody else touches the iova range */ 1290 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1291 domain->pgd, 0, start_pfn, last_pfn); 1292 1293 /* free pgd */ 1294 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1295 free_pgtable_page(domain->pgd); 1296 domain->pgd = NULL; 1297 } 1298 } 1299 1300 /* When a page at a given level is being unlinked from its parent, we don't 1301 need to *modify* it at all. All we need to do is make a list of all the 1302 pages which can be freed just as soon as we've flushed the IOTLB and we 1303 know the hardware page-walk will no longer touch them. 1304 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1305 be freed. */ 1306 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1307 int level, struct dma_pte *pte, 1308 struct page *freelist) 1309 { 1310 struct page *pg; 1311 1312 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1313 pg->freelist = freelist; 1314 freelist = pg; 1315 1316 if (level == 1) 1317 return freelist; 1318 1319 pte = page_address(pg); 1320 do { 1321 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1322 freelist = dma_pte_list_pagetables(domain, level - 1, 1323 pte, freelist); 1324 pte++; 1325 } while (!first_pte_in_page(pte)); 1326 1327 return freelist; 1328 } 1329 1330 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1331 struct dma_pte *pte, unsigned long pfn, 1332 unsigned long start_pfn, 1333 unsigned long last_pfn, 1334 struct page *freelist) 1335 { 1336 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1337 1338 pfn = max(start_pfn, pfn); 1339 pte = &pte[pfn_level_offset(pfn, level)]; 1340 1341 do { 1342 unsigned long level_pfn; 1343 1344 if (!dma_pte_present(pte)) 1345 goto next; 1346 1347 level_pfn = pfn & level_mask(level); 1348 1349 /* If range covers entire pagetable, free it */ 1350 if (start_pfn <= level_pfn && 1351 last_pfn >= level_pfn + level_size(level) - 1) { 1352 /* These suborbinate page tables are going away entirely. Don't 1353 bother to clear them; we're just going to *free* them. */ 1354 if (level > 1 && !dma_pte_superpage(pte)) 1355 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1356 1357 dma_clear_pte(pte); 1358 if (!first_pte) 1359 first_pte = pte; 1360 last_pte = pte; 1361 } else if (level > 1) { 1362 /* Recurse down into a level that isn't *entirely* obsolete */ 1363 freelist = dma_pte_clear_level(domain, level - 1, 1364 phys_to_virt(dma_pte_addr(pte)), 1365 level_pfn, start_pfn, last_pfn, 1366 freelist); 1367 } 1368 next: 1369 pfn += level_size(level); 1370 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1371 1372 if (first_pte) 1373 domain_flush_cache(domain, first_pte, 1374 (void *)++last_pte - (void *)first_pte); 1375 1376 return freelist; 1377 } 1378 1379 /* We can't just free the pages because the IOMMU may still be walking 1380 the page tables, and may have cached the intermediate levels. The 1381 pages can only be freed after the IOTLB flush has been done. 
*/ 1382 static struct page *domain_unmap(struct dmar_domain *domain, 1383 unsigned long start_pfn, 1384 unsigned long last_pfn, 1385 struct page *freelist) 1386 { 1387 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1388 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1389 BUG_ON(start_pfn > last_pfn); 1390 1391 /* we don't need lock here; nobody else touches the iova range */ 1392 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1393 domain->pgd, 0, start_pfn, last_pfn, 1394 freelist); 1395 1396 /* free pgd */ 1397 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1398 struct page *pgd_page = virt_to_page(domain->pgd); 1399 pgd_page->freelist = freelist; 1400 freelist = pgd_page; 1401 1402 domain->pgd = NULL; 1403 } 1404 1405 return freelist; 1406 } 1407 1408 static void dma_free_pagelist(struct page *freelist) 1409 { 1410 struct page *pg; 1411 1412 while ((pg = freelist)) { 1413 freelist = pg->freelist; 1414 free_pgtable_page(page_address(pg)); 1415 } 1416 } 1417 1418 /* iommu handling */ 1419 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1420 { 1421 struct root_entry *root; 1422 unsigned long flags; 1423 1424 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1425 if (!root) { 1426 pr_err("Allocating root entry for %s failed\n", 1427 iommu->name); 1428 return -ENOMEM; 1429 } 1430 1431 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1432 1433 spin_lock_irqsave(&iommu->lock, flags); 1434 iommu->root_entry = root; 1435 spin_unlock_irqrestore(&iommu->lock, flags); 1436 1437 return 0; 1438 } 1439 1440 static void iommu_set_root_entry(struct intel_iommu *iommu) 1441 { 1442 u64 addr; 1443 u32 sts; 1444 unsigned long flag; 1445 1446 addr = virt_to_phys(iommu->root_entry); 1447 if (sm_supported(iommu)) 1448 addr |= DMA_RTADDR_SMT; 1449 1450 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1451 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1452 1453 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1454 1455 /* Make sure hardware complete it */ 1456 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1457 readl, (sts & DMA_GSTS_RTPS), sts); 1458 1459 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1460 1461 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 1462 if (sm_supported(iommu)) 1463 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 1464 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 1465 } 1466 1467 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1468 { 1469 u32 val; 1470 unsigned long flag; 1471 1472 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1473 return; 1474 1475 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1476 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1477 1478 /* Make sure hardware complete it */ 1479 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1480 readl, (!(val & DMA_GSTS_WBFS)), val); 1481 1482 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1483 } 1484 1485 /* return value determine if we need a write buffer flush */ 1486 static void __iommu_flush_context(struct intel_iommu *iommu, 1487 u16 did, u16 source_id, u8 function_mask, 1488 u64 type) 1489 { 1490 u64 val = 0; 1491 unsigned long flag; 1492 1493 switch (type) { 1494 case DMA_CCMD_GLOBAL_INVL: 1495 val = DMA_CCMD_GLOBAL_INVL; 1496 break; 1497 case DMA_CCMD_DOMAIN_INVL: 1498 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1499 break; 1500 case DMA_CCMD_DEVICE_INVL: 1501 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1502 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1503 break; 
1504 default: 1505 BUG(); 1506 } 1507 val |= DMA_CCMD_ICC; 1508 1509 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1510 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1511 1512 /* Make sure hardware complete it */ 1513 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1514 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1515 1516 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1517 } 1518 1519 /* return value determine if we need a write buffer flush */ 1520 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1521 u64 addr, unsigned int size_order, u64 type) 1522 { 1523 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1524 u64 val = 0, val_iva = 0; 1525 unsigned long flag; 1526 1527 switch (type) { 1528 case DMA_TLB_GLOBAL_FLUSH: 1529 /* global flush doesn't need set IVA_REG */ 1530 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1531 break; 1532 case DMA_TLB_DSI_FLUSH: 1533 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1534 break; 1535 case DMA_TLB_PSI_FLUSH: 1536 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1537 /* IH bit is passed in as part of address */ 1538 val_iva = size_order | addr; 1539 break; 1540 default: 1541 BUG(); 1542 } 1543 /* Note: set drain read/write */ 1544 #if 0 1545 /* 1546 * This is probably to be super secure.. Looks like we can 1547 * ignore it without any impact. 1548 */ 1549 if (cap_read_drain(iommu->cap)) 1550 val |= DMA_TLB_READ_DRAIN; 1551 #endif 1552 if (cap_write_drain(iommu->cap)) 1553 val |= DMA_TLB_WRITE_DRAIN; 1554 1555 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1556 /* Note: Only uses first TLB reg currently */ 1557 if (val_iva) 1558 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1559 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1560 1561 /* Make sure hardware complete it */ 1562 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1563 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1564 1565 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1566 1567 /* check IOTLB invalidation granularity */ 1568 if (DMA_TLB_IAIG(val) == 0) 1569 pr_err("Flush IOTLB failed\n"); 1570 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1571 pr_debug("TLB flush request %Lx, actual %Lx\n", 1572 (unsigned long long)DMA_TLB_IIRG(type), 1573 (unsigned long long)DMA_TLB_IAIG(val)); 1574 } 1575 1576 static struct device_domain_info * 1577 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1578 u8 bus, u8 devfn) 1579 { 1580 struct device_domain_info *info; 1581 1582 assert_spin_locked(&device_domain_lock); 1583 1584 if (!iommu->qi) 1585 return NULL; 1586 1587 list_for_each_entry(info, &domain->devices, link) 1588 if (info->iommu == iommu && info->bus == bus && 1589 info->devfn == devfn) { 1590 if (info->ats_supported && info->dev) 1591 return info; 1592 break; 1593 } 1594 1595 return NULL; 1596 } 1597 1598 static void domain_update_iotlb(struct dmar_domain *domain) 1599 { 1600 struct device_domain_info *info; 1601 bool has_iotlb_device = false; 1602 1603 assert_spin_locked(&device_domain_lock); 1604 1605 list_for_each_entry(info, &domain->devices, link) 1606 if (info->ats_enabled) { 1607 has_iotlb_device = true; 1608 break; 1609 } 1610 1611 if (!has_iotlb_device) { 1612 struct subdev_domain_info *sinfo; 1613 1614 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1615 info = get_domain_info(sinfo->pdev); 1616 if (info && info->ats_enabled) { 1617 has_iotlb_device = true; 1618 break; 1619 } 1620 } 1621 } 1622 1623 domain->has_iotlb_device = has_iotlb_device; 1624 } 1625 1626 static void iommu_enable_dev_iotlb(struct 
device_domain_info *info) 1627 { 1628 struct pci_dev *pdev; 1629 1630 assert_spin_locked(&device_domain_lock); 1631 1632 if (!info || !dev_is_pci(info->dev)) 1633 return; 1634 1635 pdev = to_pci_dev(info->dev); 1636 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1637 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1638 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1639 * reserved, which should be set to 0. 1640 */ 1641 if (!ecap_dit(info->iommu->ecap)) 1642 info->pfsid = 0; 1643 else { 1644 struct pci_dev *pf_pdev; 1645 1646 /* pdev will be returned if device is not a vf */ 1647 pf_pdev = pci_physfn(pdev); 1648 info->pfsid = pci_dev_id(pf_pdev); 1649 } 1650 1651 #ifdef CONFIG_INTEL_IOMMU_SVM 1652 /* The PCIe spec, in its wisdom, declares that the behaviour of 1653 the device if you enable PASID support after ATS support is 1654 undefined. So always enable PASID support on devices which 1655 have it, even if we can't yet know if we're ever going to 1656 use it. */ 1657 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1658 info->pasid_enabled = 1; 1659 1660 if (info->pri_supported && 1661 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1662 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) 1663 info->pri_enabled = 1; 1664 #endif 1665 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1666 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1667 info->ats_enabled = 1; 1668 domain_update_iotlb(info->domain); 1669 info->ats_qdep = pci_ats_queue_depth(pdev); 1670 } 1671 } 1672 1673 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1674 { 1675 struct pci_dev *pdev; 1676 1677 assert_spin_locked(&device_domain_lock); 1678 1679 if (!dev_is_pci(info->dev)) 1680 return; 1681 1682 pdev = to_pci_dev(info->dev); 1683 1684 if (info->ats_enabled) { 1685 pci_disable_ats(pdev); 1686 info->ats_enabled = 0; 1687 domain_update_iotlb(info->domain); 1688 } 1689 #ifdef CONFIG_INTEL_IOMMU_SVM 1690 if (info->pri_enabled) { 1691 pci_disable_pri(pdev); 1692 info->pri_enabled = 0; 1693 } 1694 if (info->pasid_enabled) { 1695 pci_disable_pasid(pdev); 1696 info->pasid_enabled = 0; 1697 } 1698 #endif 1699 } 1700 1701 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1702 u64 addr, unsigned int mask) 1703 { 1704 u16 sid, qdep; 1705 1706 if (!info || !info->ats_enabled) 1707 return; 1708 1709 sid = info->bus << 8 | info->devfn; 1710 qdep = info->ats_qdep; 1711 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1712 qdep, addr, mask); 1713 } 1714 1715 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1716 u64 addr, unsigned mask) 1717 { 1718 unsigned long flags; 1719 struct device_domain_info *info; 1720 struct subdev_domain_info *sinfo; 1721 1722 if (!domain->has_iotlb_device) 1723 return; 1724 1725 spin_lock_irqsave(&device_domain_lock, flags); 1726 list_for_each_entry(info, &domain->devices, link) 1727 __iommu_flush_dev_iotlb(info, addr, mask); 1728 1729 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1730 info = get_domain_info(sinfo->pdev); 1731 __iommu_flush_dev_iotlb(info, addr, mask); 1732 } 1733 spin_unlock_irqrestore(&device_domain_lock, flags); 1734 } 1735 1736 static void domain_flush_piotlb(struct intel_iommu *iommu, 1737 struct dmar_domain *domain, 1738 u64 addr, unsigned long npages, bool ih) 1739 { 1740 u16 did = domain->iommu_did[iommu->seq_id]; 1741 1742 if (domain->default_pasid) 1743 qi_flush_piotlb(iommu, did, 
domain->default_pasid, 1744 addr, npages, ih); 1745 1746 if (!list_empty(&domain->devices)) 1747 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1748 } 1749 1750 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1751 struct dmar_domain *domain, 1752 unsigned long pfn, unsigned int pages, 1753 int ih, int map) 1754 { 1755 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1756 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1757 u16 did = domain->iommu_did[iommu->seq_id]; 1758 1759 BUG_ON(pages == 0); 1760 1761 if (ih) 1762 ih = 1 << 6; 1763 1764 if (domain_use_first_level(domain)) { 1765 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1766 } else { 1767 /* 1768 * Fallback to domain selective flush if no PSI support or 1769 * the size is too big. PSI requires page size to be 2 ^ x, 1770 * and the base address is naturally aligned to the size. 1771 */ 1772 if (!cap_pgsel_inv(iommu->cap) || 1773 mask > cap_max_amask_val(iommu->cap)) 1774 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1775 DMA_TLB_DSI_FLUSH); 1776 else 1777 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1778 DMA_TLB_PSI_FLUSH); 1779 } 1780 1781 /* 1782 * In caching mode, changes of pages from non-present to present require 1783 * flush. However, device IOTLB doesn't need to be flushed in this case. 1784 */ 1785 if (!cap_caching_mode(iommu->cap) || !map) 1786 iommu_flush_dev_iotlb(domain, addr, mask); 1787 } 1788 1789 /* Notification for newly created mappings */ 1790 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1791 struct dmar_domain *domain, 1792 unsigned long pfn, unsigned int pages) 1793 { 1794 /* 1795 * It's a non-present to present mapping. Only flush if caching mode 1796 * and second level. 1797 */ 1798 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1799 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1800 else 1801 iommu_flush_write_buffer(iommu); 1802 } 1803 1804 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1805 { 1806 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1807 int idx; 1808 1809 for_each_domain_iommu(idx, dmar_domain) { 1810 struct intel_iommu *iommu = g_iommus[idx]; 1811 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1812 1813 if (domain_use_first_level(dmar_domain)) 1814 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); 1815 else 1816 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1817 DMA_TLB_DSI_FLUSH); 1818 1819 if (!cap_caching_mode(iommu->cap)) 1820 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1821 0, MAX_AGAW_PFN_WIDTH); 1822 } 1823 } 1824 1825 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1826 { 1827 u32 pmen; 1828 unsigned long flags; 1829 1830 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1831 return; 1832 1833 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1834 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1835 pmen &= ~DMA_PMEN_EPM; 1836 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1837 1838 /* wait for the protected region status bit to clear */ 1839 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1840 readl, !(pmen & DMA_PMEN_PRS), pmen); 1841 1842 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1843 } 1844 1845 static void iommu_enable_translation(struct intel_iommu *iommu) 1846 { 1847 u32 sts; 1848 unsigned long flags; 1849 1850 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1851 iommu->gcmd |= DMA_GCMD_TE; 1852 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1853 1854 /* Make sure hardware complete it */ 1855 
IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1856 readl, (sts & DMA_GSTS_TES), sts); 1857 1858 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1859 } 1860 1861 static void iommu_disable_translation(struct intel_iommu *iommu) 1862 { 1863 u32 sts; 1864 unsigned long flag; 1865 1866 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1867 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1868 return; 1869 1870 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1871 iommu->gcmd &= ~DMA_GCMD_TE; 1872 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1873 1874 /* Make sure hardware complete it */ 1875 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1876 readl, (!(sts & DMA_GSTS_TES)), sts); 1877 1878 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1879 } 1880 1881 static int iommu_init_domains(struct intel_iommu *iommu) 1882 { 1883 u32 ndomains, nlongs; 1884 size_t size; 1885 1886 ndomains = cap_ndoms(iommu->cap); 1887 pr_debug("%s: Number of Domains supported <%d>\n", 1888 iommu->name, ndomains); 1889 nlongs = BITS_TO_LONGS(ndomains); 1890 1891 spin_lock_init(&iommu->lock); 1892 1893 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1894 if (!iommu->domain_ids) 1895 return -ENOMEM; 1896 1897 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1898 iommu->domains = kzalloc(size, GFP_KERNEL); 1899 1900 if (iommu->domains) { 1901 size = 256 * sizeof(struct dmar_domain *); 1902 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1903 } 1904 1905 if (!iommu->domains || !iommu->domains[0]) { 1906 pr_err("%s: Allocating domain array failed\n", 1907 iommu->name); 1908 kfree(iommu->domain_ids); 1909 kfree(iommu->domains); 1910 iommu->domain_ids = NULL; 1911 iommu->domains = NULL; 1912 return -ENOMEM; 1913 } 1914 1915 /* 1916 * If Caching mode is set, then invalid translations are tagged 1917 * with domain-id 0, hence we need to pre-allocate it. We also 1918 * use domain-id 0 as a marker for non-allocated domain-id, so 1919 * make sure it is not used for a real domain. 1920 */ 1921 set_bit(0, iommu->domain_ids); 1922 1923 /* 1924 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1925 * entry for first-level or pass-through translation modes should 1926 * be programmed with a domain id different from those used for 1927 * second-level or nested translation. We reserve a domain id for 1928 * this purpose. 
1929 */ 1930 if (sm_supported(iommu)) 1931 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1932 1933 return 0; 1934 } 1935 1936 static void disable_dmar_iommu(struct intel_iommu *iommu) 1937 { 1938 struct device_domain_info *info, *tmp; 1939 unsigned long flags; 1940 1941 if (!iommu->domains || !iommu->domain_ids) 1942 return; 1943 1944 spin_lock_irqsave(&device_domain_lock, flags); 1945 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1946 if (info->iommu != iommu) 1947 continue; 1948 1949 if (!info->dev || !info->domain) 1950 continue; 1951 1952 __dmar_remove_one_dev_info(info); 1953 } 1954 spin_unlock_irqrestore(&device_domain_lock, flags); 1955 1956 if (iommu->gcmd & DMA_GCMD_TE) 1957 iommu_disable_translation(iommu); 1958 } 1959 1960 static void free_dmar_iommu(struct intel_iommu *iommu) 1961 { 1962 if ((iommu->domains) && (iommu->domain_ids)) { 1963 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1964 int i; 1965 1966 for (i = 0; i < elems; i++) 1967 kfree(iommu->domains[i]); 1968 kfree(iommu->domains); 1969 kfree(iommu->domain_ids); 1970 iommu->domains = NULL; 1971 iommu->domain_ids = NULL; 1972 } 1973 1974 g_iommus[iommu->seq_id] = NULL; 1975 1976 /* free context mapping */ 1977 free_context_table(iommu); 1978 1979 #ifdef CONFIG_INTEL_IOMMU_SVM 1980 if (pasid_supported(iommu)) { 1981 if (ecap_prs(iommu->ecap)) 1982 intel_svm_finish_prq(iommu); 1983 } 1984 if (vccap_pasid(iommu->vccap)) 1985 ioasid_unregister_allocator(&iommu->pasid_allocator); 1986 1987 #endif 1988 } 1989 1990 /* 1991 * Check and return whether first level is used by default for 1992 * DMA translation. 1993 */ 1994 static bool first_level_by_default(unsigned int type) 1995 { 1996 /* Only SL is available in legacy mode */ 1997 if (!scalable_mode_support()) 1998 return false; 1999 2000 /* Only level (either FL or SL) is available, just use it */ 2001 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) 2002 return intel_cap_flts_sanity(); 2003 2004 /* Both levels are available, decide it based on domain type */ 2005 return type != IOMMU_DOMAIN_UNMANAGED; 2006 } 2007 2008 static struct dmar_domain *alloc_domain(unsigned int type) 2009 { 2010 struct dmar_domain *domain; 2011 2012 domain = alloc_domain_mem(); 2013 if (!domain) 2014 return NULL; 2015 2016 memset(domain, 0, sizeof(*domain)); 2017 domain->nid = NUMA_NO_NODE; 2018 if (first_level_by_default(type)) 2019 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 2020 domain->has_iotlb_device = false; 2021 INIT_LIST_HEAD(&domain->devices); 2022 INIT_LIST_HEAD(&domain->subdevices); 2023 2024 return domain; 2025 } 2026 2027 /* Must be called with iommu->lock */ 2028 static int domain_attach_iommu(struct dmar_domain *domain, 2029 struct intel_iommu *iommu) 2030 { 2031 unsigned long ndomains; 2032 int num; 2033 2034 assert_spin_locked(&device_domain_lock); 2035 assert_spin_locked(&iommu->lock); 2036 2037 domain->iommu_refcnt[iommu->seq_id] += 1; 2038 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 2039 ndomains = cap_ndoms(iommu->cap); 2040 num = find_first_zero_bit(iommu->domain_ids, ndomains); 2041 2042 if (num >= ndomains) { 2043 pr_err("%s: No free domain ids\n", iommu->name); 2044 domain->iommu_refcnt[iommu->seq_id] -= 1; 2045 return -ENOSPC; 2046 } 2047 2048 set_bit(num, iommu->domain_ids); 2049 set_iommu_domain(iommu, num, domain); 2050 2051 domain->iommu_did[iommu->seq_id] = num; 2052 domain->nid = iommu->node; 2053 2054 domain_update_iommu_cap(domain); 2055 } 2056 2057 return 0; 2058 } 2059 2060 static void domain_detach_iommu(struct dmar_domain 
*domain, 2061 struct intel_iommu *iommu) 2062 { 2063 int num; 2064 2065 assert_spin_locked(&device_domain_lock); 2066 assert_spin_locked(&iommu->lock); 2067 2068 domain->iommu_refcnt[iommu->seq_id] -= 1; 2069 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 2070 num = domain->iommu_did[iommu->seq_id]; 2071 clear_bit(num, iommu->domain_ids); 2072 set_iommu_domain(iommu, num, NULL); 2073 2074 domain_update_iommu_cap(domain); 2075 domain->iommu_did[iommu->seq_id] = 0; 2076 } 2077 } 2078 2079 static inline int guestwidth_to_adjustwidth(int gaw) 2080 { 2081 int agaw; 2082 int r = (gaw - 12) % 9; 2083 2084 if (r == 0) 2085 agaw = gaw; 2086 else 2087 agaw = gaw + 9 - r; 2088 if (agaw > 64) 2089 agaw = 64; 2090 return agaw; 2091 } 2092 2093 static void domain_exit(struct dmar_domain *domain) 2094 { 2095 2096 /* Remove associated devices and clear attached or cached domains */ 2097 domain_remove_dev_info(domain); 2098 2099 if (domain->pgd) { 2100 struct page *freelist; 2101 2102 freelist = domain_unmap(domain, 0, 2103 DOMAIN_MAX_PFN(domain->gaw), NULL); 2104 dma_free_pagelist(freelist); 2105 } 2106 2107 free_domain_mem(domain); 2108 } 2109 2110 /* 2111 * Get the PASID directory size for scalable mode context entry. 2112 * Value of X in the PDTS field of a scalable mode context entry 2113 * indicates PASID directory with 2^(X + 7) entries. 2114 */ 2115 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 2116 { 2117 int pds, max_pde; 2118 2119 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2120 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 2121 if (pds < 7) 2122 return 0; 2123 2124 return pds - 7; 2125 } 2126 2127 /* 2128 * Set the RID_PASID field of a scalable mode context entry. The 2129 * IOMMU hardware will use the PASID value set in this field for 2130 * DMA translations of DMA requests without PASID. 2131 */ 2132 static inline void 2133 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2134 { 2135 context->hi |= pasid & ((1 << 20) - 1); 2136 } 2137 2138 /* 2139 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2140 * entry. 2141 */ 2142 static inline void context_set_sm_dte(struct context_entry *context) 2143 { 2144 context->lo |= (1 << 2); 2145 } 2146 2147 /* 2148 * Set the PRE(Page Request Enable) field of a scalable mode context 2149 * entry. 2150 */ 2151 static inline void context_set_sm_pre(struct context_entry *context) 2152 { 2153 context->lo |= (1 << 4); 2154 } 2155 2156 /* Convert value to context PASID directory size field coding. 
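 *
 * Illustrative arithmetic only (the encoding itself is fixed by the
 * hardware interface): context_pdts() places the 3-bit size code at
 * bits 11:9 of the low qword, and a code of X stands for a directory
 * of 2^(X + 7) entries. For example, X = 7 describes a 16384-entry
 * directory, which covers the full 20-bit PASID space if each
 * directory entry spans 64 PASIDs (i.e. assuming PASID_PDE_SHIFT is 6).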
*/ 2157 #define context_pdts(pds) (((pds) & 0x7) << 9) 2158 2159 static int domain_context_mapping_one(struct dmar_domain *domain, 2160 struct intel_iommu *iommu, 2161 struct pasid_table *table, 2162 u8 bus, u8 devfn) 2163 { 2164 u16 did = domain->iommu_did[iommu->seq_id]; 2165 int translation = CONTEXT_TT_MULTI_LEVEL; 2166 struct device_domain_info *info = NULL; 2167 struct context_entry *context; 2168 unsigned long flags; 2169 int ret; 2170 2171 WARN_ON(did == 0); 2172 2173 if (hw_pass_through && domain_type_is_si(domain)) 2174 translation = CONTEXT_TT_PASS_THROUGH; 2175 2176 pr_debug("Set context mapping for %02x:%02x.%d\n", 2177 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2178 2179 BUG_ON(!domain->pgd); 2180 2181 spin_lock_irqsave(&device_domain_lock, flags); 2182 spin_lock(&iommu->lock); 2183 2184 ret = -ENOMEM; 2185 context = iommu_context_addr(iommu, bus, devfn, 1); 2186 if (!context) 2187 goto out_unlock; 2188 2189 ret = 0; 2190 if (context_present(context)) 2191 goto out_unlock; 2192 2193 /* 2194 * For kdump cases, old valid entries may be cached due to the 2195 * in-flight DMA and copied pgtable, but there is no unmapping 2196 * behaviour for them, thus we need an explicit cache flush for 2197 * the newly-mapped device. For kdump, at this point, the device 2198 * is supposed to finish reset at its driver probe stage, so no 2199 * in-flight DMA will exist, and we don't need to worry anymore 2200 * hereafter. 2201 */ 2202 if (context_copied(context)) { 2203 u16 did_old = context_domain_id(context); 2204 2205 if (did_old < cap_ndoms(iommu->cap)) { 2206 iommu->flush.flush_context(iommu, did_old, 2207 (((u16)bus) << 8) | devfn, 2208 DMA_CCMD_MASK_NOBIT, 2209 DMA_CCMD_DEVICE_INVL); 2210 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2211 DMA_TLB_DSI_FLUSH); 2212 } 2213 } 2214 2215 context_clear_entry(context); 2216 2217 if (sm_supported(iommu)) { 2218 unsigned long pds; 2219 2220 WARN_ON(!table); 2221 2222 /* Setup the PASID DIR pointer: */ 2223 pds = context_get_sm_pds(table); 2224 context->lo = (u64)virt_to_phys(table->table) | 2225 context_pdts(pds); 2226 2227 /* Setup the RID_PASID field: */ 2228 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2229 2230 /* 2231 * Setup the Device-TLB enable bit and Page request 2232 * Enable bit: 2233 */ 2234 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2235 if (info && info->ats_supported) 2236 context_set_sm_dte(context); 2237 if (info && info->pri_supported) 2238 context_set_sm_pre(context); 2239 } else { 2240 struct dma_pte *pgd = domain->pgd; 2241 int agaw; 2242 2243 context_set_domain_id(context, did); 2244 2245 if (translation != CONTEXT_TT_PASS_THROUGH) { 2246 /* 2247 * Skip top levels of page tables for iommu which has 2248 * less agaw than default. Unnecessary for PT mode. 2249 */ 2250 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2251 ret = -ENOMEM; 2252 pgd = phys_to_virt(dma_pte_addr(pgd)); 2253 if (!dma_pte_present(pgd)) 2254 goto out_unlock; 2255 } 2256 2257 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2258 if (info && info->ats_supported) 2259 translation = CONTEXT_TT_DEV_IOTLB; 2260 else 2261 translation = CONTEXT_TT_MULTI_LEVEL; 2262 2263 context_set_address_root(context, virt_to_phys(pgd)); 2264 context_set_address_width(context, agaw); 2265 } else { 2266 /* 2267 * In pass through mode, AW must be programmed to 2268 * indicate the largest AGAW value supported by 2269 * hardware. And ASR is ignored by hardware. 
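 * As far as this code is concerned, iommu->msagaw is simply the
 * largest AGAW value advertised by the unit's SAGAW capability
 * field, so programming it here satisfies that requirement without
 * supplying a page-table root.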
2270 */ 2271 context_set_address_width(context, iommu->msagaw); 2272 } 2273 2274 context_set_translation_type(context, translation); 2275 } 2276 2277 context_set_fault_enable(context); 2278 context_set_present(context); 2279 if (!ecap_coherent(iommu->ecap)) 2280 clflush_cache_range(context, sizeof(*context)); 2281 2282 /* 2283 * It's a non-present to present mapping. If hardware doesn't cache 2284 * non-present entry we only need to flush the write-buffer. If the 2285 * _does_ cache non-present entries, then it does so in the special 2286 * domain #0, which we have to flush: 2287 */ 2288 if (cap_caching_mode(iommu->cap)) { 2289 iommu->flush.flush_context(iommu, 0, 2290 (((u16)bus) << 8) | devfn, 2291 DMA_CCMD_MASK_NOBIT, 2292 DMA_CCMD_DEVICE_INVL); 2293 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2294 } else { 2295 iommu_flush_write_buffer(iommu); 2296 } 2297 iommu_enable_dev_iotlb(info); 2298 2299 ret = 0; 2300 2301 out_unlock: 2302 spin_unlock(&iommu->lock); 2303 spin_unlock_irqrestore(&device_domain_lock, flags); 2304 2305 return ret; 2306 } 2307 2308 struct domain_context_mapping_data { 2309 struct dmar_domain *domain; 2310 struct intel_iommu *iommu; 2311 struct pasid_table *table; 2312 }; 2313 2314 static int domain_context_mapping_cb(struct pci_dev *pdev, 2315 u16 alias, void *opaque) 2316 { 2317 struct domain_context_mapping_data *data = opaque; 2318 2319 return domain_context_mapping_one(data->domain, data->iommu, 2320 data->table, PCI_BUS_NUM(alias), 2321 alias & 0xff); 2322 } 2323 2324 static int 2325 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2326 { 2327 struct domain_context_mapping_data data; 2328 struct pasid_table *table; 2329 struct intel_iommu *iommu; 2330 u8 bus, devfn; 2331 2332 iommu = device_to_iommu(dev, &bus, &devfn); 2333 if (!iommu) 2334 return -ENODEV; 2335 2336 table = intel_pasid_get_table(dev); 2337 2338 if (!dev_is_pci(dev)) 2339 return domain_context_mapping_one(domain, iommu, table, 2340 bus, devfn); 2341 2342 data.domain = domain; 2343 data.iommu = iommu; 2344 data.table = table; 2345 2346 return pci_for_each_dma_alias(to_pci_dev(dev), 2347 &domain_context_mapping_cb, &data); 2348 } 2349 2350 static int domain_context_mapped_cb(struct pci_dev *pdev, 2351 u16 alias, void *opaque) 2352 { 2353 struct intel_iommu *iommu = opaque; 2354 2355 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2356 } 2357 2358 static int domain_context_mapped(struct device *dev) 2359 { 2360 struct intel_iommu *iommu; 2361 u8 bus, devfn; 2362 2363 iommu = device_to_iommu(dev, &bus, &devfn); 2364 if (!iommu) 2365 return -ENODEV; 2366 2367 if (!dev_is_pci(dev)) 2368 return device_context_mapped(iommu, bus, devfn); 2369 2370 return !pci_for_each_dma_alias(to_pci_dev(dev), 2371 domain_context_mapped_cb, iommu); 2372 } 2373 2374 /* Returns a number of VTD pages, but aligned to MM page size */ 2375 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2376 size_t size) 2377 { 2378 host_addr &= ~PAGE_MASK; 2379 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2380 } 2381 2382 /* Return largest possible superpage level for a given mapping */ 2383 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2384 unsigned long iov_pfn, 2385 unsigned long phy_pfn, 2386 unsigned long pages) 2387 { 2388 int support, level = 1; 2389 unsigned long pfnmerge; 2390 2391 support = domain->iommu_superpage; 2392 2393 /* To use a large page, the virtual *and* physical addresses 2394 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2395 of them will mean we have to use smaller pages. So just 2396 merge them and check both at once. */ 2397 pfnmerge = iov_pfn | phy_pfn; 2398 2399 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2400 pages >>= VTD_STRIDE_SHIFT; 2401 if (!pages) 2402 break; 2403 pfnmerge >>= VTD_STRIDE_SHIFT; 2404 level++; 2405 support--; 2406 } 2407 return level; 2408 } 2409 2410 /* 2411 * Ensure that old small page tables are removed to make room for superpage(s). 2412 * We're going to add new large pages, so make sure we don't remove their parent 2413 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 2414 */ 2415 static void switch_to_super_page(struct dmar_domain *domain, 2416 unsigned long start_pfn, 2417 unsigned long end_pfn, int level) 2418 { 2419 unsigned long lvl_pages = lvl_to_nr_pages(level); 2420 struct dma_pte *pte = NULL; 2421 int i; 2422 2423 while (start_pfn <= end_pfn) { 2424 if (!pte) 2425 pte = pfn_to_dma_pte(domain, start_pfn, &level); 2426 2427 if (dma_pte_present(pte)) { 2428 dma_pte_free_pagetable(domain, start_pfn, 2429 start_pfn + lvl_pages - 1, 2430 level + 1); 2431 2432 for_each_domain_iommu(i, domain) 2433 iommu_flush_iotlb_psi(g_iommus[i], domain, 2434 start_pfn, lvl_pages, 2435 0, 0); 2436 } 2437 2438 pte++; 2439 start_pfn += lvl_pages; 2440 if (first_pte_in_page(pte)) 2441 pte = NULL; 2442 } 2443 } 2444 2445 static int 2446 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2447 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2448 { 2449 struct dma_pte *first_pte = NULL, *pte = NULL; 2450 unsigned int largepage_lvl = 0; 2451 unsigned long lvl_pages = 0; 2452 phys_addr_t pteval; 2453 u64 attr; 2454 2455 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2456 2457 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2458 return -EINVAL; 2459 2460 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2461 attr |= DMA_FL_PTE_PRESENT; 2462 if (domain_use_first_level(domain)) { 2463 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; 2464 if (prot & DMA_PTE_WRITE) 2465 attr |= DMA_FL_PTE_DIRTY; 2466 } 2467 2468 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2469 2470 while (nr_pages > 0) { 2471 uint64_t tmp; 2472 2473 if (!pte) { 2474 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2475 phys_pfn, nr_pages); 2476 2477 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2478 if (!pte) 2479 return -ENOMEM; 2480 first_pte = pte; 2481 2482 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2483 2484 /* It is large page*/ 2485 if (largepage_lvl > 1) { 2486 unsigned long end_pfn; 2487 unsigned long pages_to_remove; 2488 2489 pteval |= DMA_PTE_LARGE_PAGE; 2490 pages_to_remove = min_t(unsigned long, nr_pages, 2491 nr_pte_to_next_page(pte) * lvl_pages); 2492 end_pfn = iov_pfn + pages_to_remove - 1; 2493 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); 2494 } else { 2495 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2496 } 2497 2498 } 2499 /* We don't need lock here, nobody else 2500 * touches the iova range 2501 */ 2502 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2503 if (tmp) { 2504 static int dumps = 5; 2505 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2506 iov_pfn, tmp, (unsigned long long)pteval); 2507 if (dumps) { 2508 dumps--; 2509 debug_dma_dump_mappings(NULL); 2510 } 2511 WARN_ON(1); 2512 } 2513 2514 nr_pages -= lvl_pages; 2515 iov_pfn += lvl_pages; 2516 phys_pfn += lvl_pages; 2517 pteval += lvl_pages * 
VTD_PAGE_SIZE; 2518 2519 /* If the next PTE would be the first in a new page, then we 2520 * need to flush the cache on the entries we've just written. 2521 * And then we'll need to recalculate 'pte', so clear it and 2522 * let it get set again in the if (!pte) block above. 2523 * 2524 * If we're done (!nr_pages) we need to flush the cache too. 2525 * 2526 * Also if we've been setting superpages, we may need to 2527 * recalculate 'pte' and switch back to smaller pages for the 2528 * end of the mapping, if the trailing size is not enough to 2529 * use another superpage (i.e. nr_pages < lvl_pages). 2530 */ 2531 pte++; 2532 if (!nr_pages || first_pte_in_page(pte) || 2533 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2534 domain_flush_cache(domain, first_pte, 2535 (void *)pte - (void *)first_pte); 2536 pte = NULL; 2537 } 2538 } 2539 2540 return 0; 2541 } 2542 2543 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) 2544 { 2545 struct intel_iommu *iommu = info->iommu; 2546 struct context_entry *context; 2547 unsigned long flags; 2548 u16 did_old; 2549 2550 if (!iommu) 2551 return; 2552 2553 spin_lock_irqsave(&iommu->lock, flags); 2554 context = iommu_context_addr(iommu, bus, devfn, 0); 2555 if (!context) { 2556 spin_unlock_irqrestore(&iommu->lock, flags); 2557 return; 2558 } 2559 2560 if (sm_supported(iommu)) { 2561 if (hw_pass_through && domain_type_is_si(info->domain)) 2562 did_old = FLPT_DEFAULT_DID; 2563 else 2564 did_old = info->domain->iommu_did[iommu->seq_id]; 2565 } else { 2566 did_old = context_domain_id(context); 2567 } 2568 2569 context_clear_entry(context); 2570 __iommu_flush_cache(iommu, context, sizeof(*context)); 2571 spin_unlock_irqrestore(&iommu->lock, flags); 2572 iommu->flush.flush_context(iommu, 2573 did_old, 2574 (((u16)bus) << 8) | devfn, 2575 DMA_CCMD_MASK_NOBIT, 2576 DMA_CCMD_DEVICE_INVL); 2577 2578 if (sm_supported(iommu)) 2579 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); 2580 2581 iommu->flush.flush_iotlb(iommu, 2582 did_old, 2583 0, 2584 0, 2585 DMA_TLB_DSI_FLUSH); 2586 2587 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); 2588 } 2589 2590 static inline void unlink_domain_info(struct device_domain_info *info) 2591 { 2592 assert_spin_locked(&device_domain_lock); 2593 list_del(&info->link); 2594 list_del(&info->global); 2595 if (info->dev) 2596 dev_iommu_priv_set(info->dev, NULL); 2597 } 2598 2599 static void domain_remove_dev_info(struct dmar_domain *domain) 2600 { 2601 struct device_domain_info *info, *tmp; 2602 unsigned long flags; 2603 2604 spin_lock_irqsave(&device_domain_lock, flags); 2605 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2606 __dmar_remove_one_dev_info(info); 2607 spin_unlock_irqrestore(&device_domain_lock, flags); 2608 } 2609 2610 struct dmar_domain *find_domain(struct device *dev) 2611 { 2612 struct device_domain_info *info; 2613 2614 if (unlikely(!dev || !dev->iommu)) 2615 return NULL; 2616 2617 if (unlikely(attach_deferred(dev))) 2618 return NULL; 2619 2620 /* No lock here, assumes no domain exit in normal case */ 2621 info = get_domain_info(dev); 2622 if (likely(info)) 2623 return info->domain; 2624 2625 return NULL; 2626 } 2627 2628 static inline struct device_domain_info * 2629 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2630 { 2631 struct device_domain_info *info; 2632 2633 list_for_each_entry(info, &device_domain_list, global) 2634 if (info->segment == segment && info->bus == bus && 2635 info->devfn == devfn) 2636 return info; 2637 2638 return NULL; 
2639 } 2640 2641 static int domain_setup_first_level(struct intel_iommu *iommu, 2642 struct dmar_domain *domain, 2643 struct device *dev, 2644 u32 pasid) 2645 { 2646 struct dma_pte *pgd = domain->pgd; 2647 int agaw, level; 2648 int flags = 0; 2649 2650 /* 2651 * Skip top levels of page tables for iommu which has 2652 * less agaw than default. Unnecessary for PT mode. 2653 */ 2654 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2655 pgd = phys_to_virt(dma_pte_addr(pgd)); 2656 if (!dma_pte_present(pgd)) 2657 return -ENOMEM; 2658 } 2659 2660 level = agaw_to_level(agaw); 2661 if (level != 4 && level != 5) 2662 return -EINVAL; 2663 2664 if (pasid != PASID_RID2PASID) 2665 flags |= PASID_FLAG_SUPERVISOR_MODE; 2666 if (level == 5) 2667 flags |= PASID_FLAG_FL5LP; 2668 2669 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) 2670 flags |= PASID_FLAG_PAGE_SNOOP; 2671 2672 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2673 domain->iommu_did[iommu->seq_id], 2674 flags); 2675 } 2676 2677 static bool dev_is_real_dma_subdevice(struct device *dev) 2678 { 2679 return dev && dev_is_pci(dev) && 2680 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2681 } 2682 2683 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2684 int bus, int devfn, 2685 struct device *dev, 2686 struct dmar_domain *domain) 2687 { 2688 struct dmar_domain *found = NULL; 2689 struct device_domain_info *info; 2690 unsigned long flags; 2691 int ret; 2692 2693 info = alloc_devinfo_mem(); 2694 if (!info) 2695 return NULL; 2696 2697 if (!dev_is_real_dma_subdevice(dev)) { 2698 info->bus = bus; 2699 info->devfn = devfn; 2700 info->segment = iommu->segment; 2701 } else { 2702 struct pci_dev *pdev = to_pci_dev(dev); 2703 2704 info->bus = pdev->bus->number; 2705 info->devfn = pdev->devfn; 2706 info->segment = pci_domain_nr(pdev->bus); 2707 } 2708 2709 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2710 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2711 info->ats_qdep = 0; 2712 info->dev = dev; 2713 info->domain = domain; 2714 info->iommu = iommu; 2715 info->pasid_table = NULL; 2716 info->auxd_enabled = 0; 2717 INIT_LIST_HEAD(&info->subdevices); 2718 2719 if (dev && dev_is_pci(dev)) { 2720 struct pci_dev *pdev = to_pci_dev(info->dev); 2721 2722 if (ecap_dev_iotlb_support(iommu->ecap) && 2723 pci_ats_supported(pdev) && 2724 dmar_find_matched_atsr_unit(pdev)) 2725 info->ats_supported = 1; 2726 2727 if (sm_supported(iommu)) { 2728 if (pasid_supported(iommu)) { 2729 int features = pci_pasid_features(pdev); 2730 if (features >= 0) 2731 info->pasid_supported = features | 1; 2732 } 2733 2734 if (info->ats_supported && ecap_prs(iommu->ecap) && 2735 pci_pri_supported(pdev)) 2736 info->pri_supported = 1; 2737 } 2738 } 2739 2740 spin_lock_irqsave(&device_domain_lock, flags); 2741 if (dev) 2742 found = find_domain(dev); 2743 2744 if (!found) { 2745 struct device_domain_info *info2; 2746 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2747 info->devfn); 2748 if (info2) { 2749 found = info2->domain; 2750 info2->dev = dev; 2751 } 2752 } 2753 2754 if (found) { 2755 spin_unlock_irqrestore(&device_domain_lock, flags); 2756 free_devinfo_mem(info); 2757 /* Caller must free the original domain */ 2758 return found; 2759 } 2760 2761 spin_lock(&iommu->lock); 2762 ret = domain_attach_iommu(domain, iommu); 2763 spin_unlock(&iommu->lock); 2764 2765 if (ret) { 2766 spin_unlock_irqrestore(&device_domain_lock, flags); 2767 free_devinfo_mem(info); 2768 
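/*
 * Note: domain_attach_iommu() drops its refcount bump again on
 * failure, so no domain id has been consumed at this point and
 * freeing the devinfo above is the only cleanup needed.
 */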
return NULL; 2769 } 2770 2771 list_add(&info->link, &domain->devices); 2772 list_add(&info->global, &device_domain_list); 2773 if (dev) 2774 dev_iommu_priv_set(dev, info); 2775 spin_unlock_irqrestore(&device_domain_lock, flags); 2776 2777 /* PASID table is mandatory for a PCI device in scalable mode. */ 2778 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2779 ret = intel_pasid_alloc_table(dev); 2780 if (ret) { 2781 dev_err(dev, "PASID table allocation failed\n"); 2782 dmar_remove_one_dev_info(dev); 2783 return NULL; 2784 } 2785 2786 /* Setup the PASID entry for requests without PASID: */ 2787 spin_lock_irqsave(&iommu->lock, flags); 2788 if (hw_pass_through && domain_type_is_si(domain)) 2789 ret = intel_pasid_setup_pass_through(iommu, domain, 2790 dev, PASID_RID2PASID); 2791 else if (domain_use_first_level(domain)) 2792 ret = domain_setup_first_level(iommu, domain, dev, 2793 PASID_RID2PASID); 2794 else 2795 ret = intel_pasid_setup_second_level(iommu, domain, 2796 dev, PASID_RID2PASID); 2797 spin_unlock_irqrestore(&iommu->lock, flags); 2798 if (ret) { 2799 dev_err(dev, "Setup RID2PASID failed\n"); 2800 dmar_remove_one_dev_info(dev); 2801 return NULL; 2802 } 2803 } 2804 2805 if (dev && domain_context_mapping(domain, dev)) { 2806 dev_err(dev, "Domain context map failed\n"); 2807 dmar_remove_one_dev_info(dev); 2808 return NULL; 2809 } 2810 2811 return domain; 2812 } 2813 2814 static int iommu_domain_identity_map(struct dmar_domain *domain, 2815 unsigned long first_vpfn, 2816 unsigned long last_vpfn) 2817 { 2818 /* 2819 * RMRR range might have overlap with physical memory range, 2820 * clear it first 2821 */ 2822 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2823 2824 return __domain_mapping(domain, first_vpfn, 2825 first_vpfn, last_vpfn - first_vpfn + 1, 2826 DMA_PTE_READ|DMA_PTE_WRITE); 2827 } 2828 2829 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2830 2831 static int __init si_domain_init(int hw) 2832 { 2833 struct dmar_rmrr_unit *rmrr; 2834 struct device *dev; 2835 int i, nid, ret; 2836 2837 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); 2838 if (!si_domain) 2839 return -EFAULT; 2840 2841 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2842 domain_exit(si_domain); 2843 return -EFAULT; 2844 } 2845 2846 if (hw) 2847 return 0; 2848 2849 for_each_online_node(nid) { 2850 unsigned long start_pfn, end_pfn; 2851 int i; 2852 2853 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2854 ret = iommu_domain_identity_map(si_domain, 2855 mm_to_dma_pfn(start_pfn), 2856 mm_to_dma_pfn(end_pfn)); 2857 if (ret) 2858 return ret; 2859 } 2860 } 2861 2862 /* 2863 * Identity map the RMRRs so that devices with RMRRs could also use 2864 * the si_domain. 
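 * A range whose end lies below its start, or above the address width
 * of si_domain, is treated as firmware breakage below: it is skipped
 * with a warning instead of being mapped.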
2865 */ 2866 for_each_rmrr_units(rmrr) { 2867 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2868 i, dev) { 2869 unsigned long long start = rmrr->base_address; 2870 unsigned long long end = rmrr->end_address; 2871 2872 if (WARN_ON(end < start || 2873 end >> agaw_to_width(si_domain->agaw))) 2874 continue; 2875 2876 ret = iommu_domain_identity_map(si_domain, 2877 mm_to_dma_pfn(start >> PAGE_SHIFT), 2878 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2879 if (ret) 2880 return ret; 2881 } 2882 } 2883 2884 return 0; 2885 } 2886 2887 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2888 { 2889 struct dmar_domain *ndomain; 2890 struct intel_iommu *iommu; 2891 u8 bus, devfn; 2892 2893 iommu = device_to_iommu(dev, &bus, &devfn); 2894 if (!iommu) 2895 return -ENODEV; 2896 2897 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2898 if (ndomain != domain) 2899 return -EBUSY; 2900 2901 return 0; 2902 } 2903 2904 static bool device_has_rmrr(struct device *dev) 2905 { 2906 struct dmar_rmrr_unit *rmrr; 2907 struct device *tmp; 2908 int i; 2909 2910 rcu_read_lock(); 2911 for_each_rmrr_units(rmrr) { 2912 /* 2913 * Return TRUE if this RMRR contains the device that 2914 * is passed in. 2915 */ 2916 for_each_active_dev_scope(rmrr->devices, 2917 rmrr->devices_cnt, i, tmp) 2918 if (tmp == dev || 2919 is_downstream_to_pci_bridge(dev, tmp)) { 2920 rcu_read_unlock(); 2921 return true; 2922 } 2923 } 2924 rcu_read_unlock(); 2925 return false; 2926 } 2927 2928 /** 2929 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2930 * is relaxable (ie. is allowed to be not enforced under some conditions) 2931 * @dev: device handle 2932 * 2933 * We assume that PCI USB devices with RMRRs have them largely 2934 * for historical reasons and that the RMRR space is not actively used post 2935 * boot. This exclusion may change if vendors begin to abuse it. 2936 * 2937 * The same exception is made for graphics devices, with the requirement that 2938 * any use of the RMRR regions will be torn down before assigning the device 2939 * to a guest. 2940 * 2941 * Return: true if the RMRR is relaxable, false otherwise 2942 */ 2943 static bool device_rmrr_is_relaxable(struct device *dev) 2944 { 2945 struct pci_dev *pdev; 2946 2947 if (!dev_is_pci(dev)) 2948 return false; 2949 2950 pdev = to_pci_dev(dev); 2951 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2952 return true; 2953 else 2954 return false; 2955 } 2956 2957 /* 2958 * There are a couple cases where we need to restrict the functionality of 2959 * devices associated with RMRRs. The first is when evaluating a device for 2960 * identity mapping because problems exist when devices are moved in and out 2961 * of domains and their respective RMRR information is lost. This means that 2962 * a device with associated RMRRs will never be in a "passthrough" domain. 2963 * The second is use of the device through the IOMMU API. This interface 2964 * expects to have full control of the IOVA space for the device. We cannot 2965 * satisfy both the requirement that RMRR access is maintained and have an 2966 * unencumbered IOVA space. We also have no ability to quiesce the device's 2967 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2968 * We therefore prevent devices associated with an RMRR from participating in 2969 * the IOMMU API, which eliminates them from device assignment. 2970 * 2971 * In both cases, devices which have relaxable RMRRs are not concerned by this 2972 * restriction. 
See device_rmrr_is_relaxable comment. 2973 */ 2974 static bool device_is_rmrr_locked(struct device *dev) 2975 { 2976 if (!device_has_rmrr(dev)) 2977 return false; 2978 2979 if (device_rmrr_is_relaxable(dev)) 2980 return false; 2981 2982 return true; 2983 } 2984 2985 /* 2986 * Return the required default domain type for a specific device. 2987 * 2988 * @dev: the device in query 2989 * @startup: true if this is during early boot 2990 * 2991 * Returns: 2992 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2993 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2994 * - 0: both identity and dynamic domains work for this device 2995 */ 2996 static int device_def_domain_type(struct device *dev) 2997 { 2998 if (dev_is_pci(dev)) { 2999 struct pci_dev *pdev = to_pci_dev(dev); 3000 3001 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 3002 return IOMMU_DOMAIN_IDENTITY; 3003 3004 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 3005 return IOMMU_DOMAIN_IDENTITY; 3006 } 3007 3008 return 0; 3009 } 3010 3011 static void intel_iommu_init_qi(struct intel_iommu *iommu) 3012 { 3013 /* 3014 * Start from the sane iommu hardware state. 3015 * If the queued invalidation is already initialized by us 3016 * (for example, while enabling interrupt-remapping) then 3017 * we got the things already rolling from a sane state. 3018 */ 3019 if (!iommu->qi) { 3020 /* 3021 * Clear any previous faults. 3022 */ 3023 dmar_fault(-1, iommu); 3024 /* 3025 * Disable queued invalidation if supported and already enabled 3026 * before OS handover. 3027 */ 3028 dmar_disable_qi(iommu); 3029 } 3030 3031 if (dmar_enable_qi(iommu)) { 3032 /* 3033 * Queued Invalidate not enabled, use Register Based Invalidate 3034 */ 3035 iommu->flush.flush_context = __iommu_flush_context; 3036 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 3037 pr_info("%s: Using Register based invalidation\n", 3038 iommu->name); 3039 } else { 3040 iommu->flush.flush_context = qi_flush_context; 3041 iommu->flush.flush_iotlb = qi_flush_iotlb; 3042 pr_info("%s: Using Queued invalidation\n", iommu->name); 3043 } 3044 } 3045 3046 static int copy_context_table(struct intel_iommu *iommu, 3047 struct root_entry *old_re, 3048 struct context_entry **tbl, 3049 int bus, bool ext) 3050 { 3051 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 3052 struct context_entry *new_ce = NULL, ce; 3053 struct context_entry *old_ce = NULL; 3054 struct root_entry re; 3055 phys_addr_t old_ce_phys; 3056 3057 tbl_idx = ext ? bus * 2 : bus; 3058 memcpy(&re, old_re, sizeof(re)); 3059 3060 for (devfn = 0; devfn < 256; devfn++) { 3061 /* First calculate the correct index */ 3062 idx = (ext ? 
devfn * 2 : devfn) % 256; 3063 3064 if (idx == 0) { 3065 /* First save what we may have and clean up */ 3066 if (new_ce) { 3067 tbl[tbl_idx] = new_ce; 3068 __iommu_flush_cache(iommu, new_ce, 3069 VTD_PAGE_SIZE); 3070 pos = 1; 3071 } 3072 3073 if (old_ce) 3074 memunmap(old_ce); 3075 3076 ret = 0; 3077 if (devfn < 0x80) 3078 old_ce_phys = root_entry_lctp(&re); 3079 else 3080 old_ce_phys = root_entry_uctp(&re); 3081 3082 if (!old_ce_phys) { 3083 if (ext && devfn == 0) { 3084 /* No LCTP, try UCTP */ 3085 devfn = 0x7f; 3086 continue; 3087 } else { 3088 goto out; 3089 } 3090 } 3091 3092 ret = -ENOMEM; 3093 old_ce = memremap(old_ce_phys, PAGE_SIZE, 3094 MEMREMAP_WB); 3095 if (!old_ce) 3096 goto out; 3097 3098 new_ce = alloc_pgtable_page(iommu->node); 3099 if (!new_ce) 3100 goto out_unmap; 3101 3102 ret = 0; 3103 } 3104 3105 /* Now copy the context entry */ 3106 memcpy(&ce, old_ce + idx, sizeof(ce)); 3107 3108 if (!__context_present(&ce)) 3109 continue; 3110 3111 did = context_domain_id(&ce); 3112 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3113 set_bit(did, iommu->domain_ids); 3114 3115 /* 3116 * We need a marker for copied context entries. This 3117 * marker needs to work for the old format as well as 3118 * for extended context entries. 3119 * 3120 * Bit 67 of the context entry is used. In the old 3121 * format this bit is available to software, in the 3122 * extended format it is the PGE bit, but PGE is ignored 3123 * by HW if PASIDs are disabled (and thus still 3124 * available). 3125 * 3126 * So disable PASIDs first and then mark the entry 3127 * copied. This means that we don't copy PASID 3128 * translations from the old kernel, but this is fine as 3129 * faults there are not fatal. 3130 */ 3131 context_clear_pasid_enable(&ce); 3132 context_set_copied(&ce); 3133 3134 new_ce[idx] = ce; 3135 } 3136 3137 tbl[tbl_idx + pos] = new_ce; 3138 3139 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3140 3141 out_unmap: 3142 memunmap(old_ce); 3143 3144 out: 3145 return ret; 3146 } 3147 3148 static int copy_translation_tables(struct intel_iommu *iommu) 3149 { 3150 struct context_entry **ctxt_tbls; 3151 struct root_entry *old_rt; 3152 phys_addr_t old_rt_phys; 3153 int ctxt_table_entries; 3154 unsigned long flags; 3155 u64 rtaddr_reg; 3156 int bus, ret; 3157 bool new_ext, ext; 3158 3159 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3160 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3161 new_ext = !!ecap_ecs(iommu->ecap); 3162 3163 /* 3164 * The RTT bit can only be changed when translation is disabled, 3165 * but disabling translation means to open a window for data 3166 * corruption. So bail out and don't copy anything if we would 3167 * have to change the bit. 3168 */ 3169 if (new_ext != ext) 3170 return -EINVAL; 3171 3172 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3173 if (!old_rt_phys) 3174 return -EINVAL; 3175 3176 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3177 if (!old_rt) 3178 return -ENOMEM; 3179 3180 /* This is too big for the stack - allocate it from slab */ 3181 ctxt_table_entries = ext ? 
512 : 256; 3182 ret = -ENOMEM; 3183 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3184 if (!ctxt_tbls) 3185 goto out_unmap; 3186 3187 for (bus = 0; bus < 256; bus++) { 3188 ret = copy_context_table(iommu, &old_rt[bus], 3189 ctxt_tbls, bus, ext); 3190 if (ret) { 3191 pr_err("%s: Failed to copy context table for bus %d\n", 3192 iommu->name, bus); 3193 continue; 3194 } 3195 } 3196 3197 spin_lock_irqsave(&iommu->lock, flags); 3198 3199 /* Context tables are copied, now write them to the root_entry table */ 3200 for (bus = 0; bus < 256; bus++) { 3201 int idx = ext ? bus * 2 : bus; 3202 u64 val; 3203 3204 if (ctxt_tbls[idx]) { 3205 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3206 iommu->root_entry[bus].lo = val; 3207 } 3208 3209 if (!ext || !ctxt_tbls[idx + 1]) 3210 continue; 3211 3212 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3213 iommu->root_entry[bus].hi = val; 3214 } 3215 3216 spin_unlock_irqrestore(&iommu->lock, flags); 3217 3218 kfree(ctxt_tbls); 3219 3220 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3221 3222 ret = 0; 3223 3224 out_unmap: 3225 memunmap(old_rt); 3226 3227 return ret; 3228 } 3229 3230 #ifdef CONFIG_INTEL_IOMMU_SVM 3231 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3232 { 3233 struct intel_iommu *iommu = data; 3234 ioasid_t ioasid; 3235 3236 if (!iommu) 3237 return INVALID_IOASID; 3238 /* 3239 * VT-d virtual command interface always uses the full 20 bit 3240 * PASID range. Host can partition guest PASID range based on 3241 * policies but it is out of guest's control. 3242 */ 3243 if (min < PASID_MIN || max > intel_pasid_max_id) 3244 return INVALID_IOASID; 3245 3246 if (vcmd_alloc_pasid(iommu, &ioasid)) 3247 return INVALID_IOASID; 3248 3249 return ioasid; 3250 } 3251 3252 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3253 { 3254 struct intel_iommu *iommu = data; 3255 3256 if (!iommu) 3257 return; 3258 /* 3259 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3260 * We can only free the PASID when all the devices are unbound. 3261 */ 3262 if (ioasid_find(NULL, ioasid, NULL)) { 3263 pr_alert("Cannot free active IOASID %d\n", ioasid); 3264 return; 3265 } 3266 vcmd_free_pasid(iommu, ioasid); 3267 } 3268 3269 static void register_pasid_allocator(struct intel_iommu *iommu) 3270 { 3271 /* 3272 * If we are running in the host, no need for custom allocator 3273 * in that PASIDs are allocated from the host system-wide. 3274 */ 3275 if (!cap_caching_mode(iommu->cap)) 3276 return; 3277 3278 if (!sm_supported(iommu)) { 3279 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3280 return; 3281 } 3282 3283 /* 3284 * Register a custom PASID allocator if we are running in a guest, 3285 * guest PASID must be obtained via virtual command interface. 3286 * There can be multiple vIOMMUs in each guest but only one allocator 3287 * is active. All vIOMMU allocators will eventually be calling the same 3288 * host allocator. 3289 */ 3290 if (!vccap_pasid(iommu->vccap)) 3291 return; 3292 3293 pr_info("Register custom PASID allocator\n"); 3294 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3295 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3296 iommu->pasid_allocator.pdata = (void *)iommu; 3297 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3298 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3299 /* 3300 * Disable scalable mode on this IOMMU if there 3301 * is no custom allocator. 
Mixing SM capable vIOMMU 3302 * and non-SM vIOMMU are not supported. 3303 */ 3304 intel_iommu_sm = 0; 3305 } 3306 } 3307 #endif 3308 3309 static int __init init_dmars(void) 3310 { 3311 struct dmar_drhd_unit *drhd; 3312 struct intel_iommu *iommu; 3313 int ret; 3314 3315 /* 3316 * for each drhd 3317 * allocate root 3318 * initialize and program root entry to not present 3319 * endfor 3320 */ 3321 for_each_drhd_unit(drhd) { 3322 /* 3323 * lock not needed as this is only incremented in the single 3324 * threaded kernel __init code path all other access are read 3325 * only 3326 */ 3327 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3328 g_num_of_iommus++; 3329 continue; 3330 } 3331 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3332 } 3333 3334 /* Preallocate enough resources for IOMMU hot-addition */ 3335 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3336 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3337 3338 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3339 GFP_KERNEL); 3340 if (!g_iommus) { 3341 ret = -ENOMEM; 3342 goto error; 3343 } 3344 3345 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3346 if (ret) 3347 goto free_iommu; 3348 3349 for_each_iommu(iommu, drhd) { 3350 if (drhd->ignored) { 3351 iommu_disable_translation(iommu); 3352 continue; 3353 } 3354 3355 /* 3356 * Find the max pasid size of all IOMMU's in the system. 3357 * We need to ensure the system pasid table is no bigger 3358 * than the smallest supported. 3359 */ 3360 if (pasid_supported(iommu)) { 3361 u32 temp = 2 << ecap_pss(iommu->ecap); 3362 3363 intel_pasid_max_id = min_t(u32, temp, 3364 intel_pasid_max_id); 3365 } 3366 3367 g_iommus[iommu->seq_id] = iommu; 3368 3369 intel_iommu_init_qi(iommu); 3370 3371 ret = iommu_init_domains(iommu); 3372 if (ret) 3373 goto free_iommu; 3374 3375 init_translation_status(iommu); 3376 3377 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3378 iommu_disable_translation(iommu); 3379 clear_translation_pre_enabled(iommu); 3380 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3381 iommu->name); 3382 } 3383 3384 /* 3385 * TBD: 3386 * we could share the same root & context tables 3387 * among all IOMMU's. Need to Split it later. 3388 */ 3389 ret = iommu_alloc_root_entry(iommu); 3390 if (ret) 3391 goto free_iommu; 3392 3393 if (translation_pre_enabled(iommu)) { 3394 pr_info("Translation already enabled - trying to copy translation structures\n"); 3395 3396 ret = copy_translation_tables(iommu); 3397 if (ret) { 3398 /* 3399 * We found the IOMMU with translation 3400 * enabled - but failed to copy over the 3401 * old root-entry table. Try to proceed 3402 * by disabling translation now and 3403 * allocating a clean root-entry table. 3404 * This might cause DMAR faults, but 3405 * probably the dump will still succeed. 3406 */ 3407 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3408 iommu->name); 3409 iommu_disable_translation(iommu); 3410 clear_translation_pre_enabled(iommu); 3411 } else { 3412 pr_info("Copied translation tables from previous kernel for %s\n", 3413 iommu->name); 3414 } 3415 } 3416 3417 if (!ecap_pass_through(iommu->ecap)) 3418 hw_pass_through = 0; 3419 intel_svm_check(iommu); 3420 } 3421 3422 /* 3423 * Now that qi is enabled on all iommus, set the root entry and flush 3424 * caches. This is required on some Intel X58 chipsets, otherwise the 3425 * flush_context function will loop forever and the boot hangs. 
3426 */ 3427 for_each_active_iommu(iommu, drhd) { 3428 iommu_flush_write_buffer(iommu); 3429 #ifdef CONFIG_INTEL_IOMMU_SVM 3430 register_pasid_allocator(iommu); 3431 #endif 3432 iommu_set_root_entry(iommu); 3433 } 3434 3435 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3436 dmar_map_gfx = 0; 3437 #endif 3438 3439 if (!dmar_map_gfx) 3440 iommu_identity_mapping |= IDENTMAP_GFX; 3441 3442 check_tylersburg_isoch(); 3443 3444 ret = si_domain_init(hw_pass_through); 3445 if (ret) 3446 goto free_iommu; 3447 3448 /* 3449 * for each drhd 3450 * enable fault log 3451 * global invalidate context cache 3452 * global invalidate iotlb 3453 * enable translation 3454 */ 3455 for_each_iommu(iommu, drhd) { 3456 if (drhd->ignored) { 3457 /* 3458 * we always have to disable PMRs or DMA may fail on 3459 * this device 3460 */ 3461 if (force_on) 3462 iommu_disable_protect_mem_regions(iommu); 3463 continue; 3464 } 3465 3466 iommu_flush_write_buffer(iommu); 3467 3468 #ifdef CONFIG_INTEL_IOMMU_SVM 3469 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3470 /* 3471 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3472 * could cause possible lock race condition. 3473 */ 3474 up_write(&dmar_global_lock); 3475 ret = intel_svm_enable_prq(iommu); 3476 down_write(&dmar_global_lock); 3477 if (ret) 3478 goto free_iommu; 3479 } 3480 #endif 3481 ret = dmar_set_interrupt(iommu); 3482 if (ret) 3483 goto free_iommu; 3484 } 3485 3486 return 0; 3487 3488 free_iommu: 3489 for_each_active_iommu(iommu, drhd) { 3490 disable_dmar_iommu(iommu); 3491 free_dmar_iommu(iommu); 3492 } 3493 3494 kfree(g_iommus); 3495 3496 error: 3497 return ret; 3498 } 3499 3500 static inline int iommu_domain_cache_init(void) 3501 { 3502 int ret = 0; 3503 3504 iommu_domain_cache = kmem_cache_create("iommu_domain", 3505 sizeof(struct dmar_domain), 3506 0, 3507 SLAB_HWCACHE_ALIGN, 3508 3509 NULL); 3510 if (!iommu_domain_cache) { 3511 pr_err("Couldn't create iommu_domain cache\n"); 3512 ret = -ENOMEM; 3513 } 3514 3515 return ret; 3516 } 3517 3518 static inline int iommu_devinfo_cache_init(void) 3519 { 3520 int ret = 0; 3521 3522 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3523 sizeof(struct device_domain_info), 3524 0, 3525 SLAB_HWCACHE_ALIGN, 3526 NULL); 3527 if (!iommu_devinfo_cache) { 3528 pr_err("Couldn't create devinfo cache\n"); 3529 ret = -ENOMEM; 3530 } 3531 3532 return ret; 3533 } 3534 3535 static int __init iommu_init_mempool(void) 3536 { 3537 int ret; 3538 ret = iova_cache_get(); 3539 if (ret) 3540 return ret; 3541 3542 ret = iommu_domain_cache_init(); 3543 if (ret) 3544 goto domain_error; 3545 3546 ret = iommu_devinfo_cache_init(); 3547 if (!ret) 3548 return ret; 3549 3550 kmem_cache_destroy(iommu_domain_cache); 3551 domain_error: 3552 iova_cache_put(); 3553 3554 return -ENOMEM; 3555 } 3556 3557 static void __init iommu_exit_mempool(void) 3558 { 3559 kmem_cache_destroy(iommu_devinfo_cache); 3560 kmem_cache_destroy(iommu_domain_cache); 3561 iova_cache_put(); 3562 } 3563 3564 static void __init init_no_remapping_devices(void) 3565 { 3566 struct dmar_drhd_unit *drhd; 3567 struct device *dev; 3568 int i; 3569 3570 for_each_drhd_unit(drhd) { 3571 if (!drhd->include_all) { 3572 for_each_active_dev_scope(drhd->devices, 3573 drhd->devices_cnt, i, dev) 3574 break; 3575 /* ignore DMAR unit if no devices exist */ 3576 if (i == drhd->devices_cnt) 3577 drhd->ignored = 1; 3578 } 3579 } 3580 3581 for_each_active_drhd_unit(drhd) { 3582 if (drhd->include_all) 3583 continue; 3584 3585 for_each_active_dev_scope(drhd->devices, 3586 drhd->devices_cnt, i, 
dev) 3587 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3588 break; 3589 if (i < drhd->devices_cnt) 3590 continue; 3591 3592 /* This IOMMU has *only* gfx devices. Either bypass it or 3593 set the gfx_mapped flag, as appropriate */ 3594 drhd->gfx_dedicated = 1; 3595 if (!dmar_map_gfx) 3596 drhd->ignored = 1; 3597 } 3598 } 3599 3600 #ifdef CONFIG_SUSPEND 3601 static int init_iommu_hw(void) 3602 { 3603 struct dmar_drhd_unit *drhd; 3604 struct intel_iommu *iommu = NULL; 3605 3606 for_each_active_iommu(iommu, drhd) 3607 if (iommu->qi) 3608 dmar_reenable_qi(iommu); 3609 3610 for_each_iommu(iommu, drhd) { 3611 if (drhd->ignored) { 3612 /* 3613 * we always have to disable PMRs or DMA may fail on 3614 * this device 3615 */ 3616 if (force_on) 3617 iommu_disable_protect_mem_regions(iommu); 3618 continue; 3619 } 3620 3621 iommu_flush_write_buffer(iommu); 3622 iommu_set_root_entry(iommu); 3623 iommu_enable_translation(iommu); 3624 iommu_disable_protect_mem_regions(iommu); 3625 } 3626 3627 return 0; 3628 } 3629 3630 static void iommu_flush_all(void) 3631 { 3632 struct dmar_drhd_unit *drhd; 3633 struct intel_iommu *iommu; 3634 3635 for_each_active_iommu(iommu, drhd) { 3636 iommu->flush.flush_context(iommu, 0, 0, 0, 3637 DMA_CCMD_GLOBAL_INVL); 3638 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3639 DMA_TLB_GLOBAL_FLUSH); 3640 } 3641 } 3642 3643 static int iommu_suspend(void) 3644 { 3645 struct dmar_drhd_unit *drhd; 3646 struct intel_iommu *iommu = NULL; 3647 unsigned long flag; 3648 3649 for_each_active_iommu(iommu, drhd) { 3650 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3651 GFP_KERNEL); 3652 if (!iommu->iommu_state) 3653 goto nomem; 3654 } 3655 3656 iommu_flush_all(); 3657 3658 for_each_active_iommu(iommu, drhd) { 3659 iommu_disable_translation(iommu); 3660 3661 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3662 3663 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3664 readl(iommu->reg + DMAR_FECTL_REG); 3665 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3666 readl(iommu->reg + DMAR_FEDATA_REG); 3667 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3668 readl(iommu->reg + DMAR_FEADDR_REG); 3669 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3670 readl(iommu->reg + DMAR_FEUADDR_REG); 3671 3672 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3673 } 3674 return 0; 3675 3676 nomem: 3677 for_each_active_iommu(iommu, drhd) 3678 kfree(iommu->iommu_state); 3679 3680 return -ENOMEM; 3681 } 3682 3683 static void iommu_resume(void) 3684 { 3685 struct dmar_drhd_unit *drhd; 3686 struct intel_iommu *iommu = NULL; 3687 unsigned long flag; 3688 3689 if (init_iommu_hw()) { 3690 if (force_on) 3691 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3692 else 3693 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3694 return; 3695 } 3696 3697 for_each_active_iommu(iommu, drhd) { 3698 3699 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3700 3701 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3702 iommu->reg + DMAR_FECTL_REG); 3703 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3704 iommu->reg + DMAR_FEDATA_REG); 3705 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3706 iommu->reg + DMAR_FEADDR_REG); 3707 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3708 iommu->reg + DMAR_FEUADDR_REG); 3709 3710 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3711 } 3712 3713 for_each_active_iommu(iommu, drhd) 3714 kfree(iommu->iommu_state); 3715 } 3716 3717 static struct syscore_ops iommu_syscore_ops = { 3718 .resume = iommu_resume, 3719 .suspend = iommu_suspend, 3720 }; 3721 3722 static void __init 
init_iommu_pm_ops(void) 3723 { 3724 register_syscore_ops(&iommu_syscore_ops); 3725 } 3726 3727 #else 3728 static inline void init_iommu_pm_ops(void) {} 3729 #endif /* CONFIG_PM */ 3730 3731 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3732 { 3733 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3734 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3735 rmrr->end_address <= rmrr->base_address || 3736 arch_rmrr_sanity_check(rmrr)) 3737 return -EINVAL; 3738 3739 return 0; 3740 } 3741 3742 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3743 { 3744 struct acpi_dmar_reserved_memory *rmrr; 3745 struct dmar_rmrr_unit *rmrru; 3746 3747 rmrr = (struct acpi_dmar_reserved_memory *)header; 3748 if (rmrr_sanity_check(rmrr)) { 3749 pr_warn(FW_BUG 3750 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3751 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3752 rmrr->base_address, rmrr->end_address, 3753 dmi_get_system_info(DMI_BIOS_VENDOR), 3754 dmi_get_system_info(DMI_BIOS_VERSION), 3755 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3756 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3757 } 3758 3759 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3760 if (!rmrru) 3761 goto out; 3762 3763 rmrru->hdr = header; 3764 3765 rmrru->base_address = rmrr->base_address; 3766 rmrru->end_address = rmrr->end_address; 3767 3768 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3769 ((void *)rmrr) + rmrr->header.length, 3770 &rmrru->devices_cnt); 3771 if (rmrru->devices_cnt && rmrru->devices == NULL) 3772 goto free_rmrru; 3773 3774 list_add(&rmrru->list, &dmar_rmrr_units); 3775 3776 return 0; 3777 free_rmrru: 3778 kfree(rmrru); 3779 out: 3780 return -ENOMEM; 3781 } 3782 3783 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3784 { 3785 struct dmar_atsr_unit *atsru; 3786 struct acpi_dmar_atsr *tmp; 3787 3788 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3789 dmar_rcu_check()) { 3790 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3791 if (atsr->segment != tmp->segment) 3792 continue; 3793 if (atsr->header.length != tmp->header.length) 3794 continue; 3795 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3796 return atsru; 3797 } 3798 3799 return NULL; 3800 } 3801 3802 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3803 { 3804 struct acpi_dmar_atsr *atsr; 3805 struct dmar_atsr_unit *atsru; 3806 3807 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3808 return 0; 3809 3810 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3811 atsru = dmar_find_atsr(atsr); 3812 if (atsru) 3813 return 0; 3814 3815 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3816 if (!atsru) 3817 return -ENOMEM; 3818 3819 /* 3820 * If memory is allocated from slab by ACPI _DSM method, we need to 3821 * copy the memory content because the memory buffer will be freed 3822 * on return. 
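 * The copy is placed directly behind the dmar_atsr_unit itself
 * (at atsru + 1), which is why the allocation above reserves
 * sizeof(*atsru) + hdr->length bytes.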
3823 */ 3824 atsru->hdr = (void *)(atsru + 1); 3825 memcpy(atsru->hdr, hdr, hdr->length); 3826 atsru->include_all = atsr->flags & 0x1; 3827 if (!atsru->include_all) { 3828 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3829 (void *)atsr + atsr->header.length, 3830 &atsru->devices_cnt); 3831 if (atsru->devices_cnt && atsru->devices == NULL) { 3832 kfree(atsru); 3833 return -ENOMEM; 3834 } 3835 } 3836 3837 list_add_rcu(&atsru->list, &dmar_atsr_units); 3838 3839 return 0; 3840 } 3841 3842 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3843 { 3844 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3845 kfree(atsru); 3846 } 3847 3848 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3849 { 3850 struct acpi_dmar_atsr *atsr; 3851 struct dmar_atsr_unit *atsru; 3852 3853 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3854 atsru = dmar_find_atsr(atsr); 3855 if (atsru) { 3856 list_del_rcu(&atsru->list); 3857 synchronize_rcu(); 3858 intel_iommu_free_atsr(atsru); 3859 } 3860 3861 return 0; 3862 } 3863 3864 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3865 { 3866 int i; 3867 struct device *dev; 3868 struct acpi_dmar_atsr *atsr; 3869 struct dmar_atsr_unit *atsru; 3870 3871 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3872 atsru = dmar_find_atsr(atsr); 3873 if (!atsru) 3874 return 0; 3875 3876 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3877 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3878 i, dev) 3879 return -EBUSY; 3880 } 3881 3882 return 0; 3883 } 3884 3885 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3886 { 3887 struct dmar_satc_unit *satcu; 3888 struct acpi_dmar_satc *tmp; 3889 3890 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3891 dmar_rcu_check()) { 3892 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3893 if (satc->segment != tmp->segment) 3894 continue; 3895 if (satc->header.length != tmp->header.length) 3896 continue; 3897 if (memcmp(satc, tmp, satc->header.length) == 0) 3898 return satcu; 3899 } 3900 3901 return NULL; 3902 } 3903 3904 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3905 { 3906 struct acpi_dmar_satc *satc; 3907 struct dmar_satc_unit *satcu; 3908 3909 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3910 return 0; 3911 3912 satc = container_of(hdr, struct acpi_dmar_satc, header); 3913 satcu = dmar_find_satc(satc); 3914 if (satcu) 3915 return 0; 3916 3917 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3918 if (!satcu) 3919 return -ENOMEM; 3920 3921 satcu->hdr = (void *)(satcu + 1); 3922 memcpy(satcu->hdr, hdr, hdr->length); 3923 satcu->atc_required = satc->flags & 0x1; 3924 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3925 (void *)satc + satc->header.length, 3926 &satcu->devices_cnt); 3927 if (satcu->devices_cnt && !satcu->devices) { 3928 kfree(satcu); 3929 return -ENOMEM; 3930 } 3931 list_add_rcu(&satcu->list, &dmar_satc_units); 3932 3933 return 0; 3934 } 3935 3936 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3937 { 3938 int sp, ret; 3939 struct intel_iommu *iommu = dmaru->iommu; 3940 3941 if (g_iommus[iommu->seq_id]) 3942 return 0; 3943 3944 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3945 if (ret) 3946 goto out; 3947 3948 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3949 pr_warn("%s: Doesn't support hardware pass through.\n", 3950 iommu->name); 3951 return -ENXIO; 3952 } 3953 if (!ecap_sc_support(iommu->ecap) && 3954 
domain_update_iommu_snooping(iommu)) { 3955 pr_warn("%s: Doesn't support snooping.\n", 3956 iommu->name); 3957 return -ENXIO; 3958 } 3959 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3960 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3961 pr_warn("%s: Doesn't support large page.\n", 3962 iommu->name); 3963 return -ENXIO; 3964 } 3965 3966 /* 3967 * Disable translation if already enabled prior to OS handover. 3968 */ 3969 if (iommu->gcmd & DMA_GCMD_TE) 3970 iommu_disable_translation(iommu); 3971 3972 g_iommus[iommu->seq_id] = iommu; 3973 ret = iommu_init_domains(iommu); 3974 if (ret == 0) 3975 ret = iommu_alloc_root_entry(iommu); 3976 if (ret) 3977 goto out; 3978 3979 intel_svm_check(iommu); 3980 3981 if (dmaru->ignored) { 3982 /* 3983 * we always have to disable PMRs or DMA may fail on this device 3984 */ 3985 if (force_on) 3986 iommu_disable_protect_mem_regions(iommu); 3987 return 0; 3988 } 3989 3990 intel_iommu_init_qi(iommu); 3991 iommu_flush_write_buffer(iommu); 3992 3993 #ifdef CONFIG_INTEL_IOMMU_SVM 3994 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3995 ret = intel_svm_enable_prq(iommu); 3996 if (ret) 3997 goto disable_iommu; 3998 } 3999 #endif 4000 ret = dmar_set_interrupt(iommu); 4001 if (ret) 4002 goto disable_iommu; 4003 4004 iommu_set_root_entry(iommu); 4005 iommu_enable_translation(iommu); 4006 4007 iommu_disable_protect_mem_regions(iommu); 4008 return 0; 4009 4010 disable_iommu: 4011 disable_dmar_iommu(iommu); 4012 out: 4013 free_dmar_iommu(iommu); 4014 return ret; 4015 } 4016 4017 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 4018 { 4019 int ret = 0; 4020 struct intel_iommu *iommu = dmaru->iommu; 4021 4022 if (!intel_iommu_enabled) 4023 return 0; 4024 if (iommu == NULL) 4025 return -EINVAL; 4026 4027 if (insert) { 4028 ret = intel_iommu_add(dmaru); 4029 } else { 4030 disable_dmar_iommu(iommu); 4031 free_dmar_iommu(iommu); 4032 } 4033 4034 return ret; 4035 } 4036 4037 static void intel_iommu_free_dmars(void) 4038 { 4039 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4040 struct dmar_atsr_unit *atsru, *atsr_n; 4041 struct dmar_satc_unit *satcu, *satc_n; 4042 4043 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4044 list_del(&rmrru->list); 4045 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4046 kfree(rmrru); 4047 } 4048 4049 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4050 list_del(&atsru->list); 4051 intel_iommu_free_atsr(atsru); 4052 } 4053 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 4054 list_del(&satcu->list); 4055 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 4056 kfree(satcu); 4057 } 4058 } 4059 4060 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4061 { 4062 int i, ret = 1; 4063 struct pci_bus *bus; 4064 struct pci_dev *bridge = NULL; 4065 struct device *tmp; 4066 struct acpi_dmar_atsr *atsr; 4067 struct dmar_atsr_unit *atsru; 4068 4069 dev = pci_physfn(dev); 4070 for (bus = dev->bus; bus; bus = bus->parent) { 4071 bridge = bus->self; 4072 /* If it's an integrated device, allow ATS */ 4073 if (!bridge) 4074 return 1; 4075 /* Connected via non-PCIe: no ATS */ 4076 if (!pci_is_pcie(bridge) || 4077 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4078 return 0; 4079 /* If we found the root port, look it up in the ATSR */ 4080 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4081 break; 4082 } 4083 4084 rcu_read_lock(); 4085 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4086 atsr = container_of(atsru->hdr, struct 
acpi_dmar_atsr, header); 4087 if (atsr->segment != pci_domain_nr(dev->bus)) 4088 continue; 4089 4090 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4091 if (tmp == &bridge->dev) 4092 goto out; 4093 4094 if (atsru->include_all) 4095 goto out; 4096 } 4097 ret = 0; 4098 out: 4099 rcu_read_unlock(); 4100 4101 return ret; 4102 } 4103 4104 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4105 { 4106 int ret; 4107 struct dmar_rmrr_unit *rmrru; 4108 struct dmar_atsr_unit *atsru; 4109 struct dmar_satc_unit *satcu; 4110 struct acpi_dmar_atsr *atsr; 4111 struct acpi_dmar_reserved_memory *rmrr; 4112 struct acpi_dmar_satc *satc; 4113 4114 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4115 return 0; 4116 4117 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4118 rmrr = container_of(rmrru->hdr, 4119 struct acpi_dmar_reserved_memory, header); 4120 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4121 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4122 ((void *)rmrr) + rmrr->header.length, 4123 rmrr->segment, rmrru->devices, 4124 rmrru->devices_cnt); 4125 if (ret < 0) 4126 return ret; 4127 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4128 dmar_remove_dev_scope(info, rmrr->segment, 4129 rmrru->devices, rmrru->devices_cnt); 4130 } 4131 } 4132 4133 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4134 if (atsru->include_all) 4135 continue; 4136 4137 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4138 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4139 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4140 (void *)atsr + atsr->header.length, 4141 atsr->segment, atsru->devices, 4142 atsru->devices_cnt); 4143 if (ret > 0) 4144 break; 4145 else if (ret < 0) 4146 return ret; 4147 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4148 if (dmar_remove_dev_scope(info, atsr->segment, 4149 atsru->devices, atsru->devices_cnt)) 4150 break; 4151 } 4152 } 4153 list_for_each_entry(satcu, &dmar_satc_units, list) { 4154 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 4155 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4156 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 4157 (void *)satc + satc->header.length, 4158 satc->segment, satcu->devices, 4159 satcu->devices_cnt); 4160 if (ret > 0) 4161 break; 4162 else if (ret < 0) 4163 return ret; 4164 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4165 if (dmar_remove_dev_scope(info, satc->segment, 4166 satcu->devices, satcu->devices_cnt)) 4167 break; 4168 } 4169 } 4170 4171 return 0; 4172 } 4173 4174 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4175 unsigned long val, void *v) 4176 { 4177 struct memory_notify *mhp = v; 4178 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4179 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4180 mhp->nr_pages - 1); 4181 4182 switch (val) { 4183 case MEM_GOING_ONLINE: 4184 if (iommu_domain_identity_map(si_domain, 4185 start_vpfn, last_vpfn)) { 4186 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4187 start_vpfn, last_vpfn); 4188 return NOTIFY_BAD; 4189 } 4190 break; 4191 4192 case MEM_OFFLINE: 4193 case MEM_CANCEL_ONLINE: 4194 { 4195 struct dmar_drhd_unit *drhd; 4196 struct intel_iommu *iommu; 4197 struct page *freelist; 4198 4199 freelist = domain_unmap(si_domain, 4200 start_vpfn, last_vpfn, 4201 NULL); 4202 4203 rcu_read_lock(); 4204 for_each_active_iommu(iommu, drhd) 4205 iommu_flush_iotlb_psi(iommu, si_domain, 4206 start_vpfn, mhp->nr_pages, 4207 !freelist, 0); 4208 
rcu_read_unlock(); 4209 dma_free_pagelist(freelist); 4210 } 4211 break; 4212 } 4213 4214 return NOTIFY_OK; 4215 } 4216 4217 static struct notifier_block intel_iommu_memory_nb = { 4218 .notifier_call = intel_iommu_memory_notifier, 4219 .priority = 0 4220 }; 4221 4222 static void intel_disable_iommus(void) 4223 { 4224 struct intel_iommu *iommu = NULL; 4225 struct dmar_drhd_unit *drhd; 4226 4227 for_each_iommu(iommu, drhd) 4228 iommu_disable_translation(iommu); 4229 } 4230 4231 void intel_iommu_shutdown(void) 4232 { 4233 struct dmar_drhd_unit *drhd; 4234 struct intel_iommu *iommu = NULL; 4235 4236 if (no_iommu || dmar_disabled) 4237 return; 4238 4239 down_write(&dmar_global_lock); 4240 4241 /* Disable PMRs explicitly here. */ 4242 for_each_iommu(iommu, drhd) 4243 iommu_disable_protect_mem_regions(iommu); 4244 4245 /* Make sure the IOMMUs are switched off */ 4246 intel_disable_iommus(); 4247 4248 up_write(&dmar_global_lock); 4249 } 4250 4251 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4252 { 4253 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4254 4255 return container_of(iommu_dev, struct intel_iommu, iommu); 4256 } 4257 4258 static ssize_t version_show(struct device *dev, 4259 struct device_attribute *attr, char *buf) 4260 { 4261 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4262 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4263 return sprintf(buf, "%d:%d\n", 4264 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4265 } 4266 static DEVICE_ATTR_RO(version); 4267 4268 static ssize_t address_show(struct device *dev, 4269 struct device_attribute *attr, char *buf) 4270 { 4271 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4272 return sprintf(buf, "%llx\n", iommu->reg_phys); 4273 } 4274 static DEVICE_ATTR_RO(address); 4275 4276 static ssize_t cap_show(struct device *dev, 4277 struct device_attribute *attr, char *buf) 4278 { 4279 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4280 return sprintf(buf, "%llx\n", iommu->cap); 4281 } 4282 static DEVICE_ATTR_RO(cap); 4283 4284 static ssize_t ecap_show(struct device *dev, 4285 struct device_attribute *attr, char *buf) 4286 { 4287 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4288 return sprintf(buf, "%llx\n", iommu->ecap); 4289 } 4290 static DEVICE_ATTR_RO(ecap); 4291 4292 static ssize_t domains_supported_show(struct device *dev, 4293 struct device_attribute *attr, char *buf) 4294 { 4295 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4296 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4297 } 4298 static DEVICE_ATTR_RO(domains_supported); 4299 4300 static ssize_t domains_used_show(struct device *dev, 4301 struct device_attribute *attr, char *buf) 4302 { 4303 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4304 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4305 cap_ndoms(iommu->cap))); 4306 } 4307 static DEVICE_ATTR_RO(domains_used); 4308 4309 static struct attribute *intel_iommu_attrs[] = { 4310 &dev_attr_version.attr, 4311 &dev_attr_address.attr, 4312 &dev_attr_cap.attr, 4313 &dev_attr_ecap.attr, 4314 &dev_attr_domains_supported.attr, 4315 &dev_attr_domains_used.attr, 4316 NULL, 4317 }; 4318 4319 static struct attribute_group intel_iommu_group = { 4320 .name = "intel-iommu", 4321 .attrs = intel_iommu_attrs, 4322 }; 4323 4324 const struct attribute_group *intel_iommu_groups[] = { 4325 &intel_iommu_group, 4326 NULL, 4327 }; 4328 4329 static inline bool has_external_pci(void) 4330 { 4331 struct pci_dev *pdev = NULL; 4332 4333 for_each_pci_dev(pdev) 4334 if 
(pdev->external_facing) 4335 return true; 4336 4337 return false; 4338 } 4339 4340 static int __init platform_optin_force_iommu(void) 4341 { 4342 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4343 return 0; 4344 4345 if (no_iommu || dmar_disabled) 4346 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4347 4348 /* 4349 * If Intel-IOMMU is disabled by default, we will apply identity 4350 * map for all devices except those marked as being untrusted. 4351 */ 4352 if (dmar_disabled) 4353 iommu_set_default_passthrough(false); 4354 4355 dmar_disabled = 0; 4356 no_iommu = 0; 4357 4358 return 1; 4359 } 4360 4361 static int __init probe_acpi_namespace_devices(void) 4362 { 4363 struct dmar_drhd_unit *drhd; 4364 /* To avoid a -Wunused-but-set-variable warning. */ 4365 struct intel_iommu *iommu __maybe_unused; 4366 struct device *dev; 4367 int i, ret = 0; 4368 4369 for_each_active_iommu(iommu, drhd) { 4370 for_each_active_dev_scope(drhd->devices, 4371 drhd->devices_cnt, i, dev) { 4372 struct acpi_device_physical_node *pn; 4373 struct iommu_group *group; 4374 struct acpi_device *adev; 4375 4376 if (dev->bus != &acpi_bus_type) 4377 continue; 4378 4379 adev = to_acpi_device(dev); 4380 mutex_lock(&adev->physical_node_lock); 4381 list_for_each_entry(pn, 4382 &adev->physical_node_list, node) { 4383 group = iommu_group_get(pn->dev); 4384 if (group) { 4385 iommu_group_put(group); 4386 continue; 4387 } 4388 4389 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4390 ret = iommu_probe_device(pn->dev); 4391 if (ret) 4392 break; 4393 } 4394 mutex_unlock(&adev->physical_node_lock); 4395 4396 if (ret) 4397 return ret; 4398 } 4399 } 4400 4401 return 0; 4402 } 4403 4404 int __init intel_iommu_init(void) 4405 { 4406 int ret = -ENODEV; 4407 struct dmar_drhd_unit *drhd; 4408 struct intel_iommu *iommu; 4409 4410 /* 4411 * Intel IOMMU is required for a TXT/tboot launch or platform 4412 * opt in, so enforce that. 4413 */ 4414 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4415 platform_optin_force_iommu(); 4416 4417 if (iommu_init_mempool()) { 4418 if (force_on) 4419 panic("tboot: Failed to initialize iommu memory\n"); 4420 return -ENOMEM; 4421 } 4422 4423 down_write(&dmar_global_lock); 4424 if (dmar_table_init()) { 4425 if (force_on) 4426 panic("tboot: Failed to initialize DMAR table\n"); 4427 goto out_free_dmar; 4428 } 4429 4430 if (dmar_dev_scope_init() < 0) { 4431 if (force_on) 4432 panic("tboot: Failed to initialize DMAR device scope\n"); 4433 goto out_free_dmar; 4434 } 4435 4436 up_write(&dmar_global_lock); 4437 4438 /* 4439 * The bus notifier takes the dmar_global_lock, so lockdep will 4440 * complain later when we register it under the lock. 4441 */ 4442 dmar_register_bus_notifier(); 4443 4444 down_write(&dmar_global_lock); 4445 4446 if (!no_iommu) 4447 intel_iommu_debugfs_init(); 4448 4449 if (no_iommu || dmar_disabled) { 4450 /* 4451 * We exit the function here to ensure IOMMU's remapping and 4452 * mempool aren't setup, which means that the IOMMU's PMRs 4453 * won't be disabled via the call to init_dmars(). So disable 4454 * it explicitly here. The PMRs were setup by tboot prior to 4455 * calling SENTER, but the kernel is expected to reset/tear 4456 * down the PMRs. 
4457 */ 4458 if (intel_iommu_tboot_noforce) { 4459 for_each_iommu(iommu, drhd) 4460 iommu_disable_protect_mem_regions(iommu); 4461 } 4462 4463 /* 4464 * Make sure the IOMMUs are switched off, even when we 4465 * boot into a kexec kernel and the previous kernel left 4466 * them enabled 4467 */ 4468 intel_disable_iommus(); 4469 goto out_free_dmar; 4470 } 4471 4472 if (list_empty(&dmar_rmrr_units)) 4473 pr_info("No RMRR found\n"); 4474 4475 if (list_empty(&dmar_atsr_units)) 4476 pr_info("No ATSR found\n"); 4477 4478 if (list_empty(&dmar_satc_units)) 4479 pr_info("No SATC found\n"); 4480 4481 if (dmar_map_gfx) 4482 intel_iommu_gfx_mapped = 1; 4483 4484 init_no_remapping_devices(); 4485 4486 ret = init_dmars(); 4487 if (ret) { 4488 if (force_on) 4489 panic("tboot: Failed to initialize DMARs\n"); 4490 pr_err("Initialization failed\n"); 4491 goto out_free_dmar; 4492 } 4493 up_write(&dmar_global_lock); 4494 4495 init_iommu_pm_ops(); 4496 4497 down_read(&dmar_global_lock); 4498 for_each_active_iommu(iommu, drhd) { 4499 /* 4500 * The flush queue implementation does not perform 4501 * page-selective invalidations that are required for efficient 4502 * TLB flushes in virtual environments. The benefit of batching 4503 * is likely to be much lower than the overhead of synchronizing 4504 * the virtual and physical IOMMU page-tables. 4505 */ 4506 if (cap_caching_mode(iommu->cap)) { 4507 pr_info_once("IOMMU batching disallowed due to virtualization\n"); 4508 iommu_set_dma_strict(); 4509 } 4510 iommu_device_sysfs_add(&iommu->iommu, NULL, 4511 intel_iommu_groups, 4512 "%s", iommu->name); 4513 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); 4514 } 4515 up_read(&dmar_global_lock); 4516 4517 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4518 if (si_domain && !hw_pass_through) 4519 register_memory_notifier(&intel_iommu_memory_nb); 4520 4521 down_read(&dmar_global_lock); 4522 if (probe_acpi_namespace_devices()) 4523 pr_warn("ACPI name space devices didn't probe correctly\n"); 4524 4525 /* Finally, we enable the DMA remapping hardware. */ 4526 for_each_iommu(iommu, drhd) { 4527 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4528 iommu_enable_translation(iommu); 4529 4530 iommu_disable_protect_mem_regions(iommu); 4531 } 4532 up_read(&dmar_global_lock); 4533 4534 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4535 4536 intel_iommu_enabled = 1; 4537 4538 return 0; 4539 4540 out_free_dmar: 4541 intel_iommu_free_dmars(); 4542 up_write(&dmar_global_lock); 4543 iommu_exit_mempool(); 4544 return ret; 4545 } 4546 4547 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4548 { 4549 struct device_domain_info *info = opaque; 4550 4551 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); 4552 return 0; 4553 } 4554 4555 /* 4556 * NB - intel-iommu lacks any sort of reference counting for the users of 4557 * dependent devices. If multiple endpoints have intersecting dependent 4558 * devices, unbinding the driver from any one of them will possibly leave 4559 * the others unable to operate. 
4560 */ 4561 static void domain_context_clear(struct device_domain_info *info) 4562 { 4563 if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) 4564 return; 4565 4566 pci_for_each_dma_alias(to_pci_dev(info->dev), 4567 &domain_context_clear_one_cb, info); 4568 } 4569 4570 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4571 { 4572 struct dmar_domain *domain; 4573 struct intel_iommu *iommu; 4574 unsigned long flags; 4575 4576 assert_spin_locked(&device_domain_lock); 4577 4578 if (WARN_ON(!info)) 4579 return; 4580 4581 iommu = info->iommu; 4582 domain = info->domain; 4583 4584 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { 4585 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4586 intel_pasid_tear_down_entry(iommu, info->dev, 4587 PASID_RID2PASID, false); 4588 4589 iommu_disable_dev_iotlb(info); 4590 domain_context_clear(info); 4591 intel_pasid_free_table(info->dev); 4592 } 4593 4594 unlink_domain_info(info); 4595 4596 spin_lock_irqsave(&iommu->lock, flags); 4597 domain_detach_iommu(domain, iommu); 4598 spin_unlock_irqrestore(&iommu->lock, flags); 4599 4600 free_devinfo_mem(info); 4601 } 4602 4603 static void dmar_remove_one_dev_info(struct device *dev) 4604 { 4605 struct device_domain_info *info; 4606 unsigned long flags; 4607 4608 spin_lock_irqsave(&device_domain_lock, flags); 4609 info = get_domain_info(dev); 4610 if (info) 4611 __dmar_remove_one_dev_info(info); 4612 spin_unlock_irqrestore(&device_domain_lock, flags); 4613 } 4614 4615 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4616 { 4617 int adjust_width; 4618 4619 /* calculate AGAW */ 4620 domain->gaw = guest_width; 4621 adjust_width = guestwidth_to_adjustwidth(guest_width); 4622 domain->agaw = width_to_agaw(adjust_width); 4623 4624 domain->iommu_coherency = false; 4625 domain->iommu_snooping = false; 4626 domain->iommu_superpage = 0; 4627 domain->max_addr = 0; 4628 4629 /* always allocate the top pgd */ 4630 domain->pgd = alloc_pgtable_page(domain->nid); 4631 if (!domain->pgd) 4632 return -ENOMEM; 4633 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4634 return 0; 4635 } 4636 4637 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4638 { 4639 struct dmar_domain *dmar_domain; 4640 struct iommu_domain *domain; 4641 4642 switch (type) { 4643 case IOMMU_DOMAIN_DMA: 4644 case IOMMU_DOMAIN_DMA_FQ: 4645 case IOMMU_DOMAIN_UNMANAGED: 4646 dmar_domain = alloc_domain(type); 4647 if (!dmar_domain) { 4648 pr_err("Can't allocate dmar_domain\n"); 4649 return NULL; 4650 } 4651 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4652 pr_err("Domain initialization failed\n"); 4653 domain_exit(dmar_domain); 4654 return NULL; 4655 } 4656 4657 domain = &dmar_domain->domain; 4658 domain->geometry.aperture_start = 0; 4659 domain->geometry.aperture_end = 4660 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4661 domain->geometry.force_aperture = true; 4662 4663 return domain; 4664 case IOMMU_DOMAIN_IDENTITY: 4665 return &si_domain->domain; 4666 default: 4667 return NULL; 4668 } 4669 4670 return NULL; 4671 } 4672 4673 static void intel_iommu_domain_free(struct iommu_domain *domain) 4674 { 4675 if (domain != &si_domain->domain) 4676 domain_exit(to_dmar_domain(domain)); 4677 } 4678 4679 /* 4680 * Check whether a @domain could be attached to the @dev through the 4681 * aux-domain attach/detach APIs. 
4682 */ 4683 static inline bool 4684 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4685 { 4686 struct device_domain_info *info = get_domain_info(dev); 4687 4688 return info && info->auxd_enabled && 4689 domain->type == IOMMU_DOMAIN_UNMANAGED; 4690 } 4691 4692 static inline struct subdev_domain_info * 4693 lookup_subdev_info(struct dmar_domain *domain, struct device *dev) 4694 { 4695 struct subdev_domain_info *sinfo; 4696 4697 if (!list_empty(&domain->subdevices)) { 4698 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 4699 if (sinfo->pdev == dev) 4700 return sinfo; 4701 } 4702 } 4703 4704 return NULL; 4705 } 4706 4707 static int auxiliary_link_device(struct dmar_domain *domain, 4708 struct device *dev) 4709 { 4710 struct device_domain_info *info = get_domain_info(dev); 4711 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4712 4713 assert_spin_locked(&device_domain_lock); 4714 if (WARN_ON(!info)) 4715 return -EINVAL; 4716 4717 if (!sinfo) { 4718 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); 4719 if (!sinfo) 4720 return -ENOMEM; 4721 sinfo->domain = domain; 4722 sinfo->pdev = dev; 4723 list_add(&sinfo->link_phys, &info->subdevices); 4724 list_add(&sinfo->link_domain, &domain->subdevices); 4725 } 4726 4727 return ++sinfo->users; 4728 } 4729 4730 static int auxiliary_unlink_device(struct dmar_domain *domain, 4731 struct device *dev) 4732 { 4733 struct device_domain_info *info = get_domain_info(dev); 4734 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4735 int ret; 4736 4737 assert_spin_locked(&device_domain_lock); 4738 if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) 4739 return -EINVAL; 4740 4741 ret = --sinfo->users; 4742 if (!ret) { 4743 list_del(&sinfo->link_phys); 4744 list_del(&sinfo->link_domain); 4745 kfree(sinfo); 4746 } 4747 4748 return ret; 4749 } 4750 4751 static int aux_domain_add_dev(struct dmar_domain *domain, 4752 struct device *dev) 4753 { 4754 int ret; 4755 unsigned long flags; 4756 struct intel_iommu *iommu; 4757 4758 iommu = device_to_iommu(dev, NULL, NULL); 4759 if (!iommu) 4760 return -ENODEV; 4761 4762 if (domain->default_pasid <= 0) { 4763 u32 pasid; 4764 4765 /* No private data needed for the default pasid */ 4766 pasid = ioasid_alloc(NULL, PASID_MIN, 4767 pci_max_pasids(to_pci_dev(dev)) - 1, 4768 NULL); 4769 if (pasid == INVALID_IOASID) { 4770 pr_err("Can't allocate default pasid\n"); 4771 return -ENODEV; 4772 } 4773 domain->default_pasid = pasid; 4774 } 4775 4776 spin_lock_irqsave(&device_domain_lock, flags); 4777 ret = auxiliary_link_device(domain, dev); 4778 if (ret <= 0) 4779 goto link_failed; 4780 4781 /* 4782 * Subdevices from the same physical device can be attached to the 4783 * same domain. For such cases, only the first subdevice attachment 4784 * needs to go through the full steps in this function. So if ret > 4785 * 1, just goto out. 4786 */ 4787 if (ret > 1) 4788 goto out; 4789 4790 /* 4791 * iommu->lock must be held to attach domain to iommu and setup the 4792 * pasid entry for second level translation. 
4793 */ 4794 spin_lock(&iommu->lock); 4795 ret = domain_attach_iommu(domain, iommu); 4796 if (ret) 4797 goto attach_failed; 4798 4799 /* Setup the PASID entry for mediated devices: */ 4800 if (domain_use_first_level(domain)) 4801 ret = domain_setup_first_level(iommu, domain, dev, 4802 domain->default_pasid); 4803 else 4804 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4805 domain->default_pasid); 4806 if (ret) 4807 goto table_failed; 4808 4809 spin_unlock(&iommu->lock); 4810 out: 4811 spin_unlock_irqrestore(&device_domain_lock, flags); 4812 4813 return 0; 4814 4815 table_failed: 4816 domain_detach_iommu(domain, iommu); 4817 attach_failed: 4818 spin_unlock(&iommu->lock); 4819 auxiliary_unlink_device(domain, dev); 4820 link_failed: 4821 spin_unlock_irqrestore(&device_domain_lock, flags); 4822 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4823 ioasid_put(domain->default_pasid); 4824 4825 return ret; 4826 } 4827 4828 static void aux_domain_remove_dev(struct dmar_domain *domain, 4829 struct device *dev) 4830 { 4831 struct device_domain_info *info; 4832 struct intel_iommu *iommu; 4833 unsigned long flags; 4834 4835 if (!is_aux_domain(dev, &domain->domain)) 4836 return; 4837 4838 spin_lock_irqsave(&device_domain_lock, flags); 4839 info = get_domain_info(dev); 4840 iommu = info->iommu; 4841 4842 if (!auxiliary_unlink_device(domain, dev)) { 4843 spin_lock(&iommu->lock); 4844 intel_pasid_tear_down_entry(iommu, dev, 4845 domain->default_pasid, false); 4846 domain_detach_iommu(domain, iommu); 4847 spin_unlock(&iommu->lock); 4848 } 4849 4850 spin_unlock_irqrestore(&device_domain_lock, flags); 4851 4852 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4853 ioasid_put(domain->default_pasid); 4854 } 4855 4856 static int prepare_domain_attach_device(struct iommu_domain *domain, 4857 struct device *dev) 4858 { 4859 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4860 struct intel_iommu *iommu; 4861 int addr_width; 4862 4863 iommu = device_to_iommu(dev, NULL, NULL); 4864 if (!iommu) 4865 return -ENODEV; 4866 4867 if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && 4868 !ecap_nest(iommu->ecap)) { 4869 dev_err(dev, "%s: iommu not support nested translation\n", 4870 iommu->name); 4871 return -EINVAL; 4872 } 4873 4874 /* check if this iommu agaw is sufficient for max mapped address */ 4875 addr_width = agaw_to_width(iommu->agaw); 4876 if (addr_width > cap_mgaw(iommu->cap)) 4877 addr_width = cap_mgaw(iommu->cap); 4878 4879 if (dmar_domain->max_addr > (1LL << addr_width)) { 4880 dev_err(dev, "%s: iommu width (%d) is not " 4881 "sufficient for the mapped address (%llx)\n", 4882 __func__, addr_width, dmar_domain->max_addr); 4883 return -EFAULT; 4884 } 4885 dmar_domain->gaw = addr_width; 4886 4887 /* 4888 * Knock out extra levels of page tables if necessary 4889 */ 4890 while (iommu->agaw < dmar_domain->agaw) { 4891 struct dma_pte *pte; 4892 4893 pte = dmar_domain->pgd; 4894 if (dma_pte_present(pte)) { 4895 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); 4896 free_pgtable_page(pte); 4897 } 4898 dmar_domain->agaw--; 4899 } 4900 4901 return 0; 4902 } 4903 4904 static int intel_iommu_attach_device(struct iommu_domain *domain, 4905 struct device *dev) 4906 { 4907 int ret; 4908 4909 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4910 device_is_rmrr_locked(dev)) { 4911 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4912 return -EPERM; 4913 } 4914 4915 if (is_aux_domain(dev, domain)) 4916 return -EPERM; 4917 4918 /* normally dev is not mapped */ 4919 if (unlikely(domain_context_mapped(dev))) { 4920 struct dmar_domain *old_domain; 4921 4922 old_domain = find_domain(dev); 4923 if (old_domain) 4924 dmar_remove_one_dev_info(dev); 4925 } 4926 4927 ret = prepare_domain_attach_device(domain, dev); 4928 if (ret) 4929 return ret; 4930 4931 return domain_add_dev_info(to_dmar_domain(domain), dev); 4932 } 4933 4934 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4935 struct device *dev) 4936 { 4937 int ret; 4938 4939 if (!is_aux_domain(dev, domain)) 4940 return -EPERM; 4941 4942 ret = prepare_domain_attach_device(domain, dev); 4943 if (ret) 4944 return ret; 4945 4946 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4947 } 4948 4949 static void intel_iommu_detach_device(struct iommu_domain *domain, 4950 struct device *dev) 4951 { 4952 dmar_remove_one_dev_info(dev); 4953 } 4954 4955 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4956 struct device *dev) 4957 { 4958 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4959 } 4960 4961 #ifdef CONFIG_INTEL_IOMMU_SVM 4962 /* 4963 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4964 * VT-d granularity. Invalidation is typically included in the unmap operation 4965 * as a result of DMA or VFIO unmap. However, for assigned devices guest 4966 * owns the first level page tables. Invalidations of translation caches in the 4967 * guest are trapped and passed down to the host. 4968 * 4969 * vIOMMU in the guest will only expose first level page tables, therefore 4970 * we do not support IOTLB granularity for request without PASID (second level). 4971 * 4972 * For example, to find the VT-d granularity encoding for IOTLB 4973 * type and page selective granularity within PASID: 4974 * X: indexed by iommu cache type 4975 * Y: indexed by enum iommu_inv_granularity 4976 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4977 */ 4978 4979 static const int 4980 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4981 /* 4982 * PASID based IOTLB invalidation: PASID selective (per PASID), 4983 * page selective (address granularity) 4984 */ 4985 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4986 /* PASID based dev TLBs */ 4987 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4988 /* PASID cache */ 4989 {-EINVAL, -EINVAL, -EINVAL} 4990 }; 4991 4992 static inline int to_vtd_granularity(int type, int granu) 4993 { 4994 return inv_type_granu_table[type][granu]; 4995 } 4996 4997 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4998 { 4999 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 5000 5001 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 5002 * IOMMU cache invalidate API passes granu_size in bytes, and number of 5003 * granu size in contiguous memory. 
5004 */ 5005 return order_base_2(nr_pages); 5006 } 5007 5008 static int 5009 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5010 struct iommu_cache_invalidate_info *inv_info) 5011 { 5012 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5013 struct device_domain_info *info; 5014 struct intel_iommu *iommu; 5015 unsigned long flags; 5016 int cache_type; 5017 u8 bus, devfn; 5018 u16 did, sid; 5019 int ret = 0; 5020 u64 size = 0; 5021 5022 if (!inv_info || !dmar_domain) 5023 return -EINVAL; 5024 5025 if (!dev || !dev_is_pci(dev)) 5026 return -ENODEV; 5027 5028 iommu = device_to_iommu(dev, &bus, &devfn); 5029 if (!iommu) 5030 return -ENODEV; 5031 5032 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5033 return -EINVAL; 5034 5035 spin_lock_irqsave(&device_domain_lock, flags); 5036 spin_lock(&iommu->lock); 5037 info = get_domain_info(dev); 5038 if (!info) { 5039 ret = -EINVAL; 5040 goto out_unlock; 5041 } 5042 did = dmar_domain->iommu_did[iommu->seq_id]; 5043 sid = PCI_DEVID(bus, devfn); 5044 5045 /* Size is only valid in address selective invalidation */ 5046 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5047 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 5048 inv_info->granu.addr_info.nb_granules); 5049 5050 for_each_set_bit(cache_type, 5051 (unsigned long *)&inv_info->cache, 5052 IOMMU_CACHE_INV_TYPE_NR) { 5053 int granu = 0; 5054 u64 pasid = 0; 5055 u64 addr = 0; 5056 5057 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5058 if (granu == -EINVAL) { 5059 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5060 cache_type, inv_info->granularity); 5061 break; 5062 } 5063 5064 /* 5065 * PASID is stored in different locations based on the 5066 * granularity. 5067 */ 5068 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5069 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5070 pasid = inv_info->granu.pasid_info.pasid; 5071 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5072 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5073 pasid = inv_info->granu.addr_info.pasid; 5074 5075 switch (BIT(cache_type)) { 5076 case IOMMU_CACHE_INV_TYPE_IOTLB: 5077 /* HW will ignore LSB bits based on address mask */ 5078 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5079 size && 5080 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5081 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5082 inv_info->granu.addr_info.addr, size); 5083 } 5084 5085 /* 5086 * If granu is PASID-selective, address is ignored. 5087 * We use npages = -1 to indicate that. 5088 */ 5089 qi_flush_piotlb(iommu, did, pasid, 5090 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 5091 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5092 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5093 5094 if (!info->ats_enabled) 5095 break; 5096 /* 5097 * Always flush device IOTLB if ATS is enabled. vIOMMU 5098 * in the guest may assume IOTLB flush is inclusive, 5099 * which is more efficient. 5100 */ 5101 fallthrough; 5102 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5103 /* 5104 * PASID based device TLB invalidation does not support 5105 * IOMMU_INV_GRANU_PASID granularity but only supports 5106 * IOMMU_INV_GRANU_ADDR. 5107 * The equivalent of that is we set the size to be the 5108 * entire range of 64 bit. User only provides PASID info 5109 * without address info. So we set addr to 0. 
5110 */ 5111 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5112 size = 64 - VTD_PAGE_SHIFT; 5113 addr = 0; 5114 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5115 addr = inv_info->granu.addr_info.addr; 5116 } 5117 5118 if (info->ats_enabled) 5119 qi_flush_dev_iotlb_pasid(iommu, sid, 5120 info->pfsid, pasid, 5121 info->ats_qdep, addr, 5122 size); 5123 else 5124 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5125 break; 5126 default: 5127 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5128 cache_type); 5129 ret = -EINVAL; 5130 } 5131 } 5132 out_unlock: 5133 spin_unlock(&iommu->lock); 5134 spin_unlock_irqrestore(&device_domain_lock, flags); 5135 5136 return ret; 5137 } 5138 #endif 5139 5140 static int intel_iommu_map(struct iommu_domain *domain, 5141 unsigned long iova, phys_addr_t hpa, 5142 size_t size, int iommu_prot, gfp_t gfp) 5143 { 5144 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5145 u64 max_addr; 5146 int prot = 0; 5147 5148 if (iommu_prot & IOMMU_READ) 5149 prot |= DMA_PTE_READ; 5150 if (iommu_prot & IOMMU_WRITE) 5151 prot |= DMA_PTE_WRITE; 5152 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5153 prot |= DMA_PTE_SNP; 5154 5155 max_addr = iova + size; 5156 if (dmar_domain->max_addr < max_addr) { 5157 u64 end; 5158 5159 /* check if minimum agaw is sufficient for mapped address */ 5160 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5161 if (end < max_addr) { 5162 pr_err("%s: iommu width (%d) is not " 5163 "sufficient for the mapped address (%llx)\n", 5164 __func__, dmar_domain->gaw, max_addr); 5165 return -EFAULT; 5166 } 5167 dmar_domain->max_addr = max_addr; 5168 } 5169 /* Round up size to next multiple of PAGE_SIZE, if it and 5170 the low bits of hpa would take us onto the next page */ 5171 size = aligned_nrpages(hpa, size); 5172 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5173 hpa >> VTD_PAGE_SHIFT, size, prot); 5174 } 5175 5176 static int intel_iommu_map_pages(struct iommu_domain *domain, 5177 unsigned long iova, phys_addr_t paddr, 5178 size_t pgsize, size_t pgcount, 5179 int prot, gfp_t gfp, size_t *mapped) 5180 { 5181 unsigned long pgshift = __ffs(pgsize); 5182 size_t size = pgcount << pgshift; 5183 int ret; 5184 5185 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) 5186 return -EINVAL; 5187 5188 if (!IS_ALIGNED(iova | paddr, pgsize)) 5189 return -EINVAL; 5190 5191 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); 5192 if (!ret && mapped) 5193 *mapped = size; 5194 5195 return ret; 5196 } 5197 5198 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5199 unsigned long iova, size_t size, 5200 struct iommu_iotlb_gather *gather) 5201 { 5202 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5203 unsigned long start_pfn, last_pfn; 5204 int level = 0; 5205 5206 /* Cope with horrid API which requires us to unmap more than the 5207 size argument if it happens to be a large-page mapping. 
*/ 5208 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5209 5210 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5211 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5212 5213 start_pfn = iova >> VTD_PAGE_SHIFT; 5214 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5215 5216 gather->freelist = domain_unmap(dmar_domain, start_pfn, 5217 last_pfn, gather->freelist); 5218 5219 if (dmar_domain->max_addr == iova + size) 5220 dmar_domain->max_addr = iova; 5221 5222 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5223 5224 return size; 5225 } 5226 5227 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, 5228 unsigned long iova, 5229 size_t pgsize, size_t pgcount, 5230 struct iommu_iotlb_gather *gather) 5231 { 5232 unsigned long pgshift = __ffs(pgsize); 5233 size_t size = pgcount << pgshift; 5234 5235 return intel_iommu_unmap(domain, iova, size, gather); 5236 } 5237 5238 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5239 struct iommu_iotlb_gather *gather) 5240 { 5241 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5242 unsigned long iova_pfn = IOVA_PFN(gather->start); 5243 size_t size = gather->end - gather->start; 5244 unsigned long start_pfn; 5245 unsigned long nrpages; 5246 int iommu_id; 5247 5248 nrpages = aligned_nrpages(gather->start, size); 5249 start_pfn = mm_to_dma_pfn(iova_pfn); 5250 5251 for_each_domain_iommu(iommu_id, dmar_domain) 5252 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5253 start_pfn, nrpages, !gather->freelist, 0); 5254 5255 dma_free_pagelist(gather->freelist); 5256 } 5257 5258 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5259 dma_addr_t iova) 5260 { 5261 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5262 struct dma_pte *pte; 5263 int level = 0; 5264 u64 phys = 0; 5265 5266 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5267 if (pte && dma_pte_present(pte)) 5268 phys = dma_pte_addr(pte) + 5269 (iova & (BIT_MASK(level_to_offset_bits(level) + 5270 VTD_PAGE_SHIFT) - 1)); 5271 5272 return phys; 5273 } 5274 5275 static bool intel_iommu_capable(enum iommu_cap cap) 5276 { 5277 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5278 return domain_update_iommu_snooping(NULL); 5279 if (cap == IOMMU_CAP_INTR_REMAP) 5280 return irq_remapping_enabled == 1; 5281 5282 return false; 5283 } 5284 5285 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5286 { 5287 struct intel_iommu *iommu; 5288 5289 iommu = device_to_iommu(dev, NULL, NULL); 5290 if (!iommu) 5291 return ERR_PTR(-ENODEV); 5292 5293 if (translation_pre_enabled(iommu)) 5294 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5295 5296 return &iommu->iommu; 5297 } 5298 5299 static void intel_iommu_release_device(struct device *dev) 5300 { 5301 struct intel_iommu *iommu; 5302 5303 iommu = device_to_iommu(dev, NULL, NULL); 5304 if (!iommu) 5305 return; 5306 5307 dmar_remove_one_dev_info(dev); 5308 5309 set_dma_ops(dev, NULL); 5310 } 5311 5312 static void intel_iommu_probe_finalize(struct device *dev) 5313 { 5314 set_dma_ops(dev, NULL); 5315 iommu_setup_dma_ops(dev, 0, U64_MAX); 5316 } 5317 5318 static void intel_iommu_get_resv_regions(struct device *device, 5319 struct list_head *head) 5320 { 5321 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5322 struct iommu_resv_region *reg; 5323 struct dmar_rmrr_unit *rmrr; 5324 struct device *i_dev; 5325 int i; 5326 5327 down_read(&dmar_global_lock); 5328 for_each_rmrr_units(rmrr) { 5329 for_each_active_dev_scope(rmrr->devices, 
rmrr->devices_cnt, 5330 i, i_dev) { 5331 struct iommu_resv_region *resv; 5332 enum iommu_resv_type type; 5333 size_t length; 5334 5335 if (i_dev != device && 5336 !is_downstream_to_pci_bridge(device, i_dev)) 5337 continue; 5338 5339 length = rmrr->end_address - rmrr->base_address + 1; 5340 5341 type = device_rmrr_is_relaxable(device) ? 5342 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5343 5344 resv = iommu_alloc_resv_region(rmrr->base_address, 5345 length, prot, type); 5346 if (!resv) 5347 break; 5348 5349 list_add_tail(&resv->list, head); 5350 } 5351 } 5352 up_read(&dmar_global_lock); 5353 5354 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5355 if (dev_is_pci(device)) { 5356 struct pci_dev *pdev = to_pci_dev(device); 5357 5358 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5359 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5360 IOMMU_RESV_DIRECT_RELAXABLE); 5361 if (reg) 5362 list_add_tail(®->list, head); 5363 } 5364 } 5365 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5366 5367 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5368 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5369 0, IOMMU_RESV_MSI); 5370 if (!reg) 5371 return; 5372 list_add_tail(®->list, head); 5373 } 5374 5375 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5376 { 5377 struct device_domain_info *info; 5378 struct context_entry *context; 5379 struct dmar_domain *domain; 5380 unsigned long flags; 5381 u64 ctx_lo; 5382 int ret; 5383 5384 domain = find_domain(dev); 5385 if (!domain) 5386 return -EINVAL; 5387 5388 spin_lock_irqsave(&device_domain_lock, flags); 5389 spin_lock(&iommu->lock); 5390 5391 ret = -EINVAL; 5392 info = get_domain_info(dev); 5393 if (!info || !info->pasid_supported) 5394 goto out; 5395 5396 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5397 if (WARN_ON(!context)) 5398 goto out; 5399 5400 ctx_lo = context[0].lo; 5401 5402 if (!(ctx_lo & CONTEXT_PASIDE)) { 5403 ctx_lo |= CONTEXT_PASIDE; 5404 context[0].lo = ctx_lo; 5405 wmb(); 5406 iommu->flush.flush_context(iommu, 5407 domain->iommu_did[iommu->seq_id], 5408 PCI_DEVID(info->bus, info->devfn), 5409 DMA_CCMD_MASK_NOBIT, 5410 DMA_CCMD_DEVICE_INVL); 5411 } 5412 5413 /* Enable PASID support in the device, if it wasn't already */ 5414 if (!info->pasid_enabled) 5415 iommu_enable_dev_iotlb(info); 5416 5417 ret = 0; 5418 5419 out: 5420 spin_unlock(&iommu->lock); 5421 spin_unlock_irqrestore(&device_domain_lock, flags); 5422 5423 return ret; 5424 } 5425 5426 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5427 { 5428 if (dev_is_pci(dev)) 5429 return pci_device_group(dev); 5430 return generic_device_group(dev); 5431 } 5432 5433 static int intel_iommu_enable_auxd(struct device *dev) 5434 { 5435 struct device_domain_info *info; 5436 struct intel_iommu *iommu; 5437 unsigned long flags; 5438 int ret; 5439 5440 iommu = device_to_iommu(dev, NULL, NULL); 5441 if (!iommu || dmar_disabled) 5442 return -EINVAL; 5443 5444 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5445 return -EINVAL; 5446 5447 ret = intel_iommu_enable_pasid(iommu, dev); 5448 if (ret) 5449 return -ENODEV; 5450 5451 spin_lock_irqsave(&device_domain_lock, flags); 5452 info = get_domain_info(dev); 5453 info->auxd_enabled = 1; 5454 spin_unlock_irqrestore(&device_domain_lock, flags); 5455 5456 return 0; 5457 } 5458 5459 static int intel_iommu_disable_auxd(struct device *dev) 5460 { 5461 struct device_domain_info *info; 5462 unsigned long flags; 5463 5464 spin_lock_irqsave(&device_domain_lock, flags); 5465 info = get_domain_info(dev); 5466 
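	/*
	 * info may be NULL if the device was never probed by this driver;
	 * the WARN_ON() below documents that case and the flag update is
	 * skipped.
	 */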
if (!WARN_ON(!info)) 5467 info->auxd_enabled = 0; 5468 spin_unlock_irqrestore(&device_domain_lock, flags); 5469 5470 return 0; 5471 } 5472 5473 static int intel_iommu_enable_sva(struct device *dev) 5474 { 5475 struct device_domain_info *info = get_domain_info(dev); 5476 struct intel_iommu *iommu; 5477 int ret; 5478 5479 if (!info || dmar_disabled) 5480 return -EINVAL; 5481 5482 iommu = info->iommu; 5483 if (!iommu) 5484 return -EINVAL; 5485 5486 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) 5487 return -ENODEV; 5488 5489 if (intel_iommu_enable_pasid(iommu, dev)) 5490 return -ENODEV; 5491 5492 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) 5493 return -EINVAL; 5494 5495 ret = iopf_queue_add_device(iommu->iopf_queue, dev); 5496 if (!ret) 5497 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); 5498 5499 return ret; 5500 } 5501 5502 static int intel_iommu_disable_sva(struct device *dev) 5503 { 5504 struct device_domain_info *info = get_domain_info(dev); 5505 struct intel_iommu *iommu = info->iommu; 5506 int ret; 5507 5508 ret = iommu_unregister_device_fault_handler(dev); 5509 if (!ret) 5510 ret = iopf_queue_remove_device(iommu->iopf_queue, dev); 5511 5512 return ret; 5513 } 5514 5515 static int intel_iommu_enable_iopf(struct device *dev) 5516 { 5517 struct device_domain_info *info = get_domain_info(dev); 5518 5519 if (info && info->pri_supported) 5520 return 0; 5521 5522 return -ENODEV; 5523 } 5524 5525 static int 5526 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5527 { 5528 switch (feat) { 5529 case IOMMU_DEV_FEAT_AUX: 5530 return intel_iommu_enable_auxd(dev); 5531 5532 case IOMMU_DEV_FEAT_IOPF: 5533 return intel_iommu_enable_iopf(dev); 5534 5535 case IOMMU_DEV_FEAT_SVA: 5536 return intel_iommu_enable_sva(dev); 5537 5538 default: 5539 return -ENODEV; 5540 } 5541 } 5542 5543 static int 5544 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5545 { 5546 switch (feat) { 5547 case IOMMU_DEV_FEAT_AUX: 5548 return intel_iommu_disable_auxd(dev); 5549 5550 case IOMMU_DEV_FEAT_IOPF: 5551 return 0; 5552 5553 case IOMMU_DEV_FEAT_SVA: 5554 return intel_iommu_disable_sva(dev); 5555 5556 default: 5557 return -ENODEV; 5558 } 5559 } 5560 5561 static bool 5562 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5563 { 5564 struct device_domain_info *info = get_domain_info(dev); 5565 5566 if (feat == IOMMU_DEV_FEAT_AUX) 5567 return scalable_mode_support() && info && info->auxd_enabled; 5568 5569 return false; 5570 } 5571 5572 static int 5573 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5574 { 5575 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5576 5577 return dmar_domain->default_pasid > 0 ? 
5578 dmar_domain->default_pasid : -EINVAL; 5579 } 5580 5581 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5582 struct device *dev) 5583 { 5584 return attach_deferred(dev); 5585 } 5586 5587 static int 5588 intel_iommu_enable_nesting(struct iommu_domain *domain) 5589 { 5590 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5591 unsigned long flags; 5592 int ret = -ENODEV; 5593 5594 spin_lock_irqsave(&device_domain_lock, flags); 5595 if (list_empty(&dmar_domain->devices)) { 5596 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5597 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5598 ret = 0; 5599 } 5600 spin_unlock_irqrestore(&device_domain_lock, flags); 5601 5602 return ret; 5603 } 5604 5605 /* 5606 * Check that the device does not live on an external facing PCI port that is 5607 * marked as untrusted. Such devices should not be able to apply quirks and 5608 * thus not be able to bypass the IOMMU restrictions. 5609 */ 5610 static bool risky_device(struct pci_dev *pdev) 5611 { 5612 if (pdev->untrusted) { 5613 pci_info(pdev, 5614 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5615 pdev->vendor, pdev->device); 5616 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5617 return true; 5618 } 5619 return false; 5620 } 5621 5622 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 5623 unsigned long iova, size_t size) 5624 { 5625 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5626 unsigned long pages = aligned_nrpages(iova, size); 5627 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 5628 struct intel_iommu *iommu; 5629 int iommu_id; 5630 5631 for_each_domain_iommu(iommu_id, dmar_domain) { 5632 iommu = g_iommus[iommu_id]; 5633 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 5634 } 5635 } 5636 5637 const struct iommu_ops intel_iommu_ops = { 5638 .capable = intel_iommu_capable, 5639 .domain_alloc = intel_iommu_domain_alloc, 5640 .domain_free = intel_iommu_domain_free, 5641 .enable_nesting = intel_iommu_enable_nesting, 5642 .attach_dev = intel_iommu_attach_device, 5643 .detach_dev = intel_iommu_detach_device, 5644 .aux_attach_dev = intel_iommu_aux_attach_device, 5645 .aux_detach_dev = intel_iommu_aux_detach_device, 5646 .aux_get_pasid = intel_iommu_aux_get_pasid, 5647 .map_pages = intel_iommu_map_pages, 5648 .unmap_pages = intel_iommu_unmap_pages, 5649 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 5650 .flush_iotlb_all = intel_flush_iotlb_all, 5651 .iotlb_sync = intel_iommu_tlb_sync, 5652 .iova_to_phys = intel_iommu_iova_to_phys, 5653 .probe_device = intel_iommu_probe_device, 5654 .probe_finalize = intel_iommu_probe_finalize, 5655 .release_device = intel_iommu_release_device, 5656 .get_resv_regions = intel_iommu_get_resv_regions, 5657 .put_resv_regions = generic_iommu_put_resv_regions, 5658 .device_group = intel_iommu_device_group, 5659 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5660 .dev_enable_feat = intel_iommu_dev_enable_feat, 5661 .dev_disable_feat = intel_iommu_dev_disable_feat, 5662 .is_attach_deferred = intel_iommu_is_attach_deferred, 5663 .def_domain_type = device_def_domain_type, 5664 .pgsize_bitmap = SZ_4K, 5665 #ifdef CONFIG_INTEL_IOMMU_SVM 5666 .cache_invalidate = intel_iommu_sva_invalidate, 5667 .sva_bind_gpasid = intel_svm_bind_gpasid, 5668 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5669 .sva_bind = intel_svm_bind, 5670 .sva_unbind = intel_svm_unbind, 5671 .sva_get_pasid = intel_svm_get_pasid, 5672 .page_response = intel_svm_page_response, 5673 #endif 5674 }; 5675 
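/*
 * Illustrative only (not part of the driver): a minimal sketch of how the
 * callbacks in intel_iommu_ops above are reached through the generic IOMMU
 * API by an in-kernel consumer. The device pointer, IOVA and physical
 * address below are hypothetical placeholders.
 */
static int __maybe_unused example_unmanaged_domain_usage(struct device *dev,
							 phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	/* Ends up in intel_iommu_domain_alloc(IOMMU_DOMAIN_UNMANAGED). */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	/* intel_iommu_attach_device() via the ->attach_dev callback. */
	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* intel_iommu_map_pages() via ->map_pages: one 4KiB page at IOVA 0. */
	ret = iommu_map(domain, 0, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ->unmap_pages plus ->iotlb_sync through the gather machinery. */
	iommu_unmap(domain, 0, VTD_PAGE_SIZE);

out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}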
5676 static void quirk_iommu_igfx(struct pci_dev *dev) 5677 { 5678 if (risky_device(dev)) 5679 return; 5680 5681 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5682 dmar_map_gfx = 0; 5683 } 5684 5685 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 5686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5693 5694 /* Broadwell igfx malfunctions with dmar */ 5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5719 5720 static void quirk_iommu_rwbf(struct pci_dev *dev) 5721 { 5722 if (risky_device(dev)) 5723 return; 5724 5725 /* 5726 * Mobile 4 Series Chipset neglects to set RWBF capability, 5727 * but needs it. Same seems to hold for the desktop versions. 
5728 */ 5729 pci_info(dev, "Forcing write-buffer flush capability\n"); 5730 rwbf_quirk = 1; 5731 } 5732 5733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5736 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5737 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5738 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5739 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5740 5741 #define GGC 0x52 5742 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5743 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5744 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5745 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5746 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5747 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5748 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5749 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5750 5751 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5752 { 5753 unsigned short ggc; 5754 5755 if (risky_device(dev)) 5756 return; 5757 5758 if (pci_read_config_word(dev, GGC, &ggc)) 5759 return; 5760 5761 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5762 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5763 dmar_map_gfx = 0; 5764 } else if (dmar_map_gfx) { 5765 /* we have to ensure the gfx device is idle before we flush */ 5766 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5767 iommu_set_dma_strict(); 5768 } 5769 } 5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5774 5775 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5776 { 5777 unsigned short ver; 5778 5779 if (!IS_GFX_DEVICE(dev)) 5780 return; 5781 5782 ver = (dev->device >> 8) & 0xff; 5783 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5784 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5785 ver != 0x9a) 5786 return; 5787 5788 if (risky_device(dev)) 5789 return; 5790 5791 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5792 iommu_skip_te_disable = 1; 5793 } 5794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5795 5796 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5797 ISOCH DMAR unit for the Azalia sound device, but not give it any 5798 TLB entries, which causes it to deadlock. Check for that. We do 5799 this in a function called from init_dmars(), instead of in a PCI 5800 quirk, because we don't want to print the obnoxious "BIOS broken" 5801 message if VT-d is actually disabled. 5802 */ 5803 static void __init check_tylersburg_isoch(void) 5804 { 5805 struct pci_dev *pdev; 5806 uint32_t vtisochctrl; 5807 5808 /* If there's no Azalia in the system anyway, forget it. */ 5809 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5810 if (!pdev) 5811 return; 5812 5813 if (risky_device(pdev)) { 5814 pci_dev_put(pdev); 5815 return; 5816 } 5817 5818 pci_dev_put(pdev); 5819 5820 /* System Management Registers. Might be hidden, in which case 5821 we can't do the sanity check. But that's OK, because the 5822 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5823 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5824 if (!pdev) 5825 return; 5826 5827 if (risky_device(pdev)) { 5828 pci_dev_put(pdev); 5829 return; 5830 } 5831 5832 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5833 pci_dev_put(pdev); 5834 return; 5835 } 5836 5837 pci_dev_put(pdev); 5838 5839 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5840 if (vtisochctrl & 1) 5841 return; 5842 5843 /* Drop all bits other than the number of TLB entries */ 5844 vtisochctrl &= 0x1c; 5845 5846 /* If we have the recommended number of TLB entries (16), fine. */ 5847 if (vtisochctrl == 0x10) 5848 return; 5849 5850 /* Zero TLB entries? You get to ride the short bus to school. */ 5851 if (!vtisochctrl) { 5852 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5853 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5854 dmi_get_system_info(DMI_BIOS_VENDOR), 5855 dmi_get_system_info(DMI_BIOS_VERSION), 5856 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5857 iommu_identity_mapping |= IDENTMAP_AZALIA; 5858 return; 5859 } 5860 5861 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5862 vtisochctrl); 5863 } 5864
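/*
 * Illustrative only: the registration pattern used by the quirks in this
 * file. A new header-stage fixup for a hypothetical (never-matching)
 * device ID 0xffff would be wired up as below; the ID and the forced
 * behaviour are placeholders, not a real erratum handled by this driver.
 */
static void quirk_example_force_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Example quirk: forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0xffff, quirk_example_force_rwbf);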