1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/init.h> 17 #include <linux/bitmap.h> 18 #include <linux/debugfs.h> 19 #include <linux/export.h> 20 #include <linux/slab.h> 21 #include <linux/irq.h> 22 #include <linux/interrupt.h> 23 #include <linux/spinlock.h> 24 #include <linux/pci.h> 25 #include <linux/dmar.h> 26 #include <linux/dma-map-ops.h> 27 #include <linux/mempool.h> 28 #include <linux/memory.h> 29 #include <linux/cpu.h> 30 #include <linux/timer.h> 31 #include <linux/io.h> 32 #include <linux/iova.h> 33 #include <linux/iommu.h> 34 #include <linux/dma-iommu.h> 35 #include <linux/intel-iommu.h> 36 #include <linux/syscore_ops.h> 37 #include <linux/tboot.h> 38 #include <linux/dmi.h> 39 #include <linux/pci-ats.h> 40 #include <linux/memblock.h> 41 #include <linux/dma-map-ops.h> 42 #include <linux/dma-direct.h> 43 #include <linux/crash_dump.h> 44 #include <linux/numa.h> 45 #include <asm/irq_remapping.h> 46 #include <asm/cacheflush.h> 47 #include <asm/iommu.h> 48 #include <trace/events/intel_iommu.h> 49 50 #include "../irq_remapping.h" 51 #include "pasid.h" 52 53 #define ROOT_SIZE VTD_PAGE_SIZE 54 #define CONTEXT_SIZE VTD_PAGE_SIZE 55 56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 60 61 #define IOAPIC_RANGE_START (0xfee00000) 62 #define IOAPIC_RANGE_END (0xfeefffff) 63 #define IOVA_START_ADDR (0x1000) 64 65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 66 67 #define MAX_AGAW_WIDTH 64 68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 69 70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 72 73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 78 79 /* IO virtual address start page frame number */ 80 #define IOVA_START_PFN (1) 81 82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 83 84 /* page table handling */ 85 #define LEVEL_STRIDE (9) 86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 87 88 /* 89 * This bitmap is used to advertise the page sizes our hardware support 90 * to the IOMMU core, which will then use this information to split 91 * physically contiguous memory regions it is mapping into page sizes 92 * that we support. 93 * 94 * Traditionally the IOMMU core just handed us the mappings directly, 95 * after making sure the size is an order of a 4KiB page and that the 96 * mapping has natural alignment. 97 * 98 * To retain this behavior, we currently advertise that we support 99 * all page sizes that are an order of 4KiB. 
100 * 101 * If at some point we'd like to utilize the IOMMU core's new behavior, 102 * we could change this to advertise the real page sizes we support. 103 */ 104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL) 105 106 static inline int agaw_to_level(int agaw) 107 { 108 return agaw + 2; 109 } 110 111 static inline int agaw_to_width(int agaw) 112 { 113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 114 } 115 116 static inline int width_to_agaw(int width) 117 { 118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 119 } 120 121 static inline unsigned int level_to_offset_bits(int level) 122 { 123 return (level - 1) * LEVEL_STRIDE; 124 } 125 126 static inline int pfn_level_offset(u64 pfn, int level) 127 { 128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 129 } 130 131 static inline u64 level_mask(int level) 132 { 133 return -1ULL << level_to_offset_bits(level); 134 } 135 136 static inline u64 level_size(int level) 137 { 138 return 1ULL << level_to_offset_bits(level); 139 } 140 141 static inline u64 align_to_level(u64 pfn, int level) 142 { 143 return (pfn + level_size(level) - 1) & level_mask(level); 144 } 145 146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 147 { 148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 149 } 150 151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 152 are never going to work. */ 153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) 154 { 155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); 156 } 157 158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 159 { 160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 161 } 162 static inline unsigned long page_to_dma_pfn(struct page *pg) 163 { 164 return mm_to_dma_pfn(page_to_pfn(pg)); 165 } 166 static inline unsigned long virt_to_dma_pfn(void *p) 167 { 168 return page_to_dma_pfn(virt_to_page(p)); 169 } 170 171 /* global iommu list, set NULL for ignored DMAR units */ 172 static struct intel_iommu **g_iommus; 173 174 static void __init check_tylersburg_isoch(void); 175 static int rwbf_quirk; 176 177 /* 178 * set to 1 to panic kernel if can't successfully enable VT-d 179 * (used when kernel is launched w/ TXT) 180 */ 181 static int force_on = 0; 182 static int intel_iommu_tboot_noforce; 183 static int no_platform_optin; 184 185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 186 187 /* 188 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 189 * if marked present. 190 */ 191 static phys_addr_t root_entry_lctp(struct root_entry *re) 192 { 193 if (!(re->lo & 1)) 194 return 0; 195 196 return re->lo & VTD_PAGE_MASK; 197 } 198 199 /* 200 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 201 * if marked present. 
202 */ 203 static phys_addr_t root_entry_uctp(struct root_entry *re) 204 { 205 if (!(re->hi & 1)) 206 return 0; 207 208 return re->hi & VTD_PAGE_MASK; 209 } 210 211 static inline void context_clear_pasid_enable(struct context_entry *context) 212 { 213 context->lo &= ~(1ULL << 11); 214 } 215 216 static inline bool context_pasid_enabled(struct context_entry *context) 217 { 218 return !!(context->lo & (1ULL << 11)); 219 } 220 221 static inline void context_set_copied(struct context_entry *context) 222 { 223 context->hi |= (1ull << 3); 224 } 225 226 static inline bool context_copied(struct context_entry *context) 227 { 228 return !!(context->hi & (1ULL << 3)); 229 } 230 231 static inline bool __context_present(struct context_entry *context) 232 { 233 return (context->lo & 1); 234 } 235 236 bool context_present(struct context_entry *context) 237 { 238 return context_pasid_enabled(context) ? 239 __context_present(context) : 240 __context_present(context) && !context_copied(context); 241 } 242 243 static inline void context_set_present(struct context_entry *context) 244 { 245 context->lo |= 1; 246 } 247 248 static inline void context_set_fault_enable(struct context_entry *context) 249 { 250 context->lo &= (((u64)-1) << 2) | 1; 251 } 252 253 static inline void context_set_translation_type(struct context_entry *context, 254 unsigned long value) 255 { 256 context->lo &= (((u64)-1) << 4) | 3; 257 context->lo |= (value & 3) << 2; 258 } 259 260 static inline void context_set_address_root(struct context_entry *context, 261 unsigned long value) 262 { 263 context->lo &= ~VTD_PAGE_MASK; 264 context->lo |= value & VTD_PAGE_MASK; 265 } 266 267 static inline void context_set_address_width(struct context_entry *context, 268 unsigned long value) 269 { 270 context->hi |= value & 7; 271 } 272 273 static inline void context_set_domain_id(struct context_entry *context, 274 unsigned long value) 275 { 276 context->hi |= (value & ((1 << 16) - 1)) << 8; 277 } 278 279 static inline int context_domain_id(struct context_entry *c) 280 { 281 return((c->hi >> 8) & 0xffff); 282 } 283 284 static inline void context_clear_entry(struct context_entry *context) 285 { 286 context->lo = 0; 287 context->hi = 0; 288 } 289 290 /* 291 * This domain is a statically identity mapping domain. 292 * 1. This domain creats a static 1:1 mapping to all usable memory. 293 * 2. It maps to each iommu if successful. 294 * 3. Each iommu mapps to this domain if successful. 
295 */ 296 static struct dmar_domain *si_domain; 297 static int hw_pass_through = 1; 298 299 #define for_each_domain_iommu(idx, domain) \ 300 for (idx = 0; idx < g_num_of_iommus; idx++) \ 301 if (domain->iommu_refcnt[idx]) 302 303 struct dmar_rmrr_unit { 304 struct list_head list; /* list of rmrr units */ 305 struct acpi_dmar_header *hdr; /* ACPI header */ 306 u64 base_address; /* reserved base address*/ 307 u64 end_address; /* reserved end address */ 308 struct dmar_dev_scope *devices; /* target devices */ 309 int devices_cnt; /* target device count */ 310 }; 311 312 struct dmar_atsr_unit { 313 struct list_head list; /* list of ATSR units */ 314 struct acpi_dmar_header *hdr; /* ACPI header */ 315 struct dmar_dev_scope *devices; /* target devices */ 316 int devices_cnt; /* target device count */ 317 u8 include_all:1; /* include all ports */ 318 }; 319 320 static LIST_HEAD(dmar_atsr_units); 321 static LIST_HEAD(dmar_rmrr_units); 322 323 #define for_each_rmrr_units(rmrr) \ 324 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 325 326 /* bitmap for indexing intel_iommus */ 327 static int g_num_of_iommus; 328 329 static void domain_exit(struct dmar_domain *domain); 330 static void domain_remove_dev_info(struct dmar_domain *domain); 331 static void dmar_remove_one_dev_info(struct device *dev); 332 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 333 static int intel_iommu_attach_device(struct iommu_domain *domain, 334 struct device *dev); 335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 336 dma_addr_t iova); 337 338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON 339 int dmar_disabled = 0; 340 #else 341 int dmar_disabled = 1; 342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ 343 344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON 345 int intel_iommu_sm = 1; 346 #else 347 int intel_iommu_sm; 348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ 349 350 int intel_iommu_enabled = 0; 351 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 352 353 static int dmar_map_gfx = 1; 354 static int dmar_forcedac; 355 static int intel_iommu_strict; 356 static int intel_iommu_superpage = 1; 357 static int iommu_identity_mapping; 358 static int iommu_skip_te_disable; 359 360 #define IDENTMAP_GFX 2 361 #define IDENTMAP_AZALIA 4 362 363 int intel_iommu_gfx_mapped; 364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 365 366 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) 367 struct device_domain_info *get_domain_info(struct device *dev) 368 { 369 struct device_domain_info *info; 370 371 if (!dev) 372 return NULL; 373 374 info = dev_iommu_priv_get(dev); 375 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) 376 return NULL; 377 378 return info; 379 } 380 381 DEFINE_SPINLOCK(device_domain_lock); 382 static LIST_HEAD(device_domain_list); 383 384 /* 385 * Iterate over elements in device_domain_list and call the specified 386 * callback @fn against each element. 
387 */ 388 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 389 void *data), void *data) 390 { 391 int ret = 0; 392 unsigned long flags; 393 struct device_domain_info *info; 394 395 spin_lock_irqsave(&device_domain_lock, flags); 396 list_for_each_entry(info, &device_domain_list, global) { 397 ret = fn(info, data); 398 if (ret) { 399 spin_unlock_irqrestore(&device_domain_lock, flags); 400 return ret; 401 } 402 } 403 spin_unlock_irqrestore(&device_domain_lock, flags); 404 405 return 0; 406 } 407 408 const struct iommu_ops intel_iommu_ops; 409 410 static bool translation_pre_enabled(struct intel_iommu *iommu) 411 { 412 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 413 } 414 415 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 416 { 417 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 418 } 419 420 static void init_translation_status(struct intel_iommu *iommu) 421 { 422 u32 gsts; 423 424 gsts = readl(iommu->reg + DMAR_GSTS_REG); 425 if (gsts & DMA_GSTS_TES) 426 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 427 } 428 429 static int __init intel_iommu_setup(char *str) 430 { 431 if (!str) 432 return -EINVAL; 433 while (*str) { 434 if (!strncmp(str, "on", 2)) { 435 dmar_disabled = 0; 436 pr_info("IOMMU enabled\n"); 437 } else if (!strncmp(str, "off", 3)) { 438 dmar_disabled = 1; 439 no_platform_optin = 1; 440 pr_info("IOMMU disabled\n"); 441 } else if (!strncmp(str, "igfx_off", 8)) { 442 dmar_map_gfx = 0; 443 pr_info("Disable GFX device mapping\n"); 444 } else if (!strncmp(str, "forcedac", 8)) { 445 pr_info("Forcing DAC for PCI devices\n"); 446 dmar_forcedac = 1; 447 } else if (!strncmp(str, "strict", 6)) { 448 pr_info("Disable batched IOTLB flush\n"); 449 intel_iommu_strict = 1; 450 } else if (!strncmp(str, "sp_off", 6)) { 451 pr_info("Disable supported super page\n"); 452 intel_iommu_superpage = 0; 453 } else if (!strncmp(str, "sm_on", 5)) { 454 pr_info("Intel-IOMMU: scalable mode supported\n"); 455 intel_iommu_sm = 1; 456 } else if (!strncmp(str, "tboot_noforce", 13)) { 457 pr_info("Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); 458 intel_iommu_tboot_noforce = 1; 459 } 460 461 str += strcspn(str, ","); 462 while (*str == ',') 463 str++; 464 } 465 return 0; 466 } 467 __setup("intel_iommu=", intel_iommu_setup); 468 469 static struct kmem_cache *iommu_domain_cache; 470 static struct kmem_cache *iommu_devinfo_cache; 471 472 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did) 473 { 474 struct dmar_domain **domains; 475 int idx = did >> 8; 476 477 domains = iommu->domains[idx]; 478 if (!domains) 479 return NULL; 480 481 return domains[did & 0xff]; 482 } 483 484 static void set_iommu_domain(struct intel_iommu *iommu, u16 did, 485 struct dmar_domain *domain) 486 { 487 struct dmar_domain **domains; 488 int idx = did >> 8; 489 490 if (!iommu->domains[idx]) { 491 size_t size = 256 * sizeof(struct dmar_domain *); 492 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC); 493 } 494 495 domains = iommu->domains[idx]; 496 if (WARN_ON(!domains)) 497 return; 498 else 499 domains[did & 0xff] = domain; 500 } 501 502 void *alloc_pgtable_page(int node) 503 { 504 struct page *page; 505 void *vaddr = NULL; 506 507 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 508 if (page) 509 vaddr = page_address(page); 510 return vaddr; 511 } 512 513 void free_pgtable_page(void *vaddr) 514 { 515 free_page((unsigned long)vaddr); 516 } 517 518 static inline void *alloc_domain_mem(void) 519 { 520 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); 521 } 522 523 static void free_domain_mem(void *vaddr) 524 { 525 kmem_cache_free(iommu_domain_cache, vaddr); 526 } 527 528 static inline void * alloc_devinfo_mem(void) 529 { 530 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); 531 } 532 533 static inline void free_devinfo_mem(void *vaddr) 534 { 535 kmem_cache_free(iommu_devinfo_cache, vaddr); 536 } 537 538 static inline int domain_type_is_si(struct dmar_domain *domain) 539 { 540 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; 541 } 542 543 static inline bool domain_use_first_level(struct dmar_domain *domain) 544 { 545 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 546 } 547 548 static inline int domain_pfn_supported(struct dmar_domain *domain, 549 unsigned long pfn) 550 { 551 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 552 553 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 554 } 555 556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 557 { 558 unsigned long sagaw; 559 int agaw = -1; 560 561 sagaw = cap_sagaw(iommu->cap); 562 for (agaw = width_to_agaw(max_gaw); 563 agaw >= 0; agaw--) { 564 if (test_bit(agaw, &sagaw)) 565 break; 566 } 567 568 return agaw; 569 } 570 571 /* 572 * Calculate max SAGAW for each iommu. 573 */ 574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 575 { 576 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 577 } 578 579 /* 580 * calculate agaw for each iommu. 581 * "SAGAW" may be different across iommus, use a default agaw, and 582 * get a supported less agaw for iommus that don't support the default agaw. 583 */ 584 int iommu_calculate_agaw(struct intel_iommu *iommu) 585 { 586 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 587 } 588 589 /* This functionin only returns single iommu in a domain */ 590 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 591 { 592 int iommu_id; 593 594 /* si_domain and vm domain should not get here. 
*/ 595 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) 596 return NULL; 597 598 for_each_domain_iommu(iommu_id, domain) 599 break; 600 601 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 602 return NULL; 603 604 return g_iommus[iommu_id]; 605 } 606 607 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 608 { 609 return sm_supported(iommu) ? 610 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 611 } 612 613 static void domain_update_iommu_coherency(struct dmar_domain *domain) 614 { 615 struct dmar_drhd_unit *drhd; 616 struct intel_iommu *iommu; 617 bool found = false; 618 int i; 619 620 domain->iommu_coherency = 1; 621 622 for_each_domain_iommu(i, domain) { 623 found = true; 624 if (!iommu_paging_structure_coherency(g_iommus[i])) { 625 domain->iommu_coherency = 0; 626 break; 627 } 628 } 629 if (found) 630 return; 631 632 /* No hardware attached; use lowest common denominator */ 633 rcu_read_lock(); 634 for_each_active_iommu(iommu, drhd) { 635 if (!iommu_paging_structure_coherency(iommu)) { 636 domain->iommu_coherency = 0; 637 break; 638 } 639 } 640 rcu_read_unlock(); 641 } 642 643 static int domain_update_iommu_snooping(struct intel_iommu *skip) 644 { 645 struct dmar_drhd_unit *drhd; 646 struct intel_iommu *iommu; 647 int ret = 1; 648 649 rcu_read_lock(); 650 for_each_active_iommu(iommu, drhd) { 651 if (iommu != skip) { 652 if (!ecap_sc_support(iommu->ecap)) { 653 ret = 0; 654 break; 655 } 656 } 657 } 658 rcu_read_unlock(); 659 660 return ret; 661 } 662 663 static int domain_update_iommu_superpage(struct dmar_domain *domain, 664 struct intel_iommu *skip) 665 { 666 struct dmar_drhd_unit *drhd; 667 struct intel_iommu *iommu; 668 int mask = 0x3; 669 670 if (!intel_iommu_superpage) { 671 return 0; 672 } 673 674 /* set iommu_superpage to the smallest common denominator */ 675 rcu_read_lock(); 676 for_each_active_iommu(iommu, drhd) { 677 if (iommu != skip) { 678 if (domain && domain_use_first_level(domain)) { 679 if (!cap_fl1gp_support(iommu->cap)) 680 mask = 0x1; 681 } else { 682 mask &= cap_super_page_val(iommu->cap); 683 } 684 685 if (!mask) 686 break; 687 } 688 } 689 rcu_read_unlock(); 690 691 return fls(mask); 692 } 693 694 static int domain_update_device_node(struct dmar_domain *domain) 695 { 696 struct device_domain_info *info; 697 int nid = NUMA_NO_NODE; 698 699 assert_spin_locked(&device_domain_lock); 700 701 if (list_empty(&domain->devices)) 702 return NUMA_NO_NODE; 703 704 list_for_each_entry(info, &domain->devices, link) { 705 if (!info->dev) 706 continue; 707 708 /* 709 * There could possibly be multiple device numa nodes as devices 710 * within the same domain may sit behind different IOMMUs. There 711 * isn't perfect answer in such situation, so we select first 712 * come first served policy. 713 */ 714 nid = dev_to_node(info->dev); 715 if (nid != NUMA_NO_NODE) 716 break; 717 } 718 719 return nid; 720 } 721 722 /* Some capabilities may be different across iommus */ 723 static void domain_update_iommu_cap(struct dmar_domain *domain) 724 { 725 domain_update_iommu_coherency(domain); 726 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 727 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 728 729 /* 730 * If RHSA is missing, we should default to the device numa domain 731 * as fall back. 
732 */ 733 if (domain->nid == NUMA_NO_NODE) 734 domain->nid = domain_update_device_node(domain); 735 736 /* 737 * First-level translation restricts the input-address to a 738 * canonical address (i.e., address bits 63:N have the same 739 * value as address bit [N-1], where N is 48-bits with 4-level 740 * paging and 57-bits with 5-level paging). Hence, skip bit 741 * [N-1]. 742 */ 743 if (domain_use_first_level(domain)) 744 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 745 else 746 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 747 } 748 749 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 750 u8 devfn, int alloc) 751 { 752 struct root_entry *root = &iommu->root_entry[bus]; 753 struct context_entry *context; 754 u64 *entry; 755 756 entry = &root->lo; 757 if (sm_supported(iommu)) { 758 if (devfn >= 0x80) { 759 devfn -= 0x80; 760 entry = &root->hi; 761 } 762 devfn *= 2; 763 } 764 if (*entry & 1) 765 context = phys_to_virt(*entry & VTD_PAGE_MASK); 766 else { 767 unsigned long phy_addr; 768 if (!alloc) 769 return NULL; 770 771 context = alloc_pgtable_page(iommu->node); 772 if (!context) 773 return NULL; 774 775 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 776 phy_addr = virt_to_phys((void *)context); 777 *entry = phy_addr | 1; 778 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 779 } 780 return &context[devfn]; 781 } 782 783 static bool attach_deferred(struct device *dev) 784 { 785 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 786 } 787 788 /** 789 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 790 * sub-hierarchy of a candidate PCI-PCI bridge 791 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 792 * @bridge: the candidate PCI-PCI bridge 793 * 794 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 795 */ 796 static bool 797 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 798 { 799 struct pci_dev *pdev, *pbridge; 800 801 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 802 return false; 803 804 pdev = to_pci_dev(dev); 805 pbridge = to_pci_dev(bridge); 806 807 if (pbridge->subordinate && 808 pbridge->subordinate->number <= pdev->bus->number && 809 pbridge->subordinate->busn_res.end >= pdev->bus->number) 810 return true; 811 812 return false; 813 } 814 815 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 816 { 817 struct dmar_drhd_unit *drhd; 818 u32 vtbar; 819 int rc; 820 821 /* We know that this device on this chipset has its own IOMMU. 822 * If we find it under a different IOMMU, then the BIOS is lying 823 * to us. Hope that the IOMMU for this device is actually 824 * disabled, and it needs no translation... 
825 */ 826 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 827 if (rc) { 828 /* "can't" happen */ 829 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 830 return false; 831 } 832 vtbar &= 0xffff0000; 833 834 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 835 drhd = dmar_find_matched_drhd_unit(pdev); 836 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 837 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 838 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 839 return true; 840 } 841 842 return false; 843 } 844 845 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 846 { 847 if (!iommu || iommu->drhd->ignored) 848 return true; 849 850 if (dev_is_pci(dev)) { 851 struct pci_dev *pdev = to_pci_dev(dev); 852 853 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 854 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 855 quirk_ioat_snb_local_iommu(pdev)) 856 return true; 857 } 858 859 return false; 860 } 861 862 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 863 { 864 struct dmar_drhd_unit *drhd = NULL; 865 struct pci_dev *pdev = NULL; 866 struct intel_iommu *iommu; 867 struct device *tmp; 868 u16 segment = 0; 869 int i; 870 871 if (!dev) 872 return NULL; 873 874 if (dev_is_pci(dev)) { 875 struct pci_dev *pf_pdev; 876 877 pdev = pci_real_dma_dev(to_pci_dev(dev)); 878 879 /* VFs aren't listed in scope tables; we need to look up 880 * the PF instead to find the IOMMU. */ 881 pf_pdev = pci_physfn(pdev); 882 dev = &pf_pdev->dev; 883 segment = pci_domain_nr(pdev->bus); 884 } else if (has_acpi_companion(dev)) 885 dev = &ACPI_COMPANION(dev)->dev; 886 887 rcu_read_lock(); 888 for_each_iommu(iommu, drhd) { 889 if (pdev && segment != drhd->segment) 890 continue; 891 892 for_each_active_dev_scope(drhd->devices, 893 drhd->devices_cnt, i, tmp) { 894 if (tmp == dev) { 895 /* For a VF use its original BDF# not that of the PF 896 * which we used for the IOMMU lookup. Strictly speaking 897 * we could do this for all PCI devices; we only need to 898 * get the BDF# from the scope table for ACPI matches. 
*/ 899 if (pdev && pdev->is_virtfn) 900 goto got_pdev; 901 902 if (bus && devfn) { 903 *bus = drhd->devices[i].bus; 904 *devfn = drhd->devices[i].devfn; 905 } 906 goto out; 907 } 908 909 if (is_downstream_to_pci_bridge(dev, tmp)) 910 goto got_pdev; 911 } 912 913 if (pdev && drhd->include_all) { 914 got_pdev: 915 if (bus && devfn) { 916 *bus = pdev->bus->number; 917 *devfn = pdev->devfn; 918 } 919 goto out; 920 } 921 } 922 iommu = NULL; 923 out: 924 if (iommu_is_dummy(iommu, dev)) 925 iommu = NULL; 926 927 rcu_read_unlock(); 928 929 return iommu; 930 } 931 932 static void domain_flush_cache(struct dmar_domain *domain, 933 void *addr, int size) 934 { 935 if (!domain->iommu_coherency) 936 clflush_cache_range(addr, size); 937 } 938 939 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 940 { 941 struct context_entry *context; 942 int ret = 0; 943 unsigned long flags; 944 945 spin_lock_irqsave(&iommu->lock, flags); 946 context = iommu_context_addr(iommu, bus, devfn, 0); 947 if (context) 948 ret = context_present(context); 949 spin_unlock_irqrestore(&iommu->lock, flags); 950 return ret; 951 } 952 953 static void free_context_table(struct intel_iommu *iommu) 954 { 955 int i; 956 unsigned long flags; 957 struct context_entry *context; 958 959 spin_lock_irqsave(&iommu->lock, flags); 960 if (!iommu->root_entry) { 961 goto out; 962 } 963 for (i = 0; i < ROOT_ENTRY_NR; i++) { 964 context = iommu_context_addr(iommu, i, 0, 0); 965 if (context) 966 free_pgtable_page(context); 967 968 if (!sm_supported(iommu)) 969 continue; 970 971 context = iommu_context_addr(iommu, i, 0x80, 0); 972 if (context) 973 free_pgtable_page(context); 974 975 } 976 free_pgtable_page(iommu->root_entry); 977 iommu->root_entry = NULL; 978 out: 979 spin_unlock_irqrestore(&iommu->lock, flags); 980 } 981 982 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 983 unsigned long pfn, int *target_level) 984 { 985 struct dma_pte *parent, *pte; 986 int level = agaw_to_level(domain->agaw); 987 int offset; 988 989 BUG_ON(!domain->pgd); 990 991 if (!domain_pfn_supported(domain, pfn)) 992 /* Address beyond IOMMU's addressing capabilities. */ 993 return NULL; 994 995 parent = domain->pgd; 996 997 while (1) { 998 void *tmp_page; 999 1000 offset = pfn_level_offset(pfn, level); 1001 pte = &parent[offset]; 1002 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1003 break; 1004 if (level == *target_level) 1005 break; 1006 1007 if (!dma_pte_present(pte)) { 1008 uint64_t pteval; 1009 1010 tmp_page = alloc_pgtable_page(domain->nid); 1011 1012 if (!tmp_page) 1013 return NULL; 1014 1015 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1016 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1017 if (domain_use_first_level(domain)) 1018 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1019 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1020 /* Someone else set it while we were thinking; use theirs. 
*/ 1021 free_pgtable_page(tmp_page); 1022 else 1023 domain_flush_cache(domain, pte, sizeof(*pte)); 1024 } 1025 if (level == 1) 1026 break; 1027 1028 parent = phys_to_virt(dma_pte_addr(pte)); 1029 level--; 1030 } 1031 1032 if (!*target_level) 1033 *target_level = level; 1034 1035 return pte; 1036 } 1037 1038 /* return address's pte at specific level */ 1039 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1040 unsigned long pfn, 1041 int level, int *large_page) 1042 { 1043 struct dma_pte *parent, *pte; 1044 int total = agaw_to_level(domain->agaw); 1045 int offset; 1046 1047 parent = domain->pgd; 1048 while (level <= total) { 1049 offset = pfn_level_offset(pfn, total); 1050 pte = &parent[offset]; 1051 if (level == total) 1052 return pte; 1053 1054 if (!dma_pte_present(pte)) { 1055 *large_page = total; 1056 break; 1057 } 1058 1059 if (dma_pte_superpage(pte)) { 1060 *large_page = total; 1061 return pte; 1062 } 1063 1064 parent = phys_to_virt(dma_pte_addr(pte)); 1065 total--; 1066 } 1067 return NULL; 1068 } 1069 1070 /* clear last level pte, a tlb flush should be followed */ 1071 static void dma_pte_clear_range(struct dmar_domain *domain, 1072 unsigned long start_pfn, 1073 unsigned long last_pfn) 1074 { 1075 unsigned int large_page; 1076 struct dma_pte *first_pte, *pte; 1077 1078 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1079 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1080 BUG_ON(start_pfn > last_pfn); 1081 1082 /* we don't need lock here; nobody else touches the iova range */ 1083 do { 1084 large_page = 1; 1085 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1086 if (!pte) { 1087 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1088 continue; 1089 } 1090 do { 1091 dma_clear_pte(pte); 1092 start_pfn += lvl_to_nr_pages(large_page); 1093 pte++; 1094 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1095 1096 domain_flush_cache(domain, first_pte, 1097 (void *)pte - (void *)first_pte); 1098 1099 } while (start_pfn && start_pfn <= last_pfn); 1100 } 1101 1102 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1103 int retain_level, struct dma_pte *pte, 1104 unsigned long pfn, unsigned long start_pfn, 1105 unsigned long last_pfn) 1106 { 1107 pfn = max(start_pfn, pfn); 1108 pte = &pte[pfn_level_offset(pfn, level)]; 1109 1110 do { 1111 unsigned long level_pfn; 1112 struct dma_pte *level_pte; 1113 1114 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1115 goto next; 1116 1117 level_pfn = pfn & level_mask(level); 1118 level_pte = phys_to_virt(dma_pte_addr(pte)); 1119 1120 if (level > 2) { 1121 dma_pte_free_level(domain, level - 1, retain_level, 1122 level_pte, level_pfn, start_pfn, 1123 last_pfn); 1124 } 1125 1126 /* 1127 * Free the page table if we're below the level we want to 1128 * retain and the range covers the entire table. 1129 */ 1130 if (level < retain_level && !(start_pfn > level_pfn || 1131 last_pfn < level_pfn + level_size(level) - 1)) { 1132 dma_clear_pte(pte); 1133 domain_flush_cache(domain, pte, sizeof(*pte)); 1134 free_pgtable_page(level_pte); 1135 } 1136 next: 1137 pfn += level_size(level); 1138 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1139 } 1140 1141 /* 1142 * clear last level (leaf) ptes and free page table pages below the 1143 * level we wish to keep intact. 
1144 */ 1145 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1146 unsigned long start_pfn, 1147 unsigned long last_pfn, 1148 int retain_level) 1149 { 1150 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1151 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1152 BUG_ON(start_pfn > last_pfn); 1153 1154 dma_pte_clear_range(domain, start_pfn, last_pfn); 1155 1156 /* We don't need lock here; nobody else touches the iova range */ 1157 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1158 domain->pgd, 0, start_pfn, last_pfn); 1159 1160 /* free pgd */ 1161 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1162 free_pgtable_page(domain->pgd); 1163 domain->pgd = NULL; 1164 } 1165 } 1166 1167 /* When a page at a given level is being unlinked from its parent, we don't 1168 need to *modify* it at all. All we need to do is make a list of all the 1169 pages which can be freed just as soon as we've flushed the IOTLB and we 1170 know the hardware page-walk will no longer touch them. 1171 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1172 be freed. */ 1173 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1174 int level, struct dma_pte *pte, 1175 struct page *freelist) 1176 { 1177 struct page *pg; 1178 1179 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1180 pg->freelist = freelist; 1181 freelist = pg; 1182 1183 if (level == 1) 1184 return freelist; 1185 1186 pte = page_address(pg); 1187 do { 1188 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1189 freelist = dma_pte_list_pagetables(domain, level - 1, 1190 pte, freelist); 1191 pte++; 1192 } while (!first_pte_in_page(pte)); 1193 1194 return freelist; 1195 } 1196 1197 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1198 struct dma_pte *pte, unsigned long pfn, 1199 unsigned long start_pfn, 1200 unsigned long last_pfn, 1201 struct page *freelist) 1202 { 1203 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1204 1205 pfn = max(start_pfn, pfn); 1206 pte = &pte[pfn_level_offset(pfn, level)]; 1207 1208 do { 1209 unsigned long level_pfn; 1210 1211 if (!dma_pte_present(pte)) 1212 goto next; 1213 1214 level_pfn = pfn & level_mask(level); 1215 1216 /* If range covers entire pagetable, free it */ 1217 if (start_pfn <= level_pfn && 1218 last_pfn >= level_pfn + level_size(level) - 1) { 1219 /* These suborbinate page tables are going away entirely. Don't 1220 bother to clear them; we're just going to *free* them. */ 1221 if (level > 1 && !dma_pte_superpage(pte)) 1222 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1223 1224 dma_clear_pte(pte); 1225 if (!first_pte) 1226 first_pte = pte; 1227 last_pte = pte; 1228 } else if (level > 1) { 1229 /* Recurse down into a level that isn't *entirely* obsolete */ 1230 freelist = dma_pte_clear_level(domain, level - 1, 1231 phys_to_virt(dma_pte_addr(pte)), 1232 level_pfn, start_pfn, last_pfn, 1233 freelist); 1234 } 1235 next: 1236 pfn += level_size(level); 1237 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1238 1239 if (first_pte) 1240 domain_flush_cache(domain, first_pte, 1241 (void *)++last_pte - (void *)first_pte); 1242 1243 return freelist; 1244 } 1245 1246 /* We can't just free the pages because the IOMMU may still be walking 1247 the page tables, and may have cached the intermediate levels. The 1248 pages can only be freed after the IOTLB flush has been done. 
*/ 1249 static struct page *domain_unmap(struct dmar_domain *domain, 1250 unsigned long start_pfn, 1251 unsigned long last_pfn, 1252 struct page *freelist) 1253 { 1254 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1255 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1256 BUG_ON(start_pfn > last_pfn); 1257 1258 /* we don't need lock here; nobody else touches the iova range */ 1259 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1260 domain->pgd, 0, start_pfn, last_pfn, 1261 freelist); 1262 1263 /* free pgd */ 1264 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1265 struct page *pgd_page = virt_to_page(domain->pgd); 1266 pgd_page->freelist = freelist; 1267 freelist = pgd_page; 1268 1269 domain->pgd = NULL; 1270 } 1271 1272 return freelist; 1273 } 1274 1275 static void dma_free_pagelist(struct page *freelist) 1276 { 1277 struct page *pg; 1278 1279 while ((pg = freelist)) { 1280 freelist = pg->freelist; 1281 free_pgtable_page(page_address(pg)); 1282 } 1283 } 1284 1285 /* iommu handling */ 1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1287 { 1288 struct root_entry *root; 1289 unsigned long flags; 1290 1291 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1292 if (!root) { 1293 pr_err("Allocating root entry for %s failed\n", 1294 iommu->name); 1295 return -ENOMEM; 1296 } 1297 1298 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1299 1300 spin_lock_irqsave(&iommu->lock, flags); 1301 iommu->root_entry = root; 1302 spin_unlock_irqrestore(&iommu->lock, flags); 1303 1304 return 0; 1305 } 1306 1307 static void iommu_set_root_entry(struct intel_iommu *iommu) 1308 { 1309 u64 addr; 1310 u32 sts; 1311 unsigned long flag; 1312 1313 addr = virt_to_phys(iommu->root_entry); 1314 if (sm_supported(iommu)) 1315 addr |= DMA_RTADDR_SMT; 1316 1317 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1318 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1319 1320 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1321 1322 /* Make sure hardware complete it */ 1323 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1324 readl, (sts & DMA_GSTS_RTPS), sts); 1325 1326 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1327 } 1328 1329 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1330 { 1331 u32 val; 1332 unsigned long flag; 1333 1334 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1335 return; 1336 1337 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1338 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1339 1340 /* Make sure hardware complete it */ 1341 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1342 readl, (!(val & DMA_GSTS_WBFS)), val); 1343 1344 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1345 } 1346 1347 /* return value determine if we need a write buffer flush */ 1348 static void __iommu_flush_context(struct intel_iommu *iommu, 1349 u16 did, u16 source_id, u8 function_mask, 1350 u64 type) 1351 { 1352 u64 val = 0; 1353 unsigned long flag; 1354 1355 switch (type) { 1356 case DMA_CCMD_GLOBAL_INVL: 1357 val = DMA_CCMD_GLOBAL_INVL; 1358 break; 1359 case DMA_CCMD_DOMAIN_INVL: 1360 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1361 break; 1362 case DMA_CCMD_DEVICE_INVL: 1363 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1364 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1365 break; 1366 default: 1367 BUG(); 1368 } 1369 val |= DMA_CCMD_ICC; 1370 1371 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1372 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1373 1374 /* Make sure hardware complete it */ 1375 
IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1376 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1377 1378 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1379 } 1380 1381 /* return value determine if we need a write buffer flush */ 1382 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1383 u64 addr, unsigned int size_order, u64 type) 1384 { 1385 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1386 u64 val = 0, val_iva = 0; 1387 unsigned long flag; 1388 1389 switch (type) { 1390 case DMA_TLB_GLOBAL_FLUSH: 1391 /* global flush doesn't need set IVA_REG */ 1392 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1393 break; 1394 case DMA_TLB_DSI_FLUSH: 1395 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1396 break; 1397 case DMA_TLB_PSI_FLUSH: 1398 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1399 /* IH bit is passed in as part of address */ 1400 val_iva = size_order | addr; 1401 break; 1402 default: 1403 BUG(); 1404 } 1405 /* Note: set drain read/write */ 1406 #if 0 1407 /* 1408 * This is probably to be super secure.. Looks like we can 1409 * ignore it without any impact. 1410 */ 1411 if (cap_read_drain(iommu->cap)) 1412 val |= DMA_TLB_READ_DRAIN; 1413 #endif 1414 if (cap_write_drain(iommu->cap)) 1415 val |= DMA_TLB_WRITE_DRAIN; 1416 1417 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1418 /* Note: Only uses first TLB reg currently */ 1419 if (val_iva) 1420 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1421 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1422 1423 /* Make sure hardware complete it */ 1424 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1425 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1426 1427 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1428 1429 /* check IOTLB invalidation granularity */ 1430 if (DMA_TLB_IAIG(val) == 0) 1431 pr_err("Flush IOTLB failed\n"); 1432 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1433 pr_debug("TLB flush request %Lx, actual %Lx\n", 1434 (unsigned long long)DMA_TLB_IIRG(type), 1435 (unsigned long long)DMA_TLB_IAIG(val)); 1436 } 1437 1438 static struct device_domain_info * 1439 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1440 u8 bus, u8 devfn) 1441 { 1442 struct device_domain_info *info; 1443 1444 assert_spin_locked(&device_domain_lock); 1445 1446 if (!iommu->qi) 1447 return NULL; 1448 1449 list_for_each_entry(info, &domain->devices, link) 1450 if (info->iommu == iommu && info->bus == bus && 1451 info->devfn == devfn) { 1452 if (info->ats_supported && info->dev) 1453 return info; 1454 break; 1455 } 1456 1457 return NULL; 1458 } 1459 1460 static void domain_update_iotlb(struct dmar_domain *domain) 1461 { 1462 struct device_domain_info *info; 1463 bool has_iotlb_device = false; 1464 1465 assert_spin_locked(&device_domain_lock); 1466 1467 list_for_each_entry(info, &domain->devices, link) { 1468 struct pci_dev *pdev; 1469 1470 if (!info->dev || !dev_is_pci(info->dev)) 1471 continue; 1472 1473 pdev = to_pci_dev(info->dev); 1474 if (pdev->ats_enabled) { 1475 has_iotlb_device = true; 1476 break; 1477 } 1478 } 1479 1480 domain->has_iotlb_device = has_iotlb_device; 1481 } 1482 1483 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1484 { 1485 struct pci_dev *pdev; 1486 1487 assert_spin_locked(&device_domain_lock); 1488 1489 if (!info || !dev_is_pci(info->dev)) 1490 return; 1491 1492 pdev = to_pci_dev(info->dev); 1493 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1494 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1495 * queue depth at PF 
level. If DIT is not set, PFSID will be treated as 1496 * reserved, which should be set to 0. 1497 */ 1498 if (!ecap_dit(info->iommu->ecap)) 1499 info->pfsid = 0; 1500 else { 1501 struct pci_dev *pf_pdev; 1502 1503 /* pdev will be returned if device is not a vf */ 1504 pf_pdev = pci_physfn(pdev); 1505 info->pfsid = pci_dev_id(pf_pdev); 1506 } 1507 1508 #ifdef CONFIG_INTEL_IOMMU_SVM 1509 /* The PCIe spec, in its wisdom, declares that the behaviour of 1510 the device if you enable PASID support after ATS support is 1511 undefined. So always enable PASID support on devices which 1512 have it, even if we can't yet know if we're ever going to 1513 use it. */ 1514 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1515 info->pasid_enabled = 1; 1516 1517 if (info->pri_supported && 1518 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1519 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) 1520 info->pri_enabled = 1; 1521 #endif 1522 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1523 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1524 info->ats_enabled = 1; 1525 domain_update_iotlb(info->domain); 1526 info->ats_qdep = pci_ats_queue_depth(pdev); 1527 } 1528 } 1529 1530 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1531 { 1532 struct pci_dev *pdev; 1533 1534 assert_spin_locked(&device_domain_lock); 1535 1536 if (!dev_is_pci(info->dev)) 1537 return; 1538 1539 pdev = to_pci_dev(info->dev); 1540 1541 if (info->ats_enabled) { 1542 pci_disable_ats(pdev); 1543 info->ats_enabled = 0; 1544 domain_update_iotlb(info->domain); 1545 } 1546 #ifdef CONFIG_INTEL_IOMMU_SVM 1547 if (info->pri_enabled) { 1548 pci_disable_pri(pdev); 1549 info->pri_enabled = 0; 1550 } 1551 if (info->pasid_enabled) { 1552 pci_disable_pasid(pdev); 1553 info->pasid_enabled = 0; 1554 } 1555 #endif 1556 } 1557 1558 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1559 u64 addr, unsigned mask) 1560 { 1561 u16 sid, qdep; 1562 unsigned long flags; 1563 struct device_domain_info *info; 1564 1565 if (!domain->has_iotlb_device) 1566 return; 1567 1568 spin_lock_irqsave(&device_domain_lock, flags); 1569 list_for_each_entry(info, &domain->devices, link) { 1570 if (!info->ats_enabled) 1571 continue; 1572 1573 sid = info->bus << 8 | info->devfn; 1574 qdep = info->ats_qdep; 1575 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1576 qdep, addr, mask); 1577 } 1578 spin_unlock_irqrestore(&device_domain_lock, flags); 1579 } 1580 1581 static void domain_flush_piotlb(struct intel_iommu *iommu, 1582 struct dmar_domain *domain, 1583 u64 addr, unsigned long npages, bool ih) 1584 { 1585 u16 did = domain->iommu_did[iommu->seq_id]; 1586 1587 if (domain->default_pasid) 1588 qi_flush_piotlb(iommu, did, domain->default_pasid, 1589 addr, npages, ih); 1590 1591 if (!list_empty(&domain->devices)) 1592 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1593 } 1594 1595 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1596 struct dmar_domain *domain, 1597 unsigned long pfn, unsigned int pages, 1598 int ih, int map) 1599 { 1600 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1601 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1602 u16 did = domain->iommu_did[iommu->seq_id]; 1603 1604 BUG_ON(pages == 0); 1605 1606 if (ih) 1607 ih = 1 << 6; 1608 1609 if (domain_use_first_level(domain)) { 1610 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1611 } else { 1612 /* 1613 * Fallback to domain selective flush if no PSI support or 1614 * the size is too 
big. PSI requires page size to be 2 ^ x, 1615 * and the base address is naturally aligned to the size. 1616 */ 1617 if (!cap_pgsel_inv(iommu->cap) || 1618 mask > cap_max_amask_val(iommu->cap)) 1619 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1620 DMA_TLB_DSI_FLUSH); 1621 else 1622 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1623 DMA_TLB_PSI_FLUSH); 1624 } 1625 1626 /* 1627 * In caching mode, changes of pages from non-present to present require 1628 * flush. However, device IOTLB doesn't need to be flushed in this case. 1629 */ 1630 if (!cap_caching_mode(iommu->cap) || !map) 1631 iommu_flush_dev_iotlb(domain, addr, mask); 1632 } 1633 1634 /* Notification for newly created mappings */ 1635 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1636 struct dmar_domain *domain, 1637 unsigned long pfn, unsigned int pages) 1638 { 1639 /* 1640 * It's a non-present to present mapping. Only flush if caching mode 1641 * and second level. 1642 */ 1643 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1644 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1645 else 1646 iommu_flush_write_buffer(iommu); 1647 } 1648 1649 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1650 { 1651 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1652 int idx; 1653 1654 for_each_domain_iommu(idx, dmar_domain) { 1655 struct intel_iommu *iommu = g_iommus[idx]; 1656 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1657 1658 if (domain_use_first_level(dmar_domain)) 1659 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); 1660 else 1661 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1662 DMA_TLB_DSI_FLUSH); 1663 1664 if (!cap_caching_mode(iommu->cap)) 1665 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1666 0, MAX_AGAW_PFN_WIDTH); 1667 } 1668 } 1669 1670 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1671 { 1672 u32 pmen; 1673 unsigned long flags; 1674 1675 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1676 return; 1677 1678 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1679 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1680 pmen &= ~DMA_PMEN_EPM; 1681 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1682 1683 /* wait for the protected region status bit to clear */ 1684 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1685 readl, !(pmen & DMA_PMEN_PRS), pmen); 1686 1687 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1688 } 1689 1690 static void iommu_enable_translation(struct intel_iommu *iommu) 1691 { 1692 u32 sts; 1693 unsigned long flags; 1694 1695 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1696 iommu->gcmd |= DMA_GCMD_TE; 1697 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1698 1699 /* Make sure hardware complete it */ 1700 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1701 readl, (sts & DMA_GSTS_TES), sts); 1702 1703 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1704 } 1705 1706 static void iommu_disable_translation(struct intel_iommu *iommu) 1707 { 1708 u32 sts; 1709 unsigned long flag; 1710 1711 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1712 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1713 return; 1714 1715 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1716 iommu->gcmd &= ~DMA_GCMD_TE; 1717 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1718 1719 /* Make sure hardware complete it */ 1720 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1721 readl, (!(sts & DMA_GSTS_TES)), sts); 1722 1723 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1724 } 1725 1726 static int 
iommu_init_domains(struct intel_iommu *iommu) 1727 { 1728 u32 ndomains, nlongs; 1729 size_t size; 1730 1731 ndomains = cap_ndoms(iommu->cap); 1732 pr_debug("%s: Number of Domains supported <%d>\n", 1733 iommu->name, ndomains); 1734 nlongs = BITS_TO_LONGS(ndomains); 1735 1736 spin_lock_init(&iommu->lock); 1737 1738 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1739 if (!iommu->domain_ids) { 1740 pr_err("%s: Allocating domain id array failed\n", 1741 iommu->name); 1742 return -ENOMEM; 1743 } 1744 1745 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1746 iommu->domains = kzalloc(size, GFP_KERNEL); 1747 1748 if (iommu->domains) { 1749 size = 256 * sizeof(struct dmar_domain *); 1750 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1751 } 1752 1753 if (!iommu->domains || !iommu->domains[0]) { 1754 pr_err("%s: Allocating domain array failed\n", 1755 iommu->name); 1756 kfree(iommu->domain_ids); 1757 kfree(iommu->domains); 1758 iommu->domain_ids = NULL; 1759 iommu->domains = NULL; 1760 return -ENOMEM; 1761 } 1762 1763 /* 1764 * If Caching mode is set, then invalid translations are tagged 1765 * with domain-id 0, hence we need to pre-allocate it. We also 1766 * use domain-id 0 as a marker for non-allocated domain-id, so 1767 * make sure it is not used for a real domain. 1768 */ 1769 set_bit(0, iommu->domain_ids); 1770 1771 /* 1772 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1773 * entry for first-level or pass-through translation modes should 1774 * be programmed with a domain id different from those used for 1775 * second-level or nested translation. We reserve a domain id for 1776 * this purpose. 1777 */ 1778 if (sm_supported(iommu)) 1779 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1780 1781 return 0; 1782 } 1783 1784 static void disable_dmar_iommu(struct intel_iommu *iommu) 1785 { 1786 struct device_domain_info *info, *tmp; 1787 unsigned long flags; 1788 1789 if (!iommu->domains || !iommu->domain_ids) 1790 return; 1791 1792 spin_lock_irqsave(&device_domain_lock, flags); 1793 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1794 if (info->iommu != iommu) 1795 continue; 1796 1797 if (!info->dev || !info->domain) 1798 continue; 1799 1800 __dmar_remove_one_dev_info(info); 1801 } 1802 spin_unlock_irqrestore(&device_domain_lock, flags); 1803 1804 if (iommu->gcmd & DMA_GCMD_TE) 1805 iommu_disable_translation(iommu); 1806 } 1807 1808 static void free_dmar_iommu(struct intel_iommu *iommu) 1809 { 1810 if ((iommu->domains) && (iommu->domain_ids)) { 1811 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1812 int i; 1813 1814 for (i = 0; i < elems; i++) 1815 kfree(iommu->domains[i]); 1816 kfree(iommu->domains); 1817 kfree(iommu->domain_ids); 1818 iommu->domains = NULL; 1819 iommu->domain_ids = NULL; 1820 } 1821 1822 g_iommus[iommu->seq_id] = NULL; 1823 1824 /* free context mapping */ 1825 free_context_table(iommu); 1826 1827 #ifdef CONFIG_INTEL_IOMMU_SVM 1828 if (pasid_supported(iommu)) { 1829 if (ecap_prs(iommu->ecap)) 1830 intel_svm_finish_prq(iommu); 1831 } 1832 if (vccap_pasid(iommu->vccap)) 1833 ioasid_unregister_allocator(&iommu->pasid_allocator); 1834 1835 #endif 1836 } 1837 1838 /* 1839 * Check and return whether first level is used by default for 1840 * DMA translation. 
1841 */ 1842 static bool first_level_by_default(void) 1843 { 1844 struct dmar_drhd_unit *drhd; 1845 struct intel_iommu *iommu; 1846 static int first_level_support = -1; 1847 1848 if (likely(first_level_support != -1)) 1849 return first_level_support; 1850 1851 first_level_support = 1; 1852 1853 rcu_read_lock(); 1854 for_each_active_iommu(iommu, drhd) { 1855 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { 1856 first_level_support = 0; 1857 break; 1858 } 1859 } 1860 rcu_read_unlock(); 1861 1862 return first_level_support; 1863 } 1864 1865 static struct dmar_domain *alloc_domain(int flags) 1866 { 1867 struct dmar_domain *domain; 1868 1869 domain = alloc_domain_mem(); 1870 if (!domain) 1871 return NULL; 1872 1873 memset(domain, 0, sizeof(*domain)); 1874 domain->nid = NUMA_NO_NODE; 1875 domain->flags = flags; 1876 if (first_level_by_default()) 1877 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1878 domain->has_iotlb_device = false; 1879 INIT_LIST_HEAD(&domain->devices); 1880 1881 return domain; 1882 } 1883 1884 /* Must be called with iommu->lock */ 1885 static int domain_attach_iommu(struct dmar_domain *domain, 1886 struct intel_iommu *iommu) 1887 { 1888 unsigned long ndomains; 1889 int num; 1890 1891 assert_spin_locked(&device_domain_lock); 1892 assert_spin_locked(&iommu->lock); 1893 1894 domain->iommu_refcnt[iommu->seq_id] += 1; 1895 domain->iommu_count += 1; 1896 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1897 ndomains = cap_ndoms(iommu->cap); 1898 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1899 1900 if (num >= ndomains) { 1901 pr_err("%s: No free domain ids\n", iommu->name); 1902 domain->iommu_refcnt[iommu->seq_id] -= 1; 1903 domain->iommu_count -= 1; 1904 return -ENOSPC; 1905 } 1906 1907 set_bit(num, iommu->domain_ids); 1908 set_iommu_domain(iommu, num, domain); 1909 1910 domain->iommu_did[iommu->seq_id] = num; 1911 domain->nid = iommu->node; 1912 1913 domain_update_iommu_cap(domain); 1914 } 1915 1916 return 0; 1917 } 1918 1919 static int domain_detach_iommu(struct dmar_domain *domain, 1920 struct intel_iommu *iommu) 1921 { 1922 int num, count; 1923 1924 assert_spin_locked(&device_domain_lock); 1925 assert_spin_locked(&iommu->lock); 1926 1927 domain->iommu_refcnt[iommu->seq_id] -= 1; 1928 count = --domain->iommu_count; 1929 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1930 num = domain->iommu_did[iommu->seq_id]; 1931 clear_bit(num, iommu->domain_ids); 1932 set_iommu_domain(iommu, num, NULL); 1933 1934 domain_update_iommu_cap(domain); 1935 domain->iommu_did[iommu->seq_id] = 0; 1936 } 1937 1938 return count; 1939 } 1940 1941 static inline int guestwidth_to_adjustwidth(int gaw) 1942 { 1943 int agaw; 1944 int r = (gaw - 12) % 9; 1945 1946 if (r == 0) 1947 agaw = gaw; 1948 else 1949 agaw = gaw + 9 - r; 1950 if (agaw > 64) 1951 agaw = 64; 1952 return agaw; 1953 } 1954 1955 static void domain_exit(struct dmar_domain *domain) 1956 { 1957 1958 /* Remove associated devices and clear attached or cached domains */ 1959 domain_remove_dev_info(domain); 1960 1961 /* destroy iovas */ 1962 if (domain->domain.type == IOMMU_DOMAIN_DMA) 1963 iommu_put_dma_cookie(&domain->domain); 1964 1965 if (domain->pgd) { 1966 struct page *freelist; 1967 1968 freelist = domain_unmap(domain, 0, 1969 DOMAIN_MAX_PFN(domain->gaw), NULL); 1970 dma_free_pagelist(freelist); 1971 } 1972 1973 free_domain_mem(domain); 1974 } 1975 1976 /* 1977 * Get the PASID directory size for scalable mode context entry. 
1978 * Value of X in the PDTS field of a scalable mode context entry 1979 * indicates PASID directory with 2^(X + 7) entries. 1980 */ 1981 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1982 { 1983 int pds, max_pde; 1984 1985 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1986 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 1987 if (pds < 7) 1988 return 0; 1989 1990 return pds - 7; 1991 } 1992 1993 /* 1994 * Set the RID_PASID field of a scalable mode context entry. The 1995 * IOMMU hardware will use the PASID value set in this field for 1996 * DMA translations of DMA requests without PASID. 1997 */ 1998 static inline void 1999 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2000 { 2001 context->hi |= pasid & ((1 << 20) - 1); 2002 } 2003 2004 /* 2005 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2006 * entry. 2007 */ 2008 static inline void context_set_sm_dte(struct context_entry *context) 2009 { 2010 context->lo |= (1 << 2); 2011 } 2012 2013 /* 2014 * Set the PRE(Page Request Enable) field of a scalable mode context 2015 * entry. 2016 */ 2017 static inline void context_set_sm_pre(struct context_entry *context) 2018 { 2019 context->lo |= (1 << 4); 2020 } 2021 2022 /* Convert value to context PASID directory size field coding. */ 2023 #define context_pdts(pds) (((pds) & 0x7) << 9) 2024 2025 static int domain_context_mapping_one(struct dmar_domain *domain, 2026 struct intel_iommu *iommu, 2027 struct pasid_table *table, 2028 u8 bus, u8 devfn) 2029 { 2030 u16 did = domain->iommu_did[iommu->seq_id]; 2031 int translation = CONTEXT_TT_MULTI_LEVEL; 2032 struct device_domain_info *info = NULL; 2033 struct context_entry *context; 2034 unsigned long flags; 2035 int ret; 2036 2037 WARN_ON(did == 0); 2038 2039 if (hw_pass_through && domain_type_is_si(domain)) 2040 translation = CONTEXT_TT_PASS_THROUGH; 2041 2042 pr_debug("Set context mapping for %02x:%02x.%d\n", 2043 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2044 2045 BUG_ON(!domain->pgd); 2046 2047 spin_lock_irqsave(&device_domain_lock, flags); 2048 spin_lock(&iommu->lock); 2049 2050 ret = -ENOMEM; 2051 context = iommu_context_addr(iommu, bus, devfn, 1); 2052 if (!context) 2053 goto out_unlock; 2054 2055 ret = 0; 2056 if (context_present(context)) 2057 goto out_unlock; 2058 2059 /* 2060 * For kdump cases, old valid entries may be cached due to the 2061 * in-flight DMA and copied pgtable, but there is no unmapping 2062 * behaviour for them, thus we need an explicit cache flush for 2063 * the newly-mapped device. For kdump, at this point, the device 2064 * is supposed to finish reset at its driver probe stage, so no 2065 * in-flight DMA will exist, and we don't need to worry anymore 2066 * hereafter. 
2067 */ 2068 if (context_copied(context)) { 2069 u16 did_old = context_domain_id(context); 2070 2071 if (did_old < cap_ndoms(iommu->cap)) { 2072 iommu->flush.flush_context(iommu, did_old, 2073 (((u16)bus) << 8) | devfn, 2074 DMA_CCMD_MASK_NOBIT, 2075 DMA_CCMD_DEVICE_INVL); 2076 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2077 DMA_TLB_DSI_FLUSH); 2078 } 2079 } 2080 2081 context_clear_entry(context); 2082 2083 if (sm_supported(iommu)) { 2084 unsigned long pds; 2085 2086 WARN_ON(!table); 2087 2088 /* Setup the PASID DIR pointer: */ 2089 pds = context_get_sm_pds(table); 2090 context->lo = (u64)virt_to_phys(table->table) | 2091 context_pdts(pds); 2092 2093 /* Setup the RID_PASID field: */ 2094 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2095 2096 /* 2097 * Setup the Device-TLB enable bit and Page request 2098 * Enable bit: 2099 */ 2100 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2101 if (info && info->ats_supported) 2102 context_set_sm_dte(context); 2103 if (info && info->pri_supported) 2104 context_set_sm_pre(context); 2105 } else { 2106 struct dma_pte *pgd = domain->pgd; 2107 int agaw; 2108 2109 context_set_domain_id(context, did); 2110 2111 if (translation != CONTEXT_TT_PASS_THROUGH) { 2112 /* 2113 * Skip top levels of page tables for iommu which has 2114 * less agaw than default. Unnecessary for PT mode. 2115 */ 2116 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2117 ret = -ENOMEM; 2118 pgd = phys_to_virt(dma_pte_addr(pgd)); 2119 if (!dma_pte_present(pgd)) 2120 goto out_unlock; 2121 } 2122 2123 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2124 if (info && info->ats_supported) 2125 translation = CONTEXT_TT_DEV_IOTLB; 2126 else 2127 translation = CONTEXT_TT_MULTI_LEVEL; 2128 2129 context_set_address_root(context, virt_to_phys(pgd)); 2130 context_set_address_width(context, agaw); 2131 } else { 2132 /* 2133 * In pass through mode, AW must be programmed to 2134 * indicate the largest AGAW value supported by 2135 * hardware. And ASR is ignored by hardware. 2136 */ 2137 context_set_address_width(context, iommu->msagaw); 2138 } 2139 2140 context_set_translation_type(context, translation); 2141 } 2142 2143 context_set_fault_enable(context); 2144 context_set_present(context); 2145 if (!ecap_coherent(iommu->ecap)) 2146 clflush_cache_range(context, sizeof(*context)); 2147 2148 /* 2149 * It's a non-present to present mapping. If hardware doesn't cache 2150 * non-present entry we only need to flush the write-buffer. 
If the 2151 * _does_ cache non-present entries, then it does so in the special 2152 * domain #0, which we have to flush: 2153 */ 2154 if (cap_caching_mode(iommu->cap)) { 2155 iommu->flush.flush_context(iommu, 0, 2156 (((u16)bus) << 8) | devfn, 2157 DMA_CCMD_MASK_NOBIT, 2158 DMA_CCMD_DEVICE_INVL); 2159 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2160 } else { 2161 iommu_flush_write_buffer(iommu); 2162 } 2163 iommu_enable_dev_iotlb(info); 2164 2165 ret = 0; 2166 2167 out_unlock: 2168 spin_unlock(&iommu->lock); 2169 spin_unlock_irqrestore(&device_domain_lock, flags); 2170 2171 return ret; 2172 } 2173 2174 struct domain_context_mapping_data { 2175 struct dmar_domain *domain; 2176 struct intel_iommu *iommu; 2177 struct pasid_table *table; 2178 }; 2179 2180 static int domain_context_mapping_cb(struct pci_dev *pdev, 2181 u16 alias, void *opaque) 2182 { 2183 struct domain_context_mapping_data *data = opaque; 2184 2185 return domain_context_mapping_one(data->domain, data->iommu, 2186 data->table, PCI_BUS_NUM(alias), 2187 alias & 0xff); 2188 } 2189 2190 static int 2191 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2192 { 2193 struct domain_context_mapping_data data; 2194 struct pasid_table *table; 2195 struct intel_iommu *iommu; 2196 u8 bus, devfn; 2197 2198 iommu = device_to_iommu(dev, &bus, &devfn); 2199 if (!iommu) 2200 return -ENODEV; 2201 2202 table = intel_pasid_get_table(dev); 2203 2204 if (!dev_is_pci(dev)) 2205 return domain_context_mapping_one(domain, iommu, table, 2206 bus, devfn); 2207 2208 data.domain = domain; 2209 data.iommu = iommu; 2210 data.table = table; 2211 2212 return pci_for_each_dma_alias(to_pci_dev(dev), 2213 &domain_context_mapping_cb, &data); 2214 } 2215 2216 static int domain_context_mapped_cb(struct pci_dev *pdev, 2217 u16 alias, void *opaque) 2218 { 2219 struct intel_iommu *iommu = opaque; 2220 2221 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2222 } 2223 2224 static int domain_context_mapped(struct device *dev) 2225 { 2226 struct intel_iommu *iommu; 2227 u8 bus, devfn; 2228 2229 iommu = device_to_iommu(dev, &bus, &devfn); 2230 if (!iommu) 2231 return -ENODEV; 2232 2233 if (!dev_is_pci(dev)) 2234 return device_context_mapped(iommu, bus, devfn); 2235 2236 return !pci_for_each_dma_alias(to_pci_dev(dev), 2237 domain_context_mapped_cb, iommu); 2238 } 2239 2240 /* Returns a number of VTD pages, but aligned to MM page size */ 2241 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2242 size_t size) 2243 { 2244 host_addr &= ~PAGE_MASK; 2245 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2246 } 2247 2248 /* Return largest possible superpage level for a given mapping */ 2249 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2250 unsigned long iov_pfn, 2251 unsigned long phy_pfn, 2252 unsigned long pages) 2253 { 2254 int support, level = 1; 2255 unsigned long pfnmerge; 2256 2257 support = domain->iommu_superpage; 2258 2259 /* To use a large page, the virtual *and* physical addresses 2260 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2261 of them will mean we have to use smaller pages. So just 2262 merge them and check both at once. 
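   For example (illustrative values): iov_pfn 0x200 and phy_pfn 0x600
   have all nine low stride bits clear when merged, so a run of at
   least 512 pages may use a level-2 (2MiB) superpage, whereas an
   iov_pfn of 0x201 forces plain 4KiB pages.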
*/ 2263 pfnmerge = iov_pfn | phy_pfn; 2264 2265 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2266 pages >>= VTD_STRIDE_SHIFT; 2267 if (!pages) 2268 break; 2269 pfnmerge >>= VTD_STRIDE_SHIFT; 2270 level++; 2271 support--; 2272 } 2273 return level; 2274 } 2275 2276 static int 2277 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2278 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2279 { 2280 struct dma_pte *first_pte = NULL, *pte = NULL; 2281 unsigned int largepage_lvl = 0; 2282 unsigned long lvl_pages = 0; 2283 phys_addr_t pteval; 2284 u64 attr; 2285 2286 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2287 2288 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2289 return -EINVAL; 2290 2291 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2292 if (domain_use_first_level(domain)) 2293 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2294 2295 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2296 2297 while (nr_pages > 0) { 2298 uint64_t tmp; 2299 2300 if (!pte) { 2301 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2302 phys_pfn, nr_pages); 2303 2304 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2305 if (!pte) 2306 return -ENOMEM; 2307 /* It is large page*/ 2308 if (largepage_lvl > 1) { 2309 unsigned long nr_superpages, end_pfn; 2310 2311 pteval |= DMA_PTE_LARGE_PAGE; 2312 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2313 2314 nr_superpages = nr_pages / lvl_pages; 2315 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2316 2317 /* 2318 * Ensure that old small page tables are 2319 * removed to make room for superpage(s). 2320 * We're adding new large pages, so make sure 2321 * we don't remove their parent tables. 2322 */ 2323 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2324 largepage_lvl + 1); 2325 } else { 2326 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2327 } 2328 2329 } 2330 /* We don't need lock here, nobody else 2331 * touches the iova range 2332 */ 2333 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2334 if (tmp) { 2335 static int dumps = 5; 2336 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2337 iov_pfn, tmp, (unsigned long long)pteval); 2338 if (dumps) { 2339 dumps--; 2340 debug_dma_dump_mappings(NULL); 2341 } 2342 WARN_ON(1); 2343 } 2344 2345 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2346 2347 BUG_ON(nr_pages < lvl_pages); 2348 2349 nr_pages -= lvl_pages; 2350 iov_pfn += lvl_pages; 2351 phys_pfn += lvl_pages; 2352 pteval += lvl_pages * VTD_PAGE_SIZE; 2353 2354 /* If the next PTE would be the first in a new page, then we 2355 * need to flush the cache on the entries we've just written. 2356 * And then we'll need to recalculate 'pte', so clear it and 2357 * let it get set again in the if (!pte) block above. 2358 * 2359 * If we're done (!nr_pages) we need to flush the cache too. 2360 * 2361 * Also if we've been setting superpages, we may need to 2362 * recalculate 'pte' and switch back to smaller pages for the 2363 * end of the mapping, if the trailing size is not enough to 2364 * use another superpage (i.e. nr_pages < lvl_pages). 
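 *
 * For example, mapping 513 pages at a 2MiB-aligned IOVA and physical
 * address (assuming 2MiB superpage support) writes one 2MiB PTE,
 * flushes, and then falls back to a single 4KiB PTE for the last page.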
2365 */ 2366 pte++; 2367 if (!nr_pages || first_pte_in_page(pte) || 2368 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2369 domain_flush_cache(domain, first_pte, 2370 (void *)pte - (void *)first_pte); 2371 pte = NULL; 2372 } 2373 } 2374 2375 return 0; 2376 } 2377 2378 static int 2379 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2380 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2381 { 2382 int iommu_id, ret; 2383 struct intel_iommu *iommu; 2384 2385 /* Do the real mapping first */ 2386 ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot); 2387 if (ret) 2388 return ret; 2389 2390 for_each_domain_iommu(iommu_id, domain) { 2391 iommu = g_iommus[iommu_id]; 2392 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2393 } 2394 2395 return 0; 2396 } 2397 2398 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2399 { 2400 unsigned long flags; 2401 struct context_entry *context; 2402 u16 did_old; 2403 2404 if (!iommu) 2405 return; 2406 2407 spin_lock_irqsave(&iommu->lock, flags); 2408 context = iommu_context_addr(iommu, bus, devfn, 0); 2409 if (!context) { 2410 spin_unlock_irqrestore(&iommu->lock, flags); 2411 return; 2412 } 2413 did_old = context_domain_id(context); 2414 context_clear_entry(context); 2415 __iommu_flush_cache(iommu, context, sizeof(*context)); 2416 spin_unlock_irqrestore(&iommu->lock, flags); 2417 iommu->flush.flush_context(iommu, 2418 did_old, 2419 (((u16)bus) << 8) | devfn, 2420 DMA_CCMD_MASK_NOBIT, 2421 DMA_CCMD_DEVICE_INVL); 2422 iommu->flush.flush_iotlb(iommu, 2423 did_old, 2424 0, 2425 0, 2426 DMA_TLB_DSI_FLUSH); 2427 } 2428 2429 static inline void unlink_domain_info(struct device_domain_info *info) 2430 { 2431 assert_spin_locked(&device_domain_lock); 2432 list_del(&info->link); 2433 list_del(&info->global); 2434 if (info->dev) 2435 dev_iommu_priv_set(info->dev, NULL); 2436 } 2437 2438 static void domain_remove_dev_info(struct dmar_domain *domain) 2439 { 2440 struct device_domain_info *info, *tmp; 2441 unsigned long flags; 2442 2443 spin_lock_irqsave(&device_domain_lock, flags); 2444 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2445 __dmar_remove_one_dev_info(info); 2446 spin_unlock_irqrestore(&device_domain_lock, flags); 2447 } 2448 2449 struct dmar_domain *find_domain(struct device *dev) 2450 { 2451 struct device_domain_info *info; 2452 2453 if (unlikely(!dev || !dev->iommu)) 2454 return NULL; 2455 2456 if (unlikely(attach_deferred(dev))) 2457 return NULL; 2458 2459 /* No lock here, assumes no domain exit in normal case */ 2460 info = get_domain_info(dev); 2461 if (likely(info)) 2462 return info->domain; 2463 2464 return NULL; 2465 } 2466 2467 static inline struct device_domain_info * 2468 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2469 { 2470 struct device_domain_info *info; 2471 2472 list_for_each_entry(info, &device_domain_list, global) 2473 if (info->segment == segment && info->bus == bus && 2474 info->devfn == devfn) 2475 return info; 2476 2477 return NULL; 2478 } 2479 2480 static int domain_setup_first_level(struct intel_iommu *iommu, 2481 struct dmar_domain *domain, 2482 struct device *dev, 2483 u32 pasid) 2484 { 2485 int flags = PASID_FLAG_SUPERVISOR_MODE; 2486 struct dma_pte *pgd = domain->pgd; 2487 int agaw, level; 2488 2489 /* 2490 * Skip top levels of page tables for iommu which has 2491 * less agaw than default. Unnecessary for PT mode. 
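 *
 * For example, a domain built with a 5-level table (agaw 3) attached
 * to an IOMMU that only supports agaw 2 descends one level here, so a
 * 4-level table is handed to the hardware.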
2492 */ 2493 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2494 pgd = phys_to_virt(dma_pte_addr(pgd)); 2495 if (!dma_pte_present(pgd)) 2496 return -ENOMEM; 2497 } 2498 2499 level = agaw_to_level(agaw); 2500 if (level != 4 && level != 5) 2501 return -EINVAL; 2502 2503 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; 2504 2505 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2506 domain->iommu_did[iommu->seq_id], 2507 flags); 2508 } 2509 2510 static bool dev_is_real_dma_subdevice(struct device *dev) 2511 { 2512 return dev && dev_is_pci(dev) && 2513 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2514 } 2515 2516 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2517 int bus, int devfn, 2518 struct device *dev, 2519 struct dmar_domain *domain) 2520 { 2521 struct dmar_domain *found = NULL; 2522 struct device_domain_info *info; 2523 unsigned long flags; 2524 int ret; 2525 2526 info = alloc_devinfo_mem(); 2527 if (!info) 2528 return NULL; 2529 2530 if (!dev_is_real_dma_subdevice(dev)) { 2531 info->bus = bus; 2532 info->devfn = devfn; 2533 info->segment = iommu->segment; 2534 } else { 2535 struct pci_dev *pdev = to_pci_dev(dev); 2536 2537 info->bus = pdev->bus->number; 2538 info->devfn = pdev->devfn; 2539 info->segment = pci_domain_nr(pdev->bus); 2540 } 2541 2542 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2543 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2544 info->ats_qdep = 0; 2545 info->dev = dev; 2546 info->domain = domain; 2547 info->iommu = iommu; 2548 info->pasid_table = NULL; 2549 info->auxd_enabled = 0; 2550 INIT_LIST_HEAD(&info->auxiliary_domains); 2551 2552 if (dev && dev_is_pci(dev)) { 2553 struct pci_dev *pdev = to_pci_dev(info->dev); 2554 2555 if (ecap_dev_iotlb_support(iommu->ecap) && 2556 pci_ats_supported(pdev) && 2557 dmar_find_matched_atsr_unit(pdev)) 2558 info->ats_supported = 1; 2559 2560 if (sm_supported(iommu)) { 2561 if (pasid_supported(iommu)) { 2562 int features = pci_pasid_features(pdev); 2563 if (features >= 0) 2564 info->pasid_supported = features | 1; 2565 } 2566 2567 if (info->ats_supported && ecap_prs(iommu->ecap) && 2568 pci_pri_supported(pdev)) 2569 info->pri_supported = 1; 2570 } 2571 } 2572 2573 spin_lock_irqsave(&device_domain_lock, flags); 2574 if (dev) 2575 found = find_domain(dev); 2576 2577 if (!found) { 2578 struct device_domain_info *info2; 2579 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2580 info->devfn); 2581 if (info2) { 2582 found = info2->domain; 2583 info2->dev = dev; 2584 } 2585 } 2586 2587 if (found) { 2588 spin_unlock_irqrestore(&device_domain_lock, flags); 2589 free_devinfo_mem(info); 2590 /* Caller must free the original domain */ 2591 return found; 2592 } 2593 2594 spin_lock(&iommu->lock); 2595 ret = domain_attach_iommu(domain, iommu); 2596 spin_unlock(&iommu->lock); 2597 2598 if (ret) { 2599 spin_unlock_irqrestore(&device_domain_lock, flags); 2600 free_devinfo_mem(info); 2601 return NULL; 2602 } 2603 2604 list_add(&info->link, &domain->devices); 2605 list_add(&info->global, &device_domain_list); 2606 if (dev) 2607 dev_iommu_priv_set(dev, info); 2608 spin_unlock_irqrestore(&device_domain_lock, flags); 2609 2610 /* PASID table is mandatory for a PCI device in scalable mode. 
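   The table also backs the RID2PASID entry set up just below, which is
   what translates DMA requests arriving without a PASID.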
*/ 2611 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2612 ret = intel_pasid_alloc_table(dev); 2613 if (ret) { 2614 dev_err(dev, "PASID table allocation failed\n"); 2615 dmar_remove_one_dev_info(dev); 2616 return NULL; 2617 } 2618 2619 /* Setup the PASID entry for requests without PASID: */ 2620 spin_lock_irqsave(&iommu->lock, flags); 2621 if (hw_pass_through && domain_type_is_si(domain)) 2622 ret = intel_pasid_setup_pass_through(iommu, domain, 2623 dev, PASID_RID2PASID); 2624 else if (domain_use_first_level(domain)) 2625 ret = domain_setup_first_level(iommu, domain, dev, 2626 PASID_RID2PASID); 2627 else 2628 ret = intel_pasid_setup_second_level(iommu, domain, 2629 dev, PASID_RID2PASID); 2630 spin_unlock_irqrestore(&iommu->lock, flags); 2631 if (ret) { 2632 dev_err(dev, "Setup RID2PASID failed\n"); 2633 dmar_remove_one_dev_info(dev); 2634 return NULL; 2635 } 2636 } 2637 2638 if (dev && domain_context_mapping(domain, dev)) { 2639 dev_err(dev, "Domain context map failed\n"); 2640 dmar_remove_one_dev_info(dev); 2641 return NULL; 2642 } 2643 2644 return domain; 2645 } 2646 2647 static int iommu_domain_identity_map(struct dmar_domain *domain, 2648 unsigned long first_vpfn, 2649 unsigned long last_vpfn) 2650 { 2651 /* 2652 * RMRR range might have overlap with physical memory range, 2653 * clear it first 2654 */ 2655 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2656 2657 return __domain_mapping(domain, first_vpfn, 2658 first_vpfn, last_vpfn - first_vpfn + 1, 2659 DMA_PTE_READ|DMA_PTE_WRITE); 2660 } 2661 2662 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2663 2664 static int __init si_domain_init(int hw) 2665 { 2666 struct dmar_rmrr_unit *rmrr; 2667 struct device *dev; 2668 int i, nid, ret; 2669 2670 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2671 if (!si_domain) 2672 return -EFAULT; 2673 2674 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2675 domain_exit(si_domain); 2676 return -EFAULT; 2677 } 2678 2679 if (hw) 2680 return 0; 2681 2682 for_each_online_node(nid) { 2683 unsigned long start_pfn, end_pfn; 2684 int i; 2685 2686 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2687 ret = iommu_domain_identity_map(si_domain, 2688 mm_to_dma_pfn(start_pfn), 2689 mm_to_dma_pfn(end_pfn)); 2690 if (ret) 2691 return ret; 2692 } 2693 } 2694 2695 /* 2696 * Identity map the RMRRs so that devices with RMRRs could also use 2697 * the si_domain. 
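 * (Typical RMRRs cover regions such as USB legacy-keyboard buffers or
 * IGD stolen memory that firmware expects to stay identity-mapped.)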
2698 */ 2699 for_each_rmrr_units(rmrr) { 2700 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2701 i, dev) { 2702 unsigned long long start = rmrr->base_address; 2703 unsigned long long end = rmrr->end_address; 2704 2705 if (WARN_ON(end < start || 2706 end >> agaw_to_width(si_domain->agaw))) 2707 continue; 2708 2709 ret = iommu_domain_identity_map(si_domain, 2710 mm_to_dma_pfn(start >> PAGE_SHIFT), 2711 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2712 if (ret) 2713 return ret; 2714 } 2715 } 2716 2717 return 0; 2718 } 2719 2720 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2721 { 2722 struct dmar_domain *ndomain; 2723 struct intel_iommu *iommu; 2724 u8 bus, devfn; 2725 2726 iommu = device_to_iommu(dev, &bus, &devfn); 2727 if (!iommu) 2728 return -ENODEV; 2729 2730 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2731 if (ndomain != domain) 2732 return -EBUSY; 2733 2734 return 0; 2735 } 2736 2737 static bool device_has_rmrr(struct device *dev) 2738 { 2739 struct dmar_rmrr_unit *rmrr; 2740 struct device *tmp; 2741 int i; 2742 2743 rcu_read_lock(); 2744 for_each_rmrr_units(rmrr) { 2745 /* 2746 * Return TRUE if this RMRR contains the device that 2747 * is passed in. 2748 */ 2749 for_each_active_dev_scope(rmrr->devices, 2750 rmrr->devices_cnt, i, tmp) 2751 if (tmp == dev || 2752 is_downstream_to_pci_bridge(dev, tmp)) { 2753 rcu_read_unlock(); 2754 return true; 2755 } 2756 } 2757 rcu_read_unlock(); 2758 return false; 2759 } 2760 2761 /** 2762 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2763 * is relaxable (ie. is allowed to be not enforced under some conditions) 2764 * @dev: device handle 2765 * 2766 * We assume that PCI USB devices with RMRRs have them largely 2767 * for historical reasons and that the RMRR space is not actively used post 2768 * boot. This exclusion may change if vendors begin to abuse it. 2769 * 2770 * The same exception is made for graphics devices, with the requirement that 2771 * any use of the RMRR regions will be torn down before assigning the device 2772 * to a guest. 2773 * 2774 * Return: true if the RMRR is relaxable, false otherwise 2775 */ 2776 static bool device_rmrr_is_relaxable(struct device *dev) 2777 { 2778 struct pci_dev *pdev; 2779 2780 if (!dev_is_pci(dev)) 2781 return false; 2782 2783 pdev = to_pci_dev(dev); 2784 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2785 return true; 2786 else 2787 return false; 2788 } 2789 2790 /* 2791 * There are a couple cases where we need to restrict the functionality of 2792 * devices associated with RMRRs. The first is when evaluating a device for 2793 * identity mapping because problems exist when devices are moved in and out 2794 * of domains and their respective RMRR information is lost. This means that 2795 * a device with associated RMRRs will never be in a "passthrough" domain. 2796 * The second is use of the device through the IOMMU API. This interface 2797 * expects to have full control of the IOVA space for the device. We cannot 2798 * satisfy both the requirement that RMRR access is maintained and have an 2799 * unencumbered IOVA space. We also have no ability to quiesce the device's 2800 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2801 * We therefore prevent devices associated with an RMRR from participating in 2802 * the IOMMU API, which eliminates them from device assignment. 2803 * 2804 * In both cases, devices which have relaxable RMRRs are not concerned by this 2805 * restriction. 
See device_rmrr_is_relaxable comment. 2806 */ 2807 static bool device_is_rmrr_locked(struct device *dev) 2808 { 2809 if (!device_has_rmrr(dev)) 2810 return false; 2811 2812 if (device_rmrr_is_relaxable(dev)) 2813 return false; 2814 2815 return true; 2816 } 2817 2818 /* 2819 * Return the required default domain type for a specific device. 2820 * 2821 * @dev: the device in query 2822 * @startup: true if this is during early boot 2823 * 2824 * Returns: 2825 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2826 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2827 * - 0: both identity and dynamic domains work for this device 2828 */ 2829 static int device_def_domain_type(struct device *dev) 2830 { 2831 if (dev_is_pci(dev)) { 2832 struct pci_dev *pdev = to_pci_dev(dev); 2833 2834 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2835 return IOMMU_DOMAIN_IDENTITY; 2836 2837 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2838 return IOMMU_DOMAIN_IDENTITY; 2839 } 2840 2841 return 0; 2842 } 2843 2844 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2845 { 2846 /* 2847 * Start from the sane iommu hardware state. 2848 * If the queued invalidation is already initialized by us 2849 * (for example, while enabling interrupt-remapping) then 2850 * we got the things already rolling from a sane state. 2851 */ 2852 if (!iommu->qi) { 2853 /* 2854 * Clear any previous faults. 2855 */ 2856 dmar_fault(-1, iommu); 2857 /* 2858 * Disable queued invalidation if supported and already enabled 2859 * before OS handover. 2860 */ 2861 dmar_disable_qi(iommu); 2862 } 2863 2864 if (dmar_enable_qi(iommu)) { 2865 /* 2866 * Queued Invalidate not enabled, use Register Based Invalidate 2867 */ 2868 iommu->flush.flush_context = __iommu_flush_context; 2869 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2870 pr_info("%s: Using Register based invalidation\n", 2871 iommu->name); 2872 } else { 2873 iommu->flush.flush_context = qi_flush_context; 2874 iommu->flush.flush_iotlb = qi_flush_iotlb; 2875 pr_info("%s: Using Queued invalidation\n", iommu->name); 2876 } 2877 } 2878 2879 static int copy_context_table(struct intel_iommu *iommu, 2880 struct root_entry *old_re, 2881 struct context_entry **tbl, 2882 int bus, bool ext) 2883 { 2884 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2885 struct context_entry *new_ce = NULL, ce; 2886 struct context_entry *old_ce = NULL; 2887 struct root_entry re; 2888 phys_addr_t old_ce_phys; 2889 2890 tbl_idx = ext ? bus * 2 : bus; 2891 memcpy(&re, old_re, sizeof(re)); 2892 2893 for (devfn = 0; devfn < 256; devfn++) { 2894 /* First calculate the correct index */ 2895 idx = (ext ? 
devfn * 2 : devfn) % 256; 2896 2897 if (idx == 0) { 2898 /* First save what we may have and clean up */ 2899 if (new_ce) { 2900 tbl[tbl_idx] = new_ce; 2901 __iommu_flush_cache(iommu, new_ce, 2902 VTD_PAGE_SIZE); 2903 pos = 1; 2904 } 2905 2906 if (old_ce) 2907 memunmap(old_ce); 2908 2909 ret = 0; 2910 if (devfn < 0x80) 2911 old_ce_phys = root_entry_lctp(&re); 2912 else 2913 old_ce_phys = root_entry_uctp(&re); 2914 2915 if (!old_ce_phys) { 2916 if (ext && devfn == 0) { 2917 /* No LCTP, try UCTP */ 2918 devfn = 0x7f; 2919 continue; 2920 } else { 2921 goto out; 2922 } 2923 } 2924 2925 ret = -ENOMEM; 2926 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2927 MEMREMAP_WB); 2928 if (!old_ce) 2929 goto out; 2930 2931 new_ce = alloc_pgtable_page(iommu->node); 2932 if (!new_ce) 2933 goto out_unmap; 2934 2935 ret = 0; 2936 } 2937 2938 /* Now copy the context entry */ 2939 memcpy(&ce, old_ce + idx, sizeof(ce)); 2940 2941 if (!__context_present(&ce)) 2942 continue; 2943 2944 did = context_domain_id(&ce); 2945 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2946 set_bit(did, iommu->domain_ids); 2947 2948 /* 2949 * We need a marker for copied context entries. This 2950 * marker needs to work for the old format as well as 2951 * for extended context entries. 2952 * 2953 * Bit 67 of the context entry is used. In the old 2954 * format this bit is available to software, in the 2955 * extended format it is the PGE bit, but PGE is ignored 2956 * by HW if PASIDs are disabled (and thus still 2957 * available). 2958 * 2959 * So disable PASIDs first and then mark the entry 2960 * copied. This means that we don't copy PASID 2961 * translations from the old kernel, but this is fine as 2962 * faults there are not fatal. 2963 */ 2964 context_clear_pasid_enable(&ce); 2965 context_set_copied(&ce); 2966 2967 new_ce[idx] = ce; 2968 } 2969 2970 tbl[tbl_idx + pos] = new_ce; 2971 2972 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2973 2974 out_unmap: 2975 memunmap(old_ce); 2976 2977 out: 2978 return ret; 2979 } 2980 2981 static int copy_translation_tables(struct intel_iommu *iommu) 2982 { 2983 struct context_entry **ctxt_tbls; 2984 struct root_entry *old_rt; 2985 phys_addr_t old_rt_phys; 2986 int ctxt_table_entries; 2987 unsigned long flags; 2988 u64 rtaddr_reg; 2989 int bus, ret; 2990 bool new_ext, ext; 2991 2992 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2993 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2994 new_ext = !!ecap_ecs(iommu->ecap); 2995 2996 /* 2997 * The RTT bit can only be changed when translation is disabled, 2998 * but disabling translation means to open a window for data 2999 * corruption. So bail out and don't copy anything if we would 3000 * have to change the bit. 3001 */ 3002 if (new_ext != ext) 3003 return -EINVAL; 3004 3005 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3006 if (!old_rt_phys) 3007 return -EINVAL; 3008 3009 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3010 if (!old_rt) 3011 return -ENOMEM; 3012 3013 /* This is too big for the stack - allocate it from slab */ 3014 ctxt_table_entries = ext ? 
512 : 256; 3015 ret = -ENOMEM; 3016 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3017 if (!ctxt_tbls) 3018 goto out_unmap; 3019 3020 for (bus = 0; bus < 256; bus++) { 3021 ret = copy_context_table(iommu, &old_rt[bus], 3022 ctxt_tbls, bus, ext); 3023 if (ret) { 3024 pr_err("%s: Failed to copy context table for bus %d\n", 3025 iommu->name, bus); 3026 continue; 3027 } 3028 } 3029 3030 spin_lock_irqsave(&iommu->lock, flags); 3031 3032 /* Context tables are copied, now write them to the root_entry table */ 3033 for (bus = 0; bus < 256; bus++) { 3034 int idx = ext ? bus * 2 : bus; 3035 u64 val; 3036 3037 if (ctxt_tbls[idx]) { 3038 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3039 iommu->root_entry[bus].lo = val; 3040 } 3041 3042 if (!ext || !ctxt_tbls[idx + 1]) 3043 continue; 3044 3045 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3046 iommu->root_entry[bus].hi = val; 3047 } 3048 3049 spin_unlock_irqrestore(&iommu->lock, flags); 3050 3051 kfree(ctxt_tbls); 3052 3053 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3054 3055 ret = 0; 3056 3057 out_unmap: 3058 memunmap(old_rt); 3059 3060 return ret; 3061 } 3062 3063 #ifdef CONFIG_INTEL_IOMMU_SVM 3064 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3065 { 3066 struct intel_iommu *iommu = data; 3067 ioasid_t ioasid; 3068 3069 if (!iommu) 3070 return INVALID_IOASID; 3071 /* 3072 * VT-d virtual command interface always uses the full 20 bit 3073 * PASID range. Host can partition guest PASID range based on 3074 * policies but it is out of guest's control. 3075 */ 3076 if (min < PASID_MIN || max > intel_pasid_max_id) 3077 return INVALID_IOASID; 3078 3079 if (vcmd_alloc_pasid(iommu, &ioasid)) 3080 return INVALID_IOASID; 3081 3082 return ioasid; 3083 } 3084 3085 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3086 { 3087 struct intel_iommu *iommu = data; 3088 3089 if (!iommu) 3090 return; 3091 /* 3092 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3093 * We can only free the PASID when all the devices are unbound. 3094 */ 3095 if (ioasid_find(NULL, ioasid, NULL)) { 3096 pr_alert("Cannot free active IOASID %d\n", ioasid); 3097 return; 3098 } 3099 vcmd_free_pasid(iommu, ioasid); 3100 } 3101 3102 static void register_pasid_allocator(struct intel_iommu *iommu) 3103 { 3104 /* 3105 * If we are running in the host, no need for custom allocator 3106 * in that PASIDs are allocated from the host system-wide. 3107 */ 3108 if (!cap_caching_mode(iommu->cap)) 3109 return; 3110 3111 if (!sm_supported(iommu)) { 3112 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3113 return; 3114 } 3115 3116 /* 3117 * Register a custom PASID allocator if we are running in a guest, 3118 * guest PASID must be obtained via virtual command interface. 3119 * There can be multiple vIOMMUs in each guest but only one allocator 3120 * is active. All vIOMMU allocators will eventually be calling the same 3121 * host allocator. 3122 */ 3123 if (!vccap_pasid(iommu->vccap)) 3124 return; 3125 3126 pr_info("Register custom PASID allocator\n"); 3127 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3128 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3129 iommu->pasid_allocator.pdata = (void *)iommu; 3130 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3131 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3132 /* 3133 * Disable scalable mode on this IOMMU if there 3134 * is no custom allocator. 
Mixing SM capable vIOMMU 3135 * and non-SM vIOMMU are not supported. 3136 */ 3137 intel_iommu_sm = 0; 3138 } 3139 } 3140 #endif 3141 3142 static int __init init_dmars(void) 3143 { 3144 struct dmar_drhd_unit *drhd; 3145 struct intel_iommu *iommu; 3146 int ret; 3147 3148 /* 3149 * for each drhd 3150 * allocate root 3151 * initialize and program root entry to not present 3152 * endfor 3153 */ 3154 for_each_drhd_unit(drhd) { 3155 /* 3156 * lock not needed as this is only incremented in the single 3157 * threaded kernel __init code path all other access are read 3158 * only 3159 */ 3160 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3161 g_num_of_iommus++; 3162 continue; 3163 } 3164 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3165 } 3166 3167 /* Preallocate enough resources for IOMMU hot-addition */ 3168 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3169 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3170 3171 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3172 GFP_KERNEL); 3173 if (!g_iommus) { 3174 pr_err("Allocating global iommu array failed\n"); 3175 ret = -ENOMEM; 3176 goto error; 3177 } 3178 3179 for_each_iommu(iommu, drhd) { 3180 if (drhd->ignored) { 3181 iommu_disable_translation(iommu); 3182 continue; 3183 } 3184 3185 /* 3186 * Find the max pasid size of all IOMMU's in the system. 3187 * We need to ensure the system pasid table is no bigger 3188 * than the smallest supported. 3189 */ 3190 if (pasid_supported(iommu)) { 3191 u32 temp = 2 << ecap_pss(iommu->ecap); 3192 3193 intel_pasid_max_id = min_t(u32, temp, 3194 intel_pasid_max_id); 3195 } 3196 3197 g_iommus[iommu->seq_id] = iommu; 3198 3199 intel_iommu_init_qi(iommu); 3200 3201 ret = iommu_init_domains(iommu); 3202 if (ret) 3203 goto free_iommu; 3204 3205 init_translation_status(iommu); 3206 3207 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3208 iommu_disable_translation(iommu); 3209 clear_translation_pre_enabled(iommu); 3210 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3211 iommu->name); 3212 } 3213 3214 /* 3215 * TBD: 3216 * we could share the same root & context tables 3217 * among all IOMMU's. Need to Split it later. 3218 */ 3219 ret = iommu_alloc_root_entry(iommu); 3220 if (ret) 3221 goto free_iommu; 3222 3223 if (translation_pre_enabled(iommu)) { 3224 pr_info("Translation already enabled - trying to copy translation structures\n"); 3225 3226 ret = copy_translation_tables(iommu); 3227 if (ret) { 3228 /* 3229 * We found the IOMMU with translation 3230 * enabled - but failed to copy over the 3231 * old root-entry table. Try to proceed 3232 * by disabling translation now and 3233 * allocating a clean root-entry table. 3234 * This might cause DMAR faults, but 3235 * probably the dump will still succeed. 3236 */ 3237 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3238 iommu->name); 3239 iommu_disable_translation(iommu); 3240 clear_translation_pre_enabled(iommu); 3241 } else { 3242 pr_info("Copied translation tables from previous kernel for %s\n", 3243 iommu->name); 3244 } 3245 } 3246 3247 if (!ecap_pass_through(iommu->ecap)) 3248 hw_pass_through = 0; 3249 intel_svm_check(iommu); 3250 } 3251 3252 /* 3253 * Now that qi is enabled on all iommus, set the root entry and flush 3254 * caches. This is required on some Intel X58 chipsets, otherwise the 3255 * flush_context function will loop forever and the boot hangs. 
3256 */ 3257 for_each_active_iommu(iommu, drhd) { 3258 iommu_flush_write_buffer(iommu); 3259 #ifdef CONFIG_INTEL_IOMMU_SVM 3260 register_pasid_allocator(iommu); 3261 #endif 3262 iommu_set_root_entry(iommu); 3263 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3264 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3265 } 3266 3267 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3268 dmar_map_gfx = 0; 3269 #endif 3270 3271 if (!dmar_map_gfx) 3272 iommu_identity_mapping |= IDENTMAP_GFX; 3273 3274 check_tylersburg_isoch(); 3275 3276 ret = si_domain_init(hw_pass_through); 3277 if (ret) 3278 goto free_iommu; 3279 3280 /* 3281 * for each drhd 3282 * enable fault log 3283 * global invalidate context cache 3284 * global invalidate iotlb 3285 * enable translation 3286 */ 3287 for_each_iommu(iommu, drhd) { 3288 if (drhd->ignored) { 3289 /* 3290 * we always have to disable PMRs or DMA may fail on 3291 * this device 3292 */ 3293 if (force_on) 3294 iommu_disable_protect_mem_regions(iommu); 3295 continue; 3296 } 3297 3298 iommu_flush_write_buffer(iommu); 3299 3300 #ifdef CONFIG_INTEL_IOMMU_SVM 3301 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3302 /* 3303 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3304 * could cause possible lock race condition. 3305 */ 3306 up_write(&dmar_global_lock); 3307 ret = intel_svm_enable_prq(iommu); 3308 down_write(&dmar_global_lock); 3309 if (ret) 3310 goto free_iommu; 3311 } 3312 #endif 3313 ret = dmar_set_interrupt(iommu); 3314 if (ret) 3315 goto free_iommu; 3316 } 3317 3318 return 0; 3319 3320 free_iommu: 3321 for_each_active_iommu(iommu, drhd) { 3322 disable_dmar_iommu(iommu); 3323 free_dmar_iommu(iommu); 3324 } 3325 3326 kfree(g_iommus); 3327 3328 error: 3329 return ret; 3330 } 3331 3332 static inline int iommu_domain_cache_init(void) 3333 { 3334 int ret = 0; 3335 3336 iommu_domain_cache = kmem_cache_create("iommu_domain", 3337 sizeof(struct dmar_domain), 3338 0, 3339 SLAB_HWCACHE_ALIGN, 3340 3341 NULL); 3342 if (!iommu_domain_cache) { 3343 pr_err("Couldn't create iommu_domain cache\n"); 3344 ret = -ENOMEM; 3345 } 3346 3347 return ret; 3348 } 3349 3350 static inline int iommu_devinfo_cache_init(void) 3351 { 3352 int ret = 0; 3353 3354 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3355 sizeof(struct device_domain_info), 3356 0, 3357 SLAB_HWCACHE_ALIGN, 3358 NULL); 3359 if (!iommu_devinfo_cache) { 3360 pr_err("Couldn't create devinfo cache\n"); 3361 ret = -ENOMEM; 3362 } 3363 3364 return ret; 3365 } 3366 3367 static int __init iommu_init_mempool(void) 3368 { 3369 int ret; 3370 ret = iova_cache_get(); 3371 if (ret) 3372 return ret; 3373 3374 ret = iommu_domain_cache_init(); 3375 if (ret) 3376 goto domain_error; 3377 3378 ret = iommu_devinfo_cache_init(); 3379 if (!ret) 3380 return ret; 3381 3382 kmem_cache_destroy(iommu_domain_cache); 3383 domain_error: 3384 iova_cache_put(); 3385 3386 return -ENOMEM; 3387 } 3388 3389 static void __init iommu_exit_mempool(void) 3390 { 3391 kmem_cache_destroy(iommu_devinfo_cache); 3392 kmem_cache_destroy(iommu_domain_cache); 3393 iova_cache_put(); 3394 } 3395 3396 static void __init init_no_remapping_devices(void) 3397 { 3398 struct dmar_drhd_unit *drhd; 3399 struct device *dev; 3400 int i; 3401 3402 for_each_drhd_unit(drhd) { 3403 if (!drhd->include_all) { 3404 for_each_active_dev_scope(drhd->devices, 3405 drhd->devices_cnt, i, dev) 3406 break; 3407 /* ignore DMAR unit if no devices exist */ 3408 if (i == drhd->devices_cnt) 3409 drhd->ignored = 1; 3410 } 3411 } 3412 3413 
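	/*
	 * Second pass: find DRHD units whose device scope contains only
	 * graphics devices; such units can be bypassed entirely when gfx
	 * mapping is disabled.
	 */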
for_each_active_drhd_unit(drhd) { 3414 if (drhd->include_all) 3415 continue; 3416 3417 for_each_active_dev_scope(drhd->devices, 3418 drhd->devices_cnt, i, dev) 3419 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3420 break; 3421 if (i < drhd->devices_cnt) 3422 continue; 3423 3424 /* This IOMMU has *only* gfx devices. Either bypass it or 3425 set the gfx_mapped flag, as appropriate */ 3426 drhd->gfx_dedicated = 1; 3427 if (!dmar_map_gfx) 3428 drhd->ignored = 1; 3429 } 3430 } 3431 3432 #ifdef CONFIG_SUSPEND 3433 static int init_iommu_hw(void) 3434 { 3435 struct dmar_drhd_unit *drhd; 3436 struct intel_iommu *iommu = NULL; 3437 3438 for_each_active_iommu(iommu, drhd) 3439 if (iommu->qi) 3440 dmar_reenable_qi(iommu); 3441 3442 for_each_iommu(iommu, drhd) { 3443 if (drhd->ignored) { 3444 /* 3445 * we always have to disable PMRs or DMA may fail on 3446 * this device 3447 */ 3448 if (force_on) 3449 iommu_disable_protect_mem_regions(iommu); 3450 continue; 3451 } 3452 3453 iommu_flush_write_buffer(iommu); 3454 3455 iommu_set_root_entry(iommu); 3456 3457 iommu->flush.flush_context(iommu, 0, 0, 0, 3458 DMA_CCMD_GLOBAL_INVL); 3459 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3460 iommu_enable_translation(iommu); 3461 iommu_disable_protect_mem_regions(iommu); 3462 } 3463 3464 return 0; 3465 } 3466 3467 static void iommu_flush_all(void) 3468 { 3469 struct dmar_drhd_unit *drhd; 3470 struct intel_iommu *iommu; 3471 3472 for_each_active_iommu(iommu, drhd) { 3473 iommu->flush.flush_context(iommu, 0, 0, 0, 3474 DMA_CCMD_GLOBAL_INVL); 3475 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3476 DMA_TLB_GLOBAL_FLUSH); 3477 } 3478 } 3479 3480 static int iommu_suspend(void) 3481 { 3482 struct dmar_drhd_unit *drhd; 3483 struct intel_iommu *iommu = NULL; 3484 unsigned long flag; 3485 3486 for_each_active_iommu(iommu, drhd) { 3487 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3488 GFP_KERNEL); 3489 if (!iommu->iommu_state) 3490 goto nomem; 3491 } 3492 3493 iommu_flush_all(); 3494 3495 for_each_active_iommu(iommu, drhd) { 3496 iommu_disable_translation(iommu); 3497 3498 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3499 3500 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3501 readl(iommu->reg + DMAR_FECTL_REG); 3502 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3503 readl(iommu->reg + DMAR_FEDATA_REG); 3504 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3505 readl(iommu->reg + DMAR_FEADDR_REG); 3506 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3507 readl(iommu->reg + DMAR_FEUADDR_REG); 3508 3509 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3510 } 3511 return 0; 3512 3513 nomem: 3514 for_each_active_iommu(iommu, drhd) 3515 kfree(iommu->iommu_state); 3516 3517 return -ENOMEM; 3518 } 3519 3520 static void iommu_resume(void) 3521 { 3522 struct dmar_drhd_unit *drhd; 3523 struct intel_iommu *iommu = NULL; 3524 unsigned long flag; 3525 3526 if (init_iommu_hw()) { 3527 if (force_on) 3528 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3529 else 3530 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3531 return; 3532 } 3533 3534 for_each_active_iommu(iommu, drhd) { 3535 3536 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3537 3538 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3539 iommu->reg + DMAR_FECTL_REG); 3540 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3541 iommu->reg + DMAR_FEDATA_REG); 3542 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3543 iommu->reg + DMAR_FEADDR_REG); 3544 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3545 iommu->reg + DMAR_FEUADDR_REG); 
3546 3547 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3548 } 3549 3550 for_each_active_iommu(iommu, drhd) 3551 kfree(iommu->iommu_state); 3552 } 3553 3554 static struct syscore_ops iommu_syscore_ops = { 3555 .resume = iommu_resume, 3556 .suspend = iommu_suspend, 3557 }; 3558 3559 static void __init init_iommu_pm_ops(void) 3560 { 3561 register_syscore_ops(&iommu_syscore_ops); 3562 } 3563 3564 #else 3565 static inline void init_iommu_pm_ops(void) {} 3566 #endif /* CONFIG_PM */ 3567 3568 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3569 { 3570 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3571 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3572 rmrr->end_address <= rmrr->base_address || 3573 arch_rmrr_sanity_check(rmrr)) 3574 return -EINVAL; 3575 3576 return 0; 3577 } 3578 3579 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3580 { 3581 struct acpi_dmar_reserved_memory *rmrr; 3582 struct dmar_rmrr_unit *rmrru; 3583 3584 rmrr = (struct acpi_dmar_reserved_memory *)header; 3585 if (rmrr_sanity_check(rmrr)) { 3586 pr_warn(FW_BUG 3587 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3588 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3589 rmrr->base_address, rmrr->end_address, 3590 dmi_get_system_info(DMI_BIOS_VENDOR), 3591 dmi_get_system_info(DMI_BIOS_VERSION), 3592 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3593 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3594 } 3595 3596 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3597 if (!rmrru) 3598 goto out; 3599 3600 rmrru->hdr = header; 3601 3602 rmrru->base_address = rmrr->base_address; 3603 rmrru->end_address = rmrr->end_address; 3604 3605 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3606 ((void *)rmrr) + rmrr->header.length, 3607 &rmrru->devices_cnt); 3608 if (rmrru->devices_cnt && rmrru->devices == NULL) 3609 goto free_rmrru; 3610 3611 list_add(&rmrru->list, &dmar_rmrr_units); 3612 3613 return 0; 3614 free_rmrru: 3615 kfree(rmrru); 3616 out: 3617 return -ENOMEM; 3618 } 3619 3620 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3621 { 3622 struct dmar_atsr_unit *atsru; 3623 struct acpi_dmar_atsr *tmp; 3624 3625 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3626 dmar_rcu_check()) { 3627 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3628 if (atsr->segment != tmp->segment) 3629 continue; 3630 if (atsr->header.length != tmp->header.length) 3631 continue; 3632 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3633 return atsru; 3634 } 3635 3636 return NULL; 3637 } 3638 3639 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3640 { 3641 struct acpi_dmar_atsr *atsr; 3642 struct dmar_atsr_unit *atsru; 3643 3644 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3645 return 0; 3646 3647 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3648 atsru = dmar_find_atsr(atsr); 3649 if (atsru) 3650 return 0; 3651 3652 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3653 if (!atsru) 3654 return -ENOMEM; 3655 3656 /* 3657 * If memory is allocated from slab by ACPI _DSM method, we need to 3658 * copy the memory content because the memory buffer will be freed 3659 * on return. 
3660 */ 3661 atsru->hdr = (void *)(atsru + 1); 3662 memcpy(atsru->hdr, hdr, hdr->length); 3663 atsru->include_all = atsr->flags & 0x1; 3664 if (!atsru->include_all) { 3665 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3666 (void *)atsr + atsr->header.length, 3667 &atsru->devices_cnt); 3668 if (atsru->devices_cnt && atsru->devices == NULL) { 3669 kfree(atsru); 3670 return -ENOMEM; 3671 } 3672 } 3673 3674 list_add_rcu(&atsru->list, &dmar_atsr_units); 3675 3676 return 0; 3677 } 3678 3679 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3680 { 3681 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3682 kfree(atsru); 3683 } 3684 3685 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3686 { 3687 struct acpi_dmar_atsr *atsr; 3688 struct dmar_atsr_unit *atsru; 3689 3690 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3691 atsru = dmar_find_atsr(atsr); 3692 if (atsru) { 3693 list_del_rcu(&atsru->list); 3694 synchronize_rcu(); 3695 intel_iommu_free_atsr(atsru); 3696 } 3697 3698 return 0; 3699 } 3700 3701 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3702 { 3703 int i; 3704 struct device *dev; 3705 struct acpi_dmar_atsr *atsr; 3706 struct dmar_atsr_unit *atsru; 3707 3708 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3709 atsru = dmar_find_atsr(atsr); 3710 if (!atsru) 3711 return 0; 3712 3713 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3714 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3715 i, dev) 3716 return -EBUSY; 3717 } 3718 3719 return 0; 3720 } 3721 3722 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3723 { 3724 int sp, ret; 3725 struct intel_iommu *iommu = dmaru->iommu; 3726 3727 if (g_iommus[iommu->seq_id]) 3728 return 0; 3729 3730 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3731 pr_warn("%s: Doesn't support hardware pass through.\n", 3732 iommu->name); 3733 return -ENXIO; 3734 } 3735 if (!ecap_sc_support(iommu->ecap) && 3736 domain_update_iommu_snooping(iommu)) { 3737 pr_warn("%s: Doesn't support snooping.\n", 3738 iommu->name); 3739 return -ENXIO; 3740 } 3741 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3742 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3743 pr_warn("%s: Doesn't support large page.\n", 3744 iommu->name); 3745 return -ENXIO; 3746 } 3747 3748 /* 3749 * Disable translation if already enabled prior to OS handover. 
3750 */ 3751 if (iommu->gcmd & DMA_GCMD_TE) 3752 iommu_disable_translation(iommu); 3753 3754 g_iommus[iommu->seq_id] = iommu; 3755 ret = iommu_init_domains(iommu); 3756 if (ret == 0) 3757 ret = iommu_alloc_root_entry(iommu); 3758 if (ret) 3759 goto out; 3760 3761 intel_svm_check(iommu); 3762 3763 if (dmaru->ignored) { 3764 /* 3765 * we always have to disable PMRs or DMA may fail on this device 3766 */ 3767 if (force_on) 3768 iommu_disable_protect_mem_regions(iommu); 3769 return 0; 3770 } 3771 3772 intel_iommu_init_qi(iommu); 3773 iommu_flush_write_buffer(iommu); 3774 3775 #ifdef CONFIG_INTEL_IOMMU_SVM 3776 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3777 ret = intel_svm_enable_prq(iommu); 3778 if (ret) 3779 goto disable_iommu; 3780 } 3781 #endif 3782 ret = dmar_set_interrupt(iommu); 3783 if (ret) 3784 goto disable_iommu; 3785 3786 iommu_set_root_entry(iommu); 3787 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3788 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3789 iommu_enable_translation(iommu); 3790 3791 iommu_disable_protect_mem_regions(iommu); 3792 return 0; 3793 3794 disable_iommu: 3795 disable_dmar_iommu(iommu); 3796 out: 3797 free_dmar_iommu(iommu); 3798 return ret; 3799 } 3800 3801 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3802 { 3803 int ret = 0; 3804 struct intel_iommu *iommu = dmaru->iommu; 3805 3806 if (!intel_iommu_enabled) 3807 return 0; 3808 if (iommu == NULL) 3809 return -EINVAL; 3810 3811 if (insert) { 3812 ret = intel_iommu_add(dmaru); 3813 } else { 3814 disable_dmar_iommu(iommu); 3815 free_dmar_iommu(iommu); 3816 } 3817 3818 return ret; 3819 } 3820 3821 static void intel_iommu_free_dmars(void) 3822 { 3823 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3824 struct dmar_atsr_unit *atsru, *atsr_n; 3825 3826 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3827 list_del(&rmrru->list); 3828 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3829 kfree(rmrru); 3830 } 3831 3832 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3833 list_del(&atsru->list); 3834 intel_iommu_free_atsr(atsru); 3835 } 3836 } 3837 3838 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 3839 { 3840 int i, ret = 1; 3841 struct pci_bus *bus; 3842 struct pci_dev *bridge = NULL; 3843 struct device *tmp; 3844 struct acpi_dmar_atsr *atsr; 3845 struct dmar_atsr_unit *atsru; 3846 3847 dev = pci_physfn(dev); 3848 for (bus = dev->bus; bus; bus = bus->parent) { 3849 bridge = bus->self; 3850 /* If it's an integrated device, allow ATS */ 3851 if (!bridge) 3852 return 1; 3853 /* Connected via non-PCIe: no ATS */ 3854 if (!pci_is_pcie(bridge) || 3855 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3856 return 0; 3857 /* If we found the root port, look it up in the ATSR */ 3858 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3859 break; 3860 } 3861 3862 rcu_read_lock(); 3863 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3864 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3865 if (atsr->segment != pci_domain_nr(dev->bus)) 3866 continue; 3867 3868 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3869 if (tmp == &bridge->dev) 3870 goto out; 3871 3872 if (atsru->include_all) 3873 goto out; 3874 } 3875 ret = 0; 3876 out: 3877 rcu_read_unlock(); 3878 3879 return ret; 3880 } 3881 3882 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3883 { 3884 int ret; 3885 struct dmar_rmrr_unit *rmrru; 3886 struct dmar_atsr_unit *atsru; 3887 struct acpi_dmar_atsr 
*atsr; 3888 struct acpi_dmar_reserved_memory *rmrr; 3889 3890 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3891 return 0; 3892 3893 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3894 rmrr = container_of(rmrru->hdr, 3895 struct acpi_dmar_reserved_memory, header); 3896 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3897 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3898 ((void *)rmrr) + rmrr->header.length, 3899 rmrr->segment, rmrru->devices, 3900 rmrru->devices_cnt); 3901 if (ret < 0) 3902 return ret; 3903 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3904 dmar_remove_dev_scope(info, rmrr->segment, 3905 rmrru->devices, rmrru->devices_cnt); 3906 } 3907 } 3908 3909 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3910 if (atsru->include_all) 3911 continue; 3912 3913 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3914 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3915 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3916 (void *)atsr + atsr->header.length, 3917 atsr->segment, atsru->devices, 3918 atsru->devices_cnt); 3919 if (ret > 0) 3920 break; 3921 else if (ret < 0) 3922 return ret; 3923 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3924 if (dmar_remove_dev_scope(info, atsr->segment, 3925 atsru->devices, atsru->devices_cnt)) 3926 break; 3927 } 3928 } 3929 3930 return 0; 3931 } 3932 3933 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3934 unsigned long val, void *v) 3935 { 3936 struct memory_notify *mhp = v; 3937 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3938 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3939 mhp->nr_pages - 1); 3940 3941 switch (val) { 3942 case MEM_GOING_ONLINE: 3943 if (iommu_domain_identity_map(si_domain, 3944 start_vpfn, last_vpfn)) { 3945 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3946 start_vpfn, last_vpfn); 3947 return NOTIFY_BAD; 3948 } 3949 break; 3950 3951 case MEM_OFFLINE: 3952 case MEM_CANCEL_ONLINE: 3953 { 3954 struct dmar_drhd_unit *drhd; 3955 struct intel_iommu *iommu; 3956 struct page *freelist; 3957 3958 freelist = domain_unmap(si_domain, 3959 start_vpfn, last_vpfn, 3960 NULL); 3961 3962 rcu_read_lock(); 3963 for_each_active_iommu(iommu, drhd) 3964 iommu_flush_iotlb_psi(iommu, si_domain, 3965 start_vpfn, mhp->nr_pages, 3966 !freelist, 0); 3967 rcu_read_unlock(); 3968 dma_free_pagelist(freelist); 3969 } 3970 break; 3971 } 3972 3973 return NOTIFY_OK; 3974 } 3975 3976 static struct notifier_block intel_iommu_memory_nb = { 3977 .notifier_call = intel_iommu_memory_notifier, 3978 .priority = 0 3979 }; 3980 3981 static void free_all_cpu_cached_iovas(unsigned int cpu) 3982 { 3983 int i; 3984 3985 for (i = 0; i < g_num_of_iommus; i++) { 3986 struct intel_iommu *iommu = g_iommus[i]; 3987 struct dmar_domain *domain; 3988 int did; 3989 3990 if (!iommu) 3991 continue; 3992 3993 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 3994 domain = get_iommu_domain(iommu, (u16)did); 3995 3996 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 3997 continue; 3998 3999 iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); 4000 } 4001 } 4002 } 4003 4004 static int intel_iommu_cpu_dead(unsigned int cpu) 4005 { 4006 free_all_cpu_cached_iovas(cpu); 4007 return 0; 4008 } 4009 4010 static void intel_disable_iommus(void) 4011 { 4012 struct intel_iommu *iommu = NULL; 4013 struct dmar_drhd_unit *drhd; 4014 4015 for_each_iommu(iommu, drhd) 4016 iommu_disable_translation(iommu); 4017 } 4018 4019 void intel_iommu_shutdown(void) 4020 { 4021 struct 
dmar_drhd_unit *drhd; 4022 struct intel_iommu *iommu = NULL; 4023 4024 if (no_iommu || dmar_disabled) 4025 return; 4026 4027 down_write(&dmar_global_lock); 4028 4029 /* Disable PMRs explicitly here. */ 4030 for_each_iommu(iommu, drhd) 4031 iommu_disable_protect_mem_regions(iommu); 4032 4033 /* Make sure the IOMMUs are switched off */ 4034 intel_disable_iommus(); 4035 4036 up_write(&dmar_global_lock); 4037 } 4038 4039 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4040 { 4041 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4042 4043 return container_of(iommu_dev, struct intel_iommu, iommu); 4044 } 4045 4046 static ssize_t intel_iommu_show_version(struct device *dev, 4047 struct device_attribute *attr, 4048 char *buf) 4049 { 4050 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4051 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4052 return sprintf(buf, "%d:%d\n", 4053 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4054 } 4055 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4056 4057 static ssize_t intel_iommu_show_address(struct device *dev, 4058 struct device_attribute *attr, 4059 char *buf) 4060 { 4061 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4062 return sprintf(buf, "%llx\n", iommu->reg_phys); 4063 } 4064 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4065 4066 static ssize_t intel_iommu_show_cap(struct device *dev, 4067 struct device_attribute *attr, 4068 char *buf) 4069 { 4070 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4071 return sprintf(buf, "%llx\n", iommu->cap); 4072 } 4073 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4074 4075 static ssize_t intel_iommu_show_ecap(struct device *dev, 4076 struct device_attribute *attr, 4077 char *buf) 4078 { 4079 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4080 return sprintf(buf, "%llx\n", iommu->ecap); 4081 } 4082 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4083 4084 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4085 struct device_attribute *attr, 4086 char *buf) 4087 { 4088 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4089 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4090 } 4091 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4092 4093 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4094 struct device_attribute *attr, 4095 char *buf) 4096 { 4097 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4098 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4099 cap_ndoms(iommu->cap))); 4100 } 4101 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4102 4103 static struct attribute *intel_iommu_attrs[] = { 4104 &dev_attr_version.attr, 4105 &dev_attr_address.attr, 4106 &dev_attr_cap.attr, 4107 &dev_attr_ecap.attr, 4108 &dev_attr_domains_supported.attr, 4109 &dev_attr_domains_used.attr, 4110 NULL, 4111 }; 4112 4113 static struct attribute_group intel_iommu_group = { 4114 .name = "intel-iommu", 4115 .attrs = intel_iommu_attrs, 4116 }; 4117 4118 const struct attribute_group *intel_iommu_groups[] = { 4119 &intel_iommu_group, 4120 NULL, 4121 }; 4122 4123 static inline bool has_external_pci(void) 4124 { 4125 struct pci_dev *pdev = NULL; 4126 4127 for_each_pci_dev(pdev) 4128 if (pdev->external_facing) 4129 return true; 4130 4131 return false; 4132 } 4133 4134 static int __init platform_optin_force_iommu(void) 4135 { 4136 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4137 return 0; 
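	/*
	 * The platform opted in and an external-facing PCI device is
	 * present, so the IOMMU is forced on even if it was disabled on
	 * the kernel command line.
	 */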
4138 4139 if (no_iommu || dmar_disabled) 4140 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4141 4142 /* 4143 * If Intel-IOMMU is disabled by default, we will apply identity 4144 * map for all devices except those marked as being untrusted. 4145 */ 4146 if (dmar_disabled) 4147 iommu_set_default_passthrough(false); 4148 4149 dmar_disabled = 0; 4150 no_iommu = 0; 4151 4152 return 1; 4153 } 4154 4155 static int __init probe_acpi_namespace_devices(void) 4156 { 4157 struct dmar_drhd_unit *drhd; 4158 /* To avoid a -Wunused-but-set-variable warning. */ 4159 struct intel_iommu *iommu __maybe_unused; 4160 struct device *dev; 4161 int i, ret = 0; 4162 4163 for_each_active_iommu(iommu, drhd) { 4164 for_each_active_dev_scope(drhd->devices, 4165 drhd->devices_cnt, i, dev) { 4166 struct acpi_device_physical_node *pn; 4167 struct iommu_group *group; 4168 struct acpi_device *adev; 4169 4170 if (dev->bus != &acpi_bus_type) 4171 continue; 4172 4173 adev = to_acpi_device(dev); 4174 mutex_lock(&adev->physical_node_lock); 4175 list_for_each_entry(pn, 4176 &adev->physical_node_list, node) { 4177 group = iommu_group_get(pn->dev); 4178 if (group) { 4179 iommu_group_put(group); 4180 continue; 4181 } 4182 4183 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4184 ret = iommu_probe_device(pn->dev); 4185 if (ret) 4186 break; 4187 } 4188 mutex_unlock(&adev->physical_node_lock); 4189 4190 if (ret) 4191 return ret; 4192 } 4193 } 4194 4195 return 0; 4196 } 4197 4198 int __init intel_iommu_init(void) 4199 { 4200 int ret = -ENODEV; 4201 struct dmar_drhd_unit *drhd; 4202 struct intel_iommu *iommu; 4203 4204 /* 4205 * Intel IOMMU is required for a TXT/tboot launch or platform 4206 * opt in, so enforce that. 4207 */ 4208 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4209 platform_optin_force_iommu(); 4210 4211 if (iommu_init_mempool()) { 4212 if (force_on) 4213 panic("tboot: Failed to initialize iommu memory\n"); 4214 return -ENOMEM; 4215 } 4216 4217 down_write(&dmar_global_lock); 4218 if (dmar_table_init()) { 4219 if (force_on) 4220 panic("tboot: Failed to initialize DMAR table\n"); 4221 goto out_free_dmar; 4222 } 4223 4224 if (dmar_dev_scope_init() < 0) { 4225 if (force_on) 4226 panic("tboot: Failed to initialize DMAR device scope\n"); 4227 goto out_free_dmar; 4228 } 4229 4230 up_write(&dmar_global_lock); 4231 4232 /* 4233 * The bus notifier takes the dmar_global_lock, so lockdep will 4234 * complain later when we register it under the lock. 4235 */ 4236 dmar_register_bus_notifier(); 4237 4238 down_write(&dmar_global_lock); 4239 4240 if (!no_iommu) 4241 intel_iommu_debugfs_init(); 4242 4243 if (no_iommu || dmar_disabled) { 4244 /* 4245 * We exit the function here to ensure IOMMU's remapping and 4246 * mempool aren't setup, which means that the IOMMU's PMRs 4247 * won't be disabled via the call to init_dmars(). So disable 4248 * it explicitly here. The PMRs were setup by tboot prior to 4249 * calling SENTER, but the kernel is expected to reset/tear 4250 * down the PMRs. 
4251 */ 4252 if (intel_iommu_tboot_noforce) { 4253 for_each_iommu(iommu, drhd) 4254 iommu_disable_protect_mem_regions(iommu); 4255 } 4256 4257 /* 4258 * Make sure the IOMMUs are switched off, even when we 4259 * boot into a kexec kernel and the previous kernel left 4260 * them enabled 4261 */ 4262 intel_disable_iommus(); 4263 goto out_free_dmar; 4264 } 4265 4266 if (list_empty(&dmar_rmrr_units)) 4267 pr_info("No RMRR found\n"); 4268 4269 if (list_empty(&dmar_atsr_units)) 4270 pr_info("No ATSR found\n"); 4271 4272 if (dmar_map_gfx) 4273 intel_iommu_gfx_mapped = 1; 4274 4275 init_no_remapping_devices(); 4276 4277 ret = init_dmars(); 4278 if (ret) { 4279 if (force_on) 4280 panic("tboot: Failed to initialize DMARs\n"); 4281 pr_err("Initialization failed\n"); 4282 goto out_free_dmar; 4283 } 4284 up_write(&dmar_global_lock); 4285 4286 init_iommu_pm_ops(); 4287 4288 down_read(&dmar_global_lock); 4289 for_each_active_iommu(iommu, drhd) { 4290 iommu_device_sysfs_add(&iommu->iommu, NULL, 4291 intel_iommu_groups, 4292 "%s", iommu->name); 4293 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4294 iommu_device_register(&iommu->iommu); 4295 } 4296 up_read(&dmar_global_lock); 4297 4298 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4299 if (si_domain && !hw_pass_through) 4300 register_memory_notifier(&intel_iommu_memory_nb); 4301 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4302 intel_iommu_cpu_dead); 4303 4304 down_read(&dmar_global_lock); 4305 if (probe_acpi_namespace_devices()) 4306 pr_warn("ACPI name space devices didn't probe correctly\n"); 4307 4308 /* Finally, we enable the DMA remapping hardware. */ 4309 for_each_iommu(iommu, drhd) { 4310 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4311 iommu_enable_translation(iommu); 4312 4313 iommu_disable_protect_mem_regions(iommu); 4314 } 4315 up_read(&dmar_global_lock); 4316 4317 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4318 4319 intel_iommu_enabled = 1; 4320 4321 return 0; 4322 4323 out_free_dmar: 4324 intel_iommu_free_dmars(); 4325 up_write(&dmar_global_lock); 4326 iommu_exit_mempool(); 4327 return ret; 4328 } 4329 4330 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4331 { 4332 struct intel_iommu *iommu = opaque; 4333 4334 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 4335 return 0; 4336 } 4337 4338 /* 4339 * NB - intel-iommu lacks any sort of reference counting for the users of 4340 * dependent devices. If multiple endpoints have intersecting dependent 4341 * devices, unbinding the driver from any one of them will possibly leave 4342 * the others unable to operate. 
4343 */ 4344 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 4345 { 4346 if (!iommu || !dev || !dev_is_pci(dev)) 4347 return; 4348 4349 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 4350 } 4351 4352 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4353 { 4354 struct dmar_domain *domain; 4355 struct intel_iommu *iommu; 4356 unsigned long flags; 4357 4358 assert_spin_locked(&device_domain_lock); 4359 4360 if (WARN_ON(!info)) 4361 return; 4362 4363 iommu = info->iommu; 4364 domain = info->domain; 4365 4366 if (info->dev) { 4367 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4368 intel_pasid_tear_down_entry(iommu, info->dev, 4369 PASID_RID2PASID, false); 4370 4371 iommu_disable_dev_iotlb(info); 4372 if (!dev_is_real_dma_subdevice(info->dev)) 4373 domain_context_clear(iommu, info->dev); 4374 intel_pasid_free_table(info->dev); 4375 } 4376 4377 unlink_domain_info(info); 4378 4379 spin_lock_irqsave(&iommu->lock, flags); 4380 domain_detach_iommu(domain, iommu); 4381 spin_unlock_irqrestore(&iommu->lock, flags); 4382 4383 free_devinfo_mem(info); 4384 } 4385 4386 static void dmar_remove_one_dev_info(struct device *dev) 4387 { 4388 struct device_domain_info *info; 4389 unsigned long flags; 4390 4391 spin_lock_irqsave(&device_domain_lock, flags); 4392 info = get_domain_info(dev); 4393 if (info) 4394 __dmar_remove_one_dev_info(info); 4395 spin_unlock_irqrestore(&device_domain_lock, flags); 4396 } 4397 4398 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4399 { 4400 int adjust_width; 4401 4402 /* calculate AGAW */ 4403 domain->gaw = guest_width; 4404 adjust_width = guestwidth_to_adjustwidth(guest_width); 4405 domain->agaw = width_to_agaw(adjust_width); 4406 4407 domain->iommu_coherency = 0; 4408 domain->iommu_snooping = 0; 4409 domain->iommu_superpage = 0; 4410 domain->max_addr = 0; 4411 4412 /* always allocate the top pgd */ 4413 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 4414 if (!domain->pgd) 4415 return -ENOMEM; 4416 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4417 return 0; 4418 } 4419 4420 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4421 { 4422 struct dmar_domain *dmar_domain; 4423 struct iommu_domain *domain; 4424 4425 switch (type) { 4426 case IOMMU_DOMAIN_DMA: 4427 case IOMMU_DOMAIN_UNMANAGED: 4428 dmar_domain = alloc_domain(0); 4429 if (!dmar_domain) { 4430 pr_err("Can't allocate dmar_domain\n"); 4431 return NULL; 4432 } 4433 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4434 pr_err("Domain initialization failed\n"); 4435 domain_exit(dmar_domain); 4436 return NULL; 4437 } 4438 4439 if (type == IOMMU_DOMAIN_DMA && 4440 iommu_get_dma_cookie(&dmar_domain->domain)) 4441 return NULL; 4442 4443 domain = &dmar_domain->domain; 4444 domain->geometry.aperture_start = 0; 4445 domain->geometry.aperture_end = 4446 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4447 domain->geometry.force_aperture = true; 4448 4449 return domain; 4450 case IOMMU_DOMAIN_IDENTITY: 4451 return &si_domain->domain; 4452 default: 4453 return NULL; 4454 } 4455 4456 return NULL; 4457 } 4458 4459 static void intel_iommu_domain_free(struct iommu_domain *domain) 4460 { 4461 if (domain != &si_domain->domain) 4462 domain_exit(to_dmar_domain(domain)); 4463 } 4464 4465 /* 4466 * Check whether a @domain could be attached to the @dev through the 4467 * aux-domain attach/detach APIs. 
4468 */ 4469 static inline bool 4470 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4471 { 4472 struct device_domain_info *info = get_domain_info(dev); 4473 4474 return info && info->auxd_enabled && 4475 domain->type == IOMMU_DOMAIN_UNMANAGED; 4476 } 4477 4478 static void auxiliary_link_device(struct dmar_domain *domain, 4479 struct device *dev) 4480 { 4481 struct device_domain_info *info = get_domain_info(dev); 4482 4483 assert_spin_locked(&device_domain_lock); 4484 if (WARN_ON(!info)) 4485 return; 4486 4487 domain->auxd_refcnt++; 4488 list_add(&domain->auxd, &info->auxiliary_domains); 4489 } 4490 4491 static void auxiliary_unlink_device(struct dmar_domain *domain, 4492 struct device *dev) 4493 { 4494 struct device_domain_info *info = get_domain_info(dev); 4495 4496 assert_spin_locked(&device_domain_lock); 4497 if (WARN_ON(!info)) 4498 return; 4499 4500 list_del(&domain->auxd); 4501 domain->auxd_refcnt--; 4502 4503 if (!domain->auxd_refcnt && domain->default_pasid > 0) 4504 ioasid_put(domain->default_pasid); 4505 } 4506 4507 static int aux_domain_add_dev(struct dmar_domain *domain, 4508 struct device *dev) 4509 { 4510 int ret; 4511 unsigned long flags; 4512 struct intel_iommu *iommu; 4513 4514 iommu = device_to_iommu(dev, NULL, NULL); 4515 if (!iommu) 4516 return -ENODEV; 4517 4518 if (domain->default_pasid <= 0) { 4519 u32 pasid; 4520 4521 /* No private data needed for the default pasid */ 4522 pasid = ioasid_alloc(NULL, PASID_MIN, 4523 pci_max_pasids(to_pci_dev(dev)) - 1, 4524 NULL); 4525 if (pasid == INVALID_IOASID) { 4526 pr_err("Can't allocate default pasid\n"); 4527 return -ENODEV; 4528 } 4529 domain->default_pasid = pasid; 4530 } 4531 4532 spin_lock_irqsave(&device_domain_lock, flags); 4533 /* 4534 * iommu->lock must be held to attach domain to iommu and setup the 4535 * pasid entry for second level translation. 
4536 */ 4537 spin_lock(&iommu->lock); 4538 ret = domain_attach_iommu(domain, iommu); 4539 if (ret) 4540 goto attach_failed; 4541 4542 /* Setup the PASID entry for mediated devices: */ 4543 if (domain_use_first_level(domain)) 4544 ret = domain_setup_first_level(iommu, domain, dev, 4545 domain->default_pasid); 4546 else 4547 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4548 domain->default_pasid); 4549 if (ret) 4550 goto table_failed; 4551 spin_unlock(&iommu->lock); 4552 4553 auxiliary_link_device(domain, dev); 4554 4555 spin_unlock_irqrestore(&device_domain_lock, flags); 4556 4557 return 0; 4558 4559 table_failed: 4560 domain_detach_iommu(domain, iommu); 4561 attach_failed: 4562 spin_unlock(&iommu->lock); 4563 spin_unlock_irqrestore(&device_domain_lock, flags); 4564 if (!domain->auxd_refcnt && domain->default_pasid > 0) 4565 ioasid_put(domain->default_pasid); 4566 4567 return ret; 4568 } 4569 4570 static void aux_domain_remove_dev(struct dmar_domain *domain, 4571 struct device *dev) 4572 { 4573 struct device_domain_info *info; 4574 struct intel_iommu *iommu; 4575 unsigned long flags; 4576 4577 if (!is_aux_domain(dev, &domain->domain)) 4578 return; 4579 4580 spin_lock_irqsave(&device_domain_lock, flags); 4581 info = get_domain_info(dev); 4582 iommu = info->iommu; 4583 4584 auxiliary_unlink_device(domain, dev); 4585 4586 spin_lock(&iommu->lock); 4587 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); 4588 domain_detach_iommu(domain, iommu); 4589 spin_unlock(&iommu->lock); 4590 4591 spin_unlock_irqrestore(&device_domain_lock, flags); 4592 } 4593 4594 static int prepare_domain_attach_device(struct iommu_domain *domain, 4595 struct device *dev) 4596 { 4597 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4598 struct intel_iommu *iommu; 4599 int addr_width; 4600 4601 iommu = device_to_iommu(dev, NULL, NULL); 4602 if (!iommu) 4603 return -ENODEV; 4604 4605 /* check if this iommu agaw is sufficient for max mapped address */ 4606 addr_width = agaw_to_width(iommu->agaw); 4607 if (addr_width > cap_mgaw(iommu->cap)) 4608 addr_width = cap_mgaw(iommu->cap); 4609 4610 if (dmar_domain->max_addr > (1LL << addr_width)) { 4611 dev_err(dev, "%s: iommu width (%d) is not " 4612 "sufficient for the mapped address (%llx)\n", 4613 __func__, addr_width, dmar_domain->max_addr); 4614 return -EFAULT; 4615 } 4616 dmar_domain->gaw = addr_width; 4617 4618 /* 4619 * Knock out extra levels of page tables if necessary 4620 */ 4621 while (iommu->agaw < dmar_domain->agaw) { 4622 struct dma_pte *pte; 4623 4624 pte = dmar_domain->pgd; 4625 if (dma_pte_present(pte)) { 4626 dmar_domain->pgd = (struct dma_pte *) 4627 phys_to_virt(dma_pte_addr(pte)); 4628 free_pgtable_page(pte); 4629 } 4630 dmar_domain->agaw--; 4631 } 4632 4633 return 0; 4634 } 4635 4636 static int intel_iommu_attach_device(struct iommu_domain *domain, 4637 struct device *dev) 4638 { 4639 int ret; 4640 4641 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4642 device_is_rmrr_locked(dev)) { 4643 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4644 return -EPERM; 4645 } 4646 4647 if (is_aux_domain(dev, domain)) 4648 return -EPERM; 4649 4650 /* normally dev is not mapped */ 4651 if (unlikely(domain_context_mapped(dev))) { 4652 struct dmar_domain *old_domain; 4653 4654 old_domain = find_domain(dev); 4655 if (old_domain) 4656 dmar_remove_one_dev_info(dev); 4657 } 4658 4659 ret = prepare_domain_attach_device(domain, dev); 4660 if (ret) 4661 return ret; 4662 4663 return domain_add_dev_info(to_dmar_domain(domain), dev); 4664 } 4665
4666 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4667 struct device *dev) 4668 { 4669 int ret; 4670 4671 if (!is_aux_domain(dev, domain)) 4672 return -EPERM; 4673 4674 ret = prepare_domain_attach_device(domain, dev); 4675 if (ret) 4676 return ret; 4677 4678 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4679 } 4680
4681 static void intel_iommu_detach_device(struct iommu_domain *domain, 4682 struct device *dev) 4683 { 4684 dmar_remove_one_dev_info(dev); 4685 } 4686 4687 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4688 struct device *dev) 4689 { 4690 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4691 } 4692
4693 #ifdef CONFIG_INTEL_IOMMU_SVM 4694 /* 4695 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4696 * VT-d granularity. Invalidation is typically included in the unmap operation 4697 * as a result of DMA or VFIO unmap. However, for assigned devices the guest 4698 * owns the first level page tables. Invalidations of translation caches in the 4699 * guest are trapped and passed down to the host. 4700 * 4701 * vIOMMU in the guest will only expose first level page tables, therefore 4702 * we do not support IOTLB granularity for requests without a PASID (second level). 4703 * 4704 * For example, to find the VT-d granularity encoding for IOTLB 4705 * type and page selective granularity within PASID: 4706 * X: indexed by iommu cache type 4707 * Y: indexed by enum iommu_inv_granularity 4708 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4709 */ 4710
4711 static const int 4712 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4713 /* 4714 * PASID based IOTLB invalidation: PASID selective (per PASID), 4715 * page selective (address granularity) 4716 */ 4717 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4718 /* PASID based dev TLBs */ 4719 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4720 /* PASID cache */ 4721 {-EINVAL, -EINVAL, -EINVAL} 4722 }; 4723
4724 static inline int to_vtd_granularity(int type, int granu) 4725 { 4726 return inv_type_granu_table[type][granu]; 4727 } 4728 4729 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4730 { 4731 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 4732 4733 /* VT-d size is encoded as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc. 4734 * The IOMMU cache invalidate API passes granu_size in bytes and the number 4735 * of granules of that size in contiguous memory.
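 *
 * For example (hypothetical values): a 2MB granule, i.e. granu_size =
 * 0x200000, with nr_granules = 1 gives nr_pages = 512, and
 * order_base_2(512) = 9, which is the VT-d encoding for a 2MB range.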
4736 */ 4737 return order_base_2(nr_pages); 4738 } 4739 4740 static int 4741 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 4742 struct iommu_cache_invalidate_info *inv_info) 4743 { 4744 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4745 struct device_domain_info *info; 4746 struct intel_iommu *iommu; 4747 unsigned long flags; 4748 int cache_type; 4749 u8 bus, devfn; 4750 u16 did, sid; 4751 int ret = 0; 4752 u64 size = 0; 4753 4754 if (!inv_info || !dmar_domain) 4755 return -EINVAL; 4756 4757 if (!dev || !dev_is_pci(dev)) 4758 return -ENODEV; 4759 4760 iommu = device_to_iommu(dev, &bus, &devfn); 4761 if (!iommu) 4762 return -ENODEV; 4763 4764 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 4765 return -EINVAL; 4766 4767 spin_lock_irqsave(&device_domain_lock, flags); 4768 spin_lock(&iommu->lock); 4769 info = get_domain_info(dev); 4770 if (!info) { 4771 ret = -EINVAL; 4772 goto out_unlock; 4773 } 4774 did = dmar_domain->iommu_did[iommu->seq_id]; 4775 sid = PCI_DEVID(bus, devfn); 4776 4777 /* Size is only valid in address selective invalidation */ 4778 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 4779 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 4780 inv_info->granu.addr_info.nb_granules); 4781 4782 for_each_set_bit(cache_type, 4783 (unsigned long *)&inv_info->cache, 4784 IOMMU_CACHE_INV_TYPE_NR) { 4785 int granu = 0; 4786 u64 pasid = 0; 4787 u64 addr = 0; 4788 4789 granu = to_vtd_granularity(cache_type, inv_info->granularity); 4790 if (granu == -EINVAL) { 4791 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 4792 cache_type, inv_info->granularity); 4793 break; 4794 } 4795 4796 /* 4797 * PASID is stored in different locations based on the 4798 * granularity. 4799 */ 4800 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 4801 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 4802 pasid = inv_info->granu.pasid_info.pasid; 4803 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4804 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 4805 pasid = inv_info->granu.addr_info.pasid; 4806 4807 switch (BIT(cache_type)) { 4808 case IOMMU_CACHE_INV_TYPE_IOTLB: 4809 /* HW will ignore LSB bits based on address mask */ 4810 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4811 size && 4812 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 4813 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 4814 inv_info->granu.addr_info.addr, size); 4815 } 4816 4817 /* 4818 * If granu is PASID-selective, address is ignored. 4819 * We use npages = -1 to indicate that. 4820 */ 4821 qi_flush_piotlb(iommu, did, pasid, 4822 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 4823 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 4824 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 4825 4826 if (!info->ats_enabled) 4827 break; 4828 /* 4829 * Always flush device IOTLB if ATS is enabled. vIOMMU 4830 * in the guest may assume IOTLB flush is inclusive, 4831 * which is more efficient. 4832 */ 4833 fallthrough; 4834 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 4835 /* 4836 * PASID based device TLB invalidation does not support 4837 * IOMMU_INV_GRANU_PASID granularity but only supports 4838 * IOMMU_INV_GRANU_ADDR. 4839 * The equivalent of that is we set the size to be the 4840 * entire range of 64 bit. User only provides PASID info 4841 * without address info. So we set addr to 0. 
4842 */ 4843 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 4844 size = 64 - VTD_PAGE_SHIFT; 4845 addr = 0; 4846 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 4847 addr = inv_info->granu.addr_info.addr; 4848 } 4849 4850 if (info->ats_enabled) 4851 qi_flush_dev_iotlb_pasid(iommu, sid, 4852 info->pfsid, pasid, 4853 info->ats_qdep, addr, 4854 size); 4855 else 4856 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 4857 break; 4858 default: 4859 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 4860 cache_type); 4861 ret = -EINVAL; 4862 } 4863 } 4864 out_unlock: 4865 spin_unlock(&iommu->lock); 4866 spin_unlock_irqrestore(&device_domain_lock, flags); 4867 4868 return ret; 4869 } 4870 #endif 4871 4872 static int intel_iommu_map(struct iommu_domain *domain, 4873 unsigned long iova, phys_addr_t hpa, 4874 size_t size, int iommu_prot, gfp_t gfp) 4875 { 4876 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4877 u64 max_addr; 4878 int prot = 0; 4879 int ret; 4880 4881 if (iommu_prot & IOMMU_READ) 4882 prot |= DMA_PTE_READ; 4883 if (iommu_prot & IOMMU_WRITE) 4884 prot |= DMA_PTE_WRITE; 4885 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 4886 prot |= DMA_PTE_SNP; 4887 4888 max_addr = iova + size; 4889 if (dmar_domain->max_addr < max_addr) { 4890 u64 end; 4891 4892 /* check if minimum agaw is sufficient for mapped address */ 4893 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4894 if (end < max_addr) { 4895 pr_err("%s: iommu width (%d) is not " 4896 "sufficient for the mapped address (%llx)\n", 4897 __func__, dmar_domain->gaw, max_addr); 4898 return -EFAULT; 4899 } 4900 dmar_domain->max_addr = max_addr; 4901 } 4902 /* Round up size to next multiple of PAGE_SIZE, if it and 4903 the low bits of hpa would take us onto the next page */ 4904 size = aligned_nrpages(hpa, size); 4905 ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4906 hpa >> VTD_PAGE_SHIFT, size, prot); 4907 return ret; 4908 } 4909 4910 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4911 unsigned long iova, size_t size, 4912 struct iommu_iotlb_gather *gather) 4913 { 4914 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4915 unsigned long start_pfn, last_pfn; 4916 int level = 0; 4917 4918 /* Cope with horrid API which requires us to unmap more than the 4919 size argument if it happens to be a large-page mapping. 
*/ 4920 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4921 4922 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4923 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4924 4925 start_pfn = iova >> VTD_PAGE_SHIFT; 4926 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4927 4928 gather->freelist = domain_unmap(dmar_domain, start_pfn, 4929 last_pfn, gather->freelist); 4930 4931 if (dmar_domain->max_addr == iova + size) 4932 dmar_domain->max_addr = iova; 4933 4934 iommu_iotlb_gather_add_page(domain, gather, iova, size); 4935 4936 return size; 4937 } 4938 4939 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 4940 struct iommu_iotlb_gather *gather) 4941 { 4942 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4943 unsigned long iova_pfn = IOVA_PFN(gather->start); 4944 size_t size = gather->end - gather->start; 4945 unsigned long start_pfn; 4946 unsigned long nrpages; 4947 int iommu_id; 4948 4949 nrpages = aligned_nrpages(gather->start, size); 4950 start_pfn = mm_to_dma_pfn(iova_pfn); 4951 4952 for_each_domain_iommu(iommu_id, dmar_domain) 4953 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 4954 start_pfn, nrpages, !gather->freelist, 0); 4955 4956 dma_free_pagelist(gather->freelist); 4957 } 4958 4959 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 4960 dma_addr_t iova) 4961 { 4962 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4963 struct dma_pte *pte; 4964 int level = 0; 4965 u64 phys = 0; 4966 4967 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 4968 if (pte && dma_pte_present(pte)) 4969 phys = dma_pte_addr(pte) + 4970 (iova & (BIT_MASK(level_to_offset_bits(level) + 4971 VTD_PAGE_SHIFT) - 1)); 4972 4973 return phys; 4974 } 4975 4976 static inline bool scalable_mode_support(void) 4977 { 4978 struct dmar_drhd_unit *drhd; 4979 struct intel_iommu *iommu; 4980 bool ret = true; 4981 4982 rcu_read_lock(); 4983 for_each_active_iommu(iommu, drhd) { 4984 if (!sm_supported(iommu)) { 4985 ret = false; 4986 break; 4987 } 4988 } 4989 rcu_read_unlock(); 4990 4991 return ret; 4992 } 4993 4994 static inline bool iommu_pasid_support(void) 4995 { 4996 struct dmar_drhd_unit *drhd; 4997 struct intel_iommu *iommu; 4998 bool ret = true; 4999 5000 rcu_read_lock(); 5001 for_each_active_iommu(iommu, drhd) { 5002 if (!pasid_supported(iommu)) { 5003 ret = false; 5004 break; 5005 } 5006 } 5007 rcu_read_unlock(); 5008 5009 return ret; 5010 } 5011 5012 static inline bool nested_mode_support(void) 5013 { 5014 struct dmar_drhd_unit *drhd; 5015 struct intel_iommu *iommu; 5016 bool ret = true; 5017 5018 rcu_read_lock(); 5019 for_each_active_iommu(iommu, drhd) { 5020 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5021 ret = false; 5022 break; 5023 } 5024 } 5025 rcu_read_unlock(); 5026 5027 return ret; 5028 } 5029 5030 static bool intel_iommu_capable(enum iommu_cap cap) 5031 { 5032 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5033 return domain_update_iommu_snooping(NULL) == 1; 5034 if (cap == IOMMU_CAP_INTR_REMAP) 5035 return irq_remapping_enabled == 1; 5036 5037 return false; 5038 } 5039 5040 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5041 { 5042 struct intel_iommu *iommu; 5043 5044 iommu = device_to_iommu(dev, NULL, NULL); 5045 if (!iommu) 5046 return ERR_PTR(-ENODEV); 5047 5048 if (translation_pre_enabled(iommu)) 5049 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5050 5051 return &iommu->iommu; 5052 } 5053 5054 static void intel_iommu_release_device(struct device *dev) 5055 { 
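	/*
	 * Roughly the inverse of intel_iommu_probe_device() and
	 * probe_finalize() above: once the core is done with the device,
	 * drop its device_domain_info (and with it the PASID table and
	 * context entries) and clear any per-device DMA ops that
	 * probe_finalize() may have installed.
	 */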
5056 struct intel_iommu *iommu; 5057 5058 iommu = device_to_iommu(dev, NULL, NULL); 5059 if (!iommu) 5060 return; 5061 5062 dmar_remove_one_dev_info(dev); 5063 5064 set_dma_ops(dev, NULL); 5065 } 5066
5067 static void intel_iommu_probe_finalize(struct device *dev) 5068 { 5069 dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT; 5070 struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 5071 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5072 5073 if (domain && domain->type == IOMMU_DOMAIN_DMA) 5074 iommu_setup_dma_ops(dev, base, 5075 __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base); 5076 else 5077 set_dma_ops(dev, NULL); 5078 } 5079
5080 static void intel_iommu_get_resv_regions(struct device *device, 5081 struct list_head *head) 5082 { 5083 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5084 struct iommu_resv_region *reg; 5085 struct dmar_rmrr_unit *rmrr; 5086 struct device *i_dev; 5087 int i; 5088 5089 down_read(&dmar_global_lock); 5090 for_each_rmrr_units(rmrr) { 5091 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5092 i, i_dev) { 5093 struct iommu_resv_region *resv; 5094 enum iommu_resv_type type; 5095 size_t length; 5096 5097 if (i_dev != device && 5098 !is_downstream_to_pci_bridge(device, i_dev)) 5099 continue; 5100 5101 length = rmrr->end_address - rmrr->base_address + 1; 5102 5103 type = device_rmrr_is_relaxable(device) ? 5104 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5105 5106 resv = iommu_alloc_resv_region(rmrr->base_address, 5107 length, prot, type); 5108 if (!resv) 5109 break; 5110 5111 list_add_tail(&resv->list, head); 5112 } 5113 } 5114 up_read(&dmar_global_lock); 5115
5116 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5117 if (dev_is_pci(device)) { 5118 struct pci_dev *pdev = to_pci_dev(device); 5119 5120 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5121 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5122 IOMMU_RESV_DIRECT_RELAXABLE); 5123 if (reg) 5124 list_add_tail(&reg->list, head); 5125 } 5126 } 5127 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5128
5129 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5130 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5131 0, IOMMU_RESV_MSI); 5132 if (!reg) 5133 return; 5134 list_add_tail(&reg->list, head); 5135 } 5136
5137 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5138 { 5139 struct device_domain_info *info; 5140 struct context_entry *context; 5141 struct dmar_domain *domain; 5142 unsigned long flags; 5143 u64 ctx_lo; 5144 int ret; 5145 5146 domain = find_domain(dev); 5147 if (!domain) 5148 return -EINVAL; 5149 5150 spin_lock_irqsave(&device_domain_lock, flags); 5151 spin_lock(&iommu->lock); 5152 5153 ret = -EINVAL; 5154 info = get_domain_info(dev); 5155 if (!info || !info->pasid_supported) 5156 goto out; 5157 5158 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5159 if (WARN_ON(!context)) 5160 goto out; 5161 5162 ctx_lo = context[0].lo; 5163 5164 if (!(ctx_lo & CONTEXT_PASIDE)) { 5165 ctx_lo |= CONTEXT_PASIDE; 5166 context[0].lo = ctx_lo; 5167 wmb(); 5168 iommu->flush.flush_context(iommu, 5169 domain->iommu_did[iommu->seq_id], 5170 PCI_DEVID(info->bus, info->devfn), 5171 DMA_CCMD_MASK_NOBIT, 5172 DMA_CCMD_DEVICE_INVL); 5173 } 5174 5175 /* Enable PASID support in the device, if it wasn't already */ 5176 if (!info->pasid_enabled) 5177 iommu_enable_dev_iotlb(info); 5178 5179 ret = 0; 5180 5181 out: 5182 spin_unlock(&iommu->lock); 5183 spin_unlock_irqrestore(&device_domain_lock, flags); 5184 5185 return ret; 5186 } 5187 5188 static struct iommu_group
*intel_iommu_device_group(struct device *dev) 5189 { 5190 if (dev_is_pci(dev)) 5191 return pci_device_group(dev); 5192 return generic_device_group(dev); 5193 } 5194 5195 static int intel_iommu_enable_auxd(struct device *dev) 5196 { 5197 struct device_domain_info *info; 5198 struct intel_iommu *iommu; 5199 unsigned long flags; 5200 int ret; 5201 5202 iommu = device_to_iommu(dev, NULL, NULL); 5203 if (!iommu || dmar_disabled) 5204 return -EINVAL; 5205 5206 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5207 return -EINVAL; 5208 5209 ret = intel_iommu_enable_pasid(iommu, dev); 5210 if (ret) 5211 return -ENODEV; 5212 5213 spin_lock_irqsave(&device_domain_lock, flags); 5214 info = get_domain_info(dev); 5215 info->auxd_enabled = 1; 5216 spin_unlock_irqrestore(&device_domain_lock, flags); 5217 5218 return 0; 5219 } 5220 5221 static int intel_iommu_disable_auxd(struct device *dev) 5222 { 5223 struct device_domain_info *info; 5224 unsigned long flags; 5225 5226 spin_lock_irqsave(&device_domain_lock, flags); 5227 info = get_domain_info(dev); 5228 if (!WARN_ON(!info)) 5229 info->auxd_enabled = 0; 5230 spin_unlock_irqrestore(&device_domain_lock, flags); 5231 5232 return 0; 5233 } 5234 5235 /* 5236 * A PCI express designated vendor specific extended capability is defined 5237 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5238 * for system software and tools to detect endpoint devices supporting the 5239 * Intel scalable IO virtualization without host driver dependency. 5240 * 5241 * Returns the address of the matching extended capability structure within 5242 * the device's PCI configuration space or 0 if the device does not support 5243 * it. 5244 */ 5245 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5246 { 5247 int pos; 5248 u16 vendor, id; 5249 5250 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5251 while (pos) { 5252 pci_read_config_word(pdev, pos + 4, &vendor); 5253 pci_read_config_word(pdev, pos + 8, &id); 5254 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5255 return pos; 5256 5257 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5258 } 5259 5260 return 0; 5261 } 5262 5263 static bool 5264 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5265 { 5266 if (feat == IOMMU_DEV_FEAT_AUX) { 5267 int ret; 5268 5269 if (!dev_is_pci(dev) || dmar_disabled || 5270 !scalable_mode_support() || !iommu_pasid_support()) 5271 return false; 5272 5273 ret = pci_pasid_features(to_pci_dev(dev)); 5274 if (ret < 0) 5275 return false; 5276 5277 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5278 } 5279 5280 if (feat == IOMMU_DEV_FEAT_SVA) { 5281 struct device_domain_info *info = get_domain_info(dev); 5282 5283 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5284 info->pasid_supported && info->pri_supported && 5285 info->ats_supported; 5286 } 5287 5288 return false; 5289 } 5290 5291 static int 5292 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5293 { 5294 if (feat == IOMMU_DEV_FEAT_AUX) 5295 return intel_iommu_enable_auxd(dev); 5296 5297 if (feat == IOMMU_DEV_FEAT_SVA) { 5298 struct device_domain_info *info = get_domain_info(dev); 5299 5300 if (!info) 5301 return -EINVAL; 5302 5303 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5304 return 0; 5305 } 5306 5307 return -ENODEV; 5308 } 5309 5310 static int 5311 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5312 { 5313 if (feat == IOMMU_DEV_FEAT_AUX) 5314 return intel_iommu_disable_auxd(dev); 5315 5316 return -ENODEV; 5317 } 
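
/*
 * For reference, a rough sketch of how a caller (an mdev parent driver,
 * for instance) would be expected to consume the AUX-domain feature
 * implemented above through the generic IOMMU API. This is an
 * illustration only: error handling is elided and the exact wrapper
 * names follow the core IOMMU API of this kernel generation.
 *
 *	struct iommu_domain *dom;
 *	int pasid;
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return;		// scalable mode / PASID not usable
 *
 *	dom = iommu_domain_alloc(dev->bus);	// IOMMU_DOMAIN_UNMANAGED
 *	if (dom && !iommu_aux_attach_device(dom, dev)) {
 *		pasid = iommu_aux_get_pasid(dom, dev);
 *		// tag DMA from the subdevice with 'pasid'
 *	}
 *
 * intel_iommu_aux_attach_device() then allocates the domain's default
 * PASID and installs the matching PASID-table entry in
 * aux_domain_add_dev() above.
 */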
5318 5319 static bool 5320 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5321 { 5322 struct device_domain_info *info = get_domain_info(dev); 5323 5324 if (feat == IOMMU_DEV_FEAT_AUX) 5325 return scalable_mode_support() && info && info->auxd_enabled; 5326 5327 return false; 5328 } 5329 5330 static int 5331 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5332 { 5333 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5334 5335 return dmar_domain->default_pasid > 0 ? 5336 dmar_domain->default_pasid : -EINVAL; 5337 } 5338 5339 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5340 struct device *dev) 5341 { 5342 return attach_deferred(dev); 5343 } 5344 5345 static int 5346 intel_iommu_domain_set_attr(struct iommu_domain *domain, 5347 enum iommu_attr attr, void *data) 5348 { 5349 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5350 unsigned long flags; 5351 int ret = 0; 5352 5353 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 5354 return -EINVAL; 5355 5356 switch (attr) { 5357 case DOMAIN_ATTR_NESTING: 5358 spin_lock_irqsave(&device_domain_lock, flags); 5359 if (nested_mode_support() && 5360 list_empty(&dmar_domain->devices)) { 5361 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5362 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5363 } else { 5364 ret = -ENODEV; 5365 } 5366 spin_unlock_irqrestore(&device_domain_lock, flags); 5367 break; 5368 default: 5369 ret = -EINVAL; 5370 break; 5371 } 5372 5373 return ret; 5374 } 5375 5376 static int 5377 intel_iommu_domain_get_attr(struct iommu_domain *domain, 5378 enum iommu_attr attr, void *data) 5379 { 5380 switch (domain->type) { 5381 case IOMMU_DOMAIN_UNMANAGED: 5382 return -ENODEV; 5383 case IOMMU_DOMAIN_DMA: 5384 switch (attr) { 5385 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: 5386 *(int *)data = !intel_iommu_strict; 5387 return 0; 5388 default: 5389 return -ENODEV; 5390 } 5391 break; 5392 default: 5393 return -EINVAL; 5394 } 5395 } 5396 5397 /* 5398 * Check that the device does not live on an external facing PCI port that is 5399 * marked as untrusted. Such devices should not be able to apply quirks and 5400 * thus not be able to bypass the IOMMU restrictions. 
5401 */ 5402 static bool risky_device(struct pci_dev *pdev) 5403 { 5404 if (pdev->untrusted) { 5405 pci_info(pdev, 5406 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5407 pdev->vendor, pdev->device); 5408 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5409 return true; 5410 } 5411 return false; 5412 } 5413 5414 const struct iommu_ops intel_iommu_ops = { 5415 .capable = intel_iommu_capable, 5416 .domain_alloc = intel_iommu_domain_alloc, 5417 .domain_free = intel_iommu_domain_free, 5418 .domain_get_attr = intel_iommu_domain_get_attr, 5419 .domain_set_attr = intel_iommu_domain_set_attr, 5420 .attach_dev = intel_iommu_attach_device, 5421 .detach_dev = intel_iommu_detach_device, 5422 .aux_attach_dev = intel_iommu_aux_attach_device, 5423 .aux_detach_dev = intel_iommu_aux_detach_device, 5424 .aux_get_pasid = intel_iommu_aux_get_pasid, 5425 .map = intel_iommu_map, 5426 .unmap = intel_iommu_unmap, 5427 .flush_iotlb_all = intel_flush_iotlb_all, 5428 .iotlb_sync = intel_iommu_tlb_sync, 5429 .iova_to_phys = intel_iommu_iova_to_phys, 5430 .probe_device = intel_iommu_probe_device, 5431 .probe_finalize = intel_iommu_probe_finalize, 5432 .release_device = intel_iommu_release_device, 5433 .get_resv_regions = intel_iommu_get_resv_regions, 5434 .put_resv_regions = generic_iommu_put_resv_regions, 5435 .device_group = intel_iommu_device_group, 5436 .dev_has_feat = intel_iommu_dev_has_feat, 5437 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5438 .dev_enable_feat = intel_iommu_dev_enable_feat, 5439 .dev_disable_feat = intel_iommu_dev_disable_feat, 5440 .is_attach_deferred = intel_iommu_is_attach_deferred, 5441 .def_domain_type = device_def_domain_type, 5442 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 5443 #ifdef CONFIG_INTEL_IOMMU_SVM 5444 .cache_invalidate = intel_iommu_sva_invalidate, 5445 .sva_bind_gpasid = intel_svm_bind_gpasid, 5446 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5447 .sva_bind = intel_svm_bind, 5448 .sva_unbind = intel_svm_unbind, 5449 .sva_get_pasid = intel_svm_get_pasid, 5450 .page_response = intel_svm_page_response, 5451 #endif 5452 }; 5453 5454 static void quirk_iommu_igfx(struct pci_dev *dev) 5455 { 5456 if (risky_device(dev)) 5457 return; 5458 5459 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5460 dmar_map_gfx = 0; 5461 } 5462 5463 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5470 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5471 5472 /* Broadwell igfx malfunctions with dmar */ 5473 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5488 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5489 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5491 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5492 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5493 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5497 5498 static void quirk_iommu_rwbf(struct pci_dev *dev) 5499 { 5500 if (risky_device(dev)) 5501 return; 5502 5503 /* 5504 * Mobile 4 Series Chipset neglects to set RWBF capability, 5505 * but needs it. Same seems to hold for the desktop versions. 
5506 */ 5507 pci_info(dev, "Forcing write-buffer flush capability\n"); 5508 rwbf_quirk = 1; 5509 } 5510 5511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5518 5519 #define GGC 0x52 5520 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5521 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5522 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5523 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5524 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5525 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5526 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5527 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5528 5529 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5530 { 5531 unsigned short ggc; 5532 5533 if (risky_device(dev)) 5534 return; 5535 5536 if (pci_read_config_word(dev, GGC, &ggc)) 5537 return; 5538 5539 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5540 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5541 dmar_map_gfx = 0; 5542 } else if (dmar_map_gfx) { 5543 /* we have to ensure the gfx device is idle before we flush */ 5544 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5545 intel_iommu_strict = 1; 5546 } 5547 } 5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5552 5553 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5554 { 5555 unsigned short ver; 5556 5557 if (!IS_GFX_DEVICE(dev)) 5558 return; 5559 5560 ver = (dev->device >> 8) & 0xff; 5561 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5562 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5563 ver != 0x9a) 5564 return; 5565 5566 if (risky_device(dev)) 5567 return; 5568 5569 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5570 iommu_skip_te_disable = 1; 5571 } 5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5573 5574 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5575 ISOCH DMAR unit for the Azalia sound device, but not give it any 5576 TLB entries, which causes it to deadlock. Check for that. We do 5577 this in a function called from init_dmars(), instead of in a PCI 5578 quirk, because we don't want to print the obnoxious "BIOS broken" 5579 message if VT-d is actually disabled. 5580 */ 5581 static void __init check_tylersburg_isoch(void) 5582 { 5583 struct pci_dev *pdev; 5584 uint32_t vtisochctrl; 5585 5586 /* If there's no Azalia in the system anyway, forget it. */ 5587 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5588 if (!pdev) 5589 return; 5590 5591 if (risky_device(pdev)) { 5592 pci_dev_put(pdev); 5593 return; 5594 } 5595 5596 pci_dev_put(pdev); 5597 5598 /* System Management Registers. Might be hidden, in which case 5599 we can't do the sanity check. But that's OK, because the 5600 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5601 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5602 if (!pdev) 5603 return; 5604 5605 if (risky_device(pdev)) { 5606 pci_dev_put(pdev); 5607 return; 5608 } 5609 5610 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5611 pci_dev_put(pdev); 5612 return; 5613 } 5614 5615 pci_dev_put(pdev); 5616 5617 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5618 if (vtisochctrl & 1) 5619 return; 5620 5621 /* Drop all bits other than the number of TLB entries */ 5622 vtisochctrl &= 0x1c; 5623 5624 /* If we have the recommended number of TLB entries (16), fine. */ 5625 if (vtisochctrl == 0x10) 5626 return; 5627 5628 /* Zero TLB entries? You get to ride the short bus to school. */ 5629 if (!vtisochctrl) { 5630 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5631 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5632 dmi_get_system_info(DMI_BIOS_VENDOR), 5633 dmi_get_system_info(DMI_BIOS_VERSION), 5634 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5635 iommu_identity_mapping |= IDENTMAP_AZALIA; 5636 return; 5637 } 5638 5639 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5640 vtisochctrl); 5641 } 5642
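
/*
 * A worked example of the check above, using hypothetical values for the
 * register at config offset 0x188 of the 0x342e system management device:
 *
 *	vtisochctrl = 0x00000001: bit 0 is set, so Azalia DMA is routed to
 *	    the non-isoch DMAR unit and there is nothing to do.
 *	vtisochctrl = 0x00000010: bit 0 is clear and the TLB-entry field
 *	    (vtisochctrl & 0x1c) is 16, the recommended allocation, so all
 *	    is well.
 *	vtisochctrl = 0x00000000: Azalia DMA goes to the isoch DMAR unit but
 *	    it has no TLB entries; the WARN above fires and Azalia is given
 *	    an identity mapping (IDENTMAP_AZALIA) so audio DMA keeps working.
 */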