// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "../irq_remapping.h"
#include "pasid.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
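
/*
 * Illustrative sketch (not part of the driver, name is hypothetical):
 * because INTEL_IOMMU_PGSIZES clears only the low 12 bits, every bit for a
 * power-of-two size >= 4KiB is set, so the IOMMU core may hand us any
 * order of a 4KiB page (4KiB, 8KiB, ..., 2MiB, 1GiB, ...).
 */
#if 0
static bool example_pgsize_advertised(unsigned long size)
{
	/* power of two and at least 4KiB: 0x1000, 0x2000, 0x200000, ... */
	return size && !(size & (size - 1)) && (size & INTEL_IOMMU_PGSIZES);
}
#endif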

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
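
/*
 * Illustrative sketch (not part of the driver, name is hypothetical): how
 * the AGAW helpers above relate for a 48-bit address width. With
 * LEVEL_STRIDE == 9, each paging level translates 9 bits of the DMA pfn.
 */
#if 0
static void example_agaw_math(void)
{
	int agaw = width_to_agaw(48);		/* DIV_ROUND_UP(18, 9) = 2 */

	pr_debug("agaw %d -> %d-level table, %d-bit width, idx %d\n",
		 agaw, agaw_to_level(agaw),	/* 2 + 2 = 4-level table  */
		 agaw_to_width(agaw),		/* 30 + 2 * 9 = 48 bits   */
		 pfn_level_offset(0x12345, 2));	/* bits 9..17 of the pfn  */
}
#endif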

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
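
/*
 * Illustrative sketch (not part of the driver, name is hypothetical):
 * for_each_domain_iommu() above expands to a guarded for loop over
 * g_num_of_iommus slots, so a caller visits only the IOMMUs that hold a
 * reference on the domain:
 */
#if 0
static void example_walk_domain_iommus(struct dmar_domain *domain)
{
	int idx;

	for_each_domain_iommu(idx, domain)
		pr_debug("domain uses iommu %d (did %u)\n",
			 idx, domain->iommu_did[idx]);
}
#endif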

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
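
/*
 * Illustrative sketch (not part of the driver, names are hypothetical):
 * callers of for_each_device_domain() above pass a callback that runs
 * under device_domain_lock; a non-zero return value stops the walk early.
 */
#if 0
static int example_count_cb(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;
	return 0;	/* keep iterating */
}

static int example_count_device_domains(void)
{
	int count = 0;

	for_each_device_domain(example_count_cb, &count);
	return count;
}
#endif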

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
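
/*
 * Illustrative sketch (not part of the driver, name is hypothetical):
 * get_iommu_domain() and set_iommu_domain() above treat the 16-bit domain
 * id as a two-level index, so did 0x1234 lands in domains[0x12][0x34] and
 * each second-level array covers 256 ids.
 */
#if 0
static void example_domain_id_split(u16 did)
{
	int idx    = did >> 8;		/* first-level slot  */
	int offset = did & 0xff;	/* second-level slot */

	pr_debug("did %#x -> domains[%#x][%#x]\n", did, idx, offset);
}
#endif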

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
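
/*
 * Illustrative sketch (not part of the driver, name is hypothetical):
 * __iommu_calculate_agaw() above scans the SAGAW capability bits downwards
 * from the requested width; e.g. starting from
 * DEFAULT_DOMAIN_ADDRESS_WIDTH (57, agaw 3, 5-level tables), a unit whose
 * cap_sagaw() only has bit 2 set ends up with agaw 2 (4-level tables).
 */
#if 0
static int example_pick_agaw(unsigned long sagaw_bits)
{
	int agaw;

	for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); agaw >= 0; agaw--)
		if (test_bit(agaw, &sagaw_bits))
			break;

	return agaw;	/* e.g. sagaw_bits == 0x4 gives 2 */
}
#endif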

/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;

	assert_spin_locked(&device_domain_lock);

	if (list_empty(&domain->devices))
		return NUMA_NO_NODE;

	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev)
			continue;

		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * is no perfect answer in such a situation, so we select the
		 * first come, first served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}

static void domain_update_iotlb(struct dmar_domain *domain);

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
733 */ 734 if (domain->nid == NUMA_NO_NODE) 735 domain->nid = domain_update_device_node(domain); 736 737 /* 738 * First-level translation restricts the input-address to a 739 * canonical address (i.e., address bits 63:N have the same 740 * value as address bit [N-1], where N is 48-bits with 4-level 741 * paging and 57-bits with 5-level paging). Hence, skip bit 742 * [N-1]. 743 */ 744 if (domain_use_first_level(domain)) 745 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 746 else 747 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 748 749 domain_update_iotlb(domain); 750 } 751 752 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 753 u8 devfn, int alloc) 754 { 755 struct root_entry *root = &iommu->root_entry[bus]; 756 struct context_entry *context; 757 u64 *entry; 758 759 entry = &root->lo; 760 if (sm_supported(iommu)) { 761 if (devfn >= 0x80) { 762 devfn -= 0x80; 763 entry = &root->hi; 764 } 765 devfn *= 2; 766 } 767 if (*entry & 1) 768 context = phys_to_virt(*entry & VTD_PAGE_MASK); 769 else { 770 unsigned long phy_addr; 771 if (!alloc) 772 return NULL; 773 774 context = alloc_pgtable_page(iommu->node); 775 if (!context) 776 return NULL; 777 778 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 779 phy_addr = virt_to_phys((void *)context); 780 *entry = phy_addr | 1; 781 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 782 } 783 return &context[devfn]; 784 } 785 786 static bool attach_deferred(struct device *dev) 787 { 788 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 789 } 790 791 /** 792 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 793 * sub-hierarchy of a candidate PCI-PCI bridge 794 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 795 * @bridge: the candidate PCI-PCI bridge 796 * 797 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 798 */ 799 static bool 800 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 801 { 802 struct pci_dev *pdev, *pbridge; 803 804 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 805 return false; 806 807 pdev = to_pci_dev(dev); 808 pbridge = to_pci_dev(bridge); 809 810 if (pbridge->subordinate && 811 pbridge->subordinate->number <= pdev->bus->number && 812 pbridge->subordinate->busn_res.end >= pdev->bus->number) 813 return true; 814 815 return false; 816 } 817 818 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 819 { 820 struct dmar_drhd_unit *drhd; 821 u32 vtbar; 822 int rc; 823 824 /* We know that this device on this chipset has its own IOMMU. 825 * If we find it under a different IOMMU, then the BIOS is lying 826 * to us. Hope that the IOMMU for this device is actually 827 * disabled, and it needs no translation... 
828 */ 829 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 830 if (rc) { 831 /* "can't" happen */ 832 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 833 return false; 834 } 835 vtbar &= 0xffff0000; 836 837 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 838 drhd = dmar_find_matched_drhd_unit(pdev); 839 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 840 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 841 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 842 return true; 843 } 844 845 return false; 846 } 847 848 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 849 { 850 if (!iommu || iommu->drhd->ignored) 851 return true; 852 853 if (dev_is_pci(dev)) { 854 struct pci_dev *pdev = to_pci_dev(dev); 855 856 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 857 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 858 quirk_ioat_snb_local_iommu(pdev)) 859 return true; 860 } 861 862 return false; 863 } 864 865 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 866 { 867 struct dmar_drhd_unit *drhd = NULL; 868 struct pci_dev *pdev = NULL; 869 struct intel_iommu *iommu; 870 struct device *tmp; 871 u16 segment = 0; 872 int i; 873 874 if (!dev) 875 return NULL; 876 877 if (dev_is_pci(dev)) { 878 struct pci_dev *pf_pdev; 879 880 pdev = pci_real_dma_dev(to_pci_dev(dev)); 881 882 /* VFs aren't listed in scope tables; we need to look up 883 * the PF instead to find the IOMMU. */ 884 pf_pdev = pci_physfn(pdev); 885 dev = &pf_pdev->dev; 886 segment = pci_domain_nr(pdev->bus); 887 } else if (has_acpi_companion(dev)) 888 dev = &ACPI_COMPANION(dev)->dev; 889 890 rcu_read_lock(); 891 for_each_iommu(iommu, drhd) { 892 if (pdev && segment != drhd->segment) 893 continue; 894 895 for_each_active_dev_scope(drhd->devices, 896 drhd->devices_cnt, i, tmp) { 897 if (tmp == dev) { 898 /* For a VF use its original BDF# not that of the PF 899 * which we used for the IOMMU lookup. Strictly speaking 900 * we could do this for all PCI devices; we only need to 901 * get the BDF# from the scope table for ACPI matches. 
*/ 902 if (pdev && pdev->is_virtfn) 903 goto got_pdev; 904 905 if (bus && devfn) { 906 *bus = drhd->devices[i].bus; 907 *devfn = drhd->devices[i].devfn; 908 } 909 goto out; 910 } 911 912 if (is_downstream_to_pci_bridge(dev, tmp)) 913 goto got_pdev; 914 } 915 916 if (pdev && drhd->include_all) { 917 got_pdev: 918 if (bus && devfn) { 919 *bus = pdev->bus->number; 920 *devfn = pdev->devfn; 921 } 922 goto out; 923 } 924 } 925 iommu = NULL; 926 out: 927 if (iommu_is_dummy(iommu, dev)) 928 iommu = NULL; 929 930 rcu_read_unlock(); 931 932 return iommu; 933 } 934 935 static void domain_flush_cache(struct dmar_domain *domain, 936 void *addr, int size) 937 { 938 if (!domain->iommu_coherency) 939 clflush_cache_range(addr, size); 940 } 941 942 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 943 { 944 struct context_entry *context; 945 int ret = 0; 946 unsigned long flags; 947 948 spin_lock_irqsave(&iommu->lock, flags); 949 context = iommu_context_addr(iommu, bus, devfn, 0); 950 if (context) 951 ret = context_present(context); 952 spin_unlock_irqrestore(&iommu->lock, flags); 953 return ret; 954 } 955 956 static void free_context_table(struct intel_iommu *iommu) 957 { 958 int i; 959 unsigned long flags; 960 struct context_entry *context; 961 962 spin_lock_irqsave(&iommu->lock, flags); 963 if (!iommu->root_entry) { 964 goto out; 965 } 966 for (i = 0; i < ROOT_ENTRY_NR; i++) { 967 context = iommu_context_addr(iommu, i, 0, 0); 968 if (context) 969 free_pgtable_page(context); 970 971 if (!sm_supported(iommu)) 972 continue; 973 974 context = iommu_context_addr(iommu, i, 0x80, 0); 975 if (context) 976 free_pgtable_page(context); 977 978 } 979 free_pgtable_page(iommu->root_entry); 980 iommu->root_entry = NULL; 981 out: 982 spin_unlock_irqrestore(&iommu->lock, flags); 983 } 984 985 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 986 unsigned long pfn, int *target_level) 987 { 988 struct dma_pte *parent, *pte; 989 int level = agaw_to_level(domain->agaw); 990 int offset; 991 992 BUG_ON(!domain->pgd); 993 994 if (!domain_pfn_supported(domain, pfn)) 995 /* Address beyond IOMMU's addressing capabilities. */ 996 return NULL; 997 998 parent = domain->pgd; 999 1000 while (1) { 1001 void *tmp_page; 1002 1003 offset = pfn_level_offset(pfn, level); 1004 pte = &parent[offset]; 1005 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1006 break; 1007 if (level == *target_level) 1008 break; 1009 1010 if (!dma_pte_present(pte)) { 1011 uint64_t pteval; 1012 1013 tmp_page = alloc_pgtable_page(domain->nid); 1014 1015 if (!tmp_page) 1016 return NULL; 1017 1018 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1019 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1020 if (domain_use_first_level(domain)) 1021 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1022 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1023 /* Someone else set it while we were thinking; use theirs. 
*/ 1024 free_pgtable_page(tmp_page); 1025 else 1026 domain_flush_cache(domain, pte, sizeof(*pte)); 1027 } 1028 if (level == 1) 1029 break; 1030 1031 parent = phys_to_virt(dma_pte_addr(pte)); 1032 level--; 1033 } 1034 1035 if (!*target_level) 1036 *target_level = level; 1037 1038 return pte; 1039 } 1040 1041 /* return address's pte at specific level */ 1042 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1043 unsigned long pfn, 1044 int level, int *large_page) 1045 { 1046 struct dma_pte *parent, *pte; 1047 int total = agaw_to_level(domain->agaw); 1048 int offset; 1049 1050 parent = domain->pgd; 1051 while (level <= total) { 1052 offset = pfn_level_offset(pfn, total); 1053 pte = &parent[offset]; 1054 if (level == total) 1055 return pte; 1056 1057 if (!dma_pte_present(pte)) { 1058 *large_page = total; 1059 break; 1060 } 1061 1062 if (dma_pte_superpage(pte)) { 1063 *large_page = total; 1064 return pte; 1065 } 1066 1067 parent = phys_to_virt(dma_pte_addr(pte)); 1068 total--; 1069 } 1070 return NULL; 1071 } 1072 1073 /* clear last level pte, a tlb flush should be followed */ 1074 static void dma_pte_clear_range(struct dmar_domain *domain, 1075 unsigned long start_pfn, 1076 unsigned long last_pfn) 1077 { 1078 unsigned int large_page; 1079 struct dma_pte *first_pte, *pte; 1080 1081 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1082 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1083 BUG_ON(start_pfn > last_pfn); 1084 1085 /* we don't need lock here; nobody else touches the iova range */ 1086 do { 1087 large_page = 1; 1088 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1089 if (!pte) { 1090 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1091 continue; 1092 } 1093 do { 1094 dma_clear_pte(pte); 1095 start_pfn += lvl_to_nr_pages(large_page); 1096 pte++; 1097 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1098 1099 domain_flush_cache(domain, first_pte, 1100 (void *)pte - (void *)first_pte); 1101 1102 } while (start_pfn && start_pfn <= last_pfn); 1103 } 1104 1105 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1106 int retain_level, struct dma_pte *pte, 1107 unsigned long pfn, unsigned long start_pfn, 1108 unsigned long last_pfn) 1109 { 1110 pfn = max(start_pfn, pfn); 1111 pte = &pte[pfn_level_offset(pfn, level)]; 1112 1113 do { 1114 unsigned long level_pfn; 1115 struct dma_pte *level_pte; 1116 1117 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1118 goto next; 1119 1120 level_pfn = pfn & level_mask(level); 1121 level_pte = phys_to_virt(dma_pte_addr(pte)); 1122 1123 if (level > 2) { 1124 dma_pte_free_level(domain, level - 1, retain_level, 1125 level_pte, level_pfn, start_pfn, 1126 last_pfn); 1127 } 1128 1129 /* 1130 * Free the page table if we're below the level we want to 1131 * retain and the range covers the entire table. 1132 */ 1133 if (level < retain_level && !(start_pfn > level_pfn || 1134 last_pfn < level_pfn + level_size(level) - 1)) { 1135 dma_clear_pte(pte); 1136 domain_flush_cache(domain, pte, sizeof(*pte)); 1137 free_pgtable_page(level_pte); 1138 } 1139 next: 1140 pfn += level_size(level); 1141 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1142 } 1143 1144 /* 1145 * clear last level (leaf) ptes and free page table pages below the 1146 * level we wish to keep intact. 
1147 */ 1148 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1149 unsigned long start_pfn, 1150 unsigned long last_pfn, 1151 int retain_level) 1152 { 1153 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1154 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1155 BUG_ON(start_pfn > last_pfn); 1156 1157 dma_pte_clear_range(domain, start_pfn, last_pfn); 1158 1159 /* We don't need lock here; nobody else touches the iova range */ 1160 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1161 domain->pgd, 0, start_pfn, last_pfn); 1162 1163 /* free pgd */ 1164 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1165 free_pgtable_page(domain->pgd); 1166 domain->pgd = NULL; 1167 } 1168 } 1169 1170 /* When a page at a given level is being unlinked from its parent, we don't 1171 need to *modify* it at all. All we need to do is make a list of all the 1172 pages which can be freed just as soon as we've flushed the IOTLB and we 1173 know the hardware page-walk will no longer touch them. 1174 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1175 be freed. */ 1176 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1177 int level, struct dma_pte *pte, 1178 struct page *freelist) 1179 { 1180 struct page *pg; 1181 1182 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1183 pg->freelist = freelist; 1184 freelist = pg; 1185 1186 if (level == 1) 1187 return freelist; 1188 1189 pte = page_address(pg); 1190 do { 1191 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1192 freelist = dma_pte_list_pagetables(domain, level - 1, 1193 pte, freelist); 1194 pte++; 1195 } while (!first_pte_in_page(pte)); 1196 1197 return freelist; 1198 } 1199 1200 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1201 struct dma_pte *pte, unsigned long pfn, 1202 unsigned long start_pfn, 1203 unsigned long last_pfn, 1204 struct page *freelist) 1205 { 1206 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1207 1208 pfn = max(start_pfn, pfn); 1209 pte = &pte[pfn_level_offset(pfn, level)]; 1210 1211 do { 1212 unsigned long level_pfn; 1213 1214 if (!dma_pte_present(pte)) 1215 goto next; 1216 1217 level_pfn = pfn & level_mask(level); 1218 1219 /* If range covers entire pagetable, free it */ 1220 if (start_pfn <= level_pfn && 1221 last_pfn >= level_pfn + level_size(level) - 1) { 1222 /* These suborbinate page tables are going away entirely. Don't 1223 bother to clear them; we're just going to *free* them. */ 1224 if (level > 1 && !dma_pte_superpage(pte)) 1225 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1226 1227 dma_clear_pte(pte); 1228 if (!first_pte) 1229 first_pte = pte; 1230 last_pte = pte; 1231 } else if (level > 1) { 1232 /* Recurse down into a level that isn't *entirely* obsolete */ 1233 freelist = dma_pte_clear_level(domain, level - 1, 1234 phys_to_virt(dma_pte_addr(pte)), 1235 level_pfn, start_pfn, last_pfn, 1236 freelist); 1237 } 1238 next: 1239 pfn += level_size(level); 1240 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1241 1242 if (first_pte) 1243 domain_flush_cache(domain, first_pte, 1244 (void *)++last_pte - (void *)first_pte); 1245 1246 return freelist; 1247 } 1248 1249 /* We can't just free the pages because the IOMMU may still be walking 1250 the page tables, and may have cached the intermediate levels. The 1251 pages can only be freed after the IOTLB flush has been done. 
*/ 1252 static struct page *domain_unmap(struct dmar_domain *domain, 1253 unsigned long start_pfn, 1254 unsigned long last_pfn, 1255 struct page *freelist) 1256 { 1257 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1258 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1259 BUG_ON(start_pfn > last_pfn); 1260 1261 /* we don't need lock here; nobody else touches the iova range */ 1262 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1263 domain->pgd, 0, start_pfn, last_pfn, 1264 freelist); 1265 1266 /* free pgd */ 1267 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1268 struct page *pgd_page = virt_to_page(domain->pgd); 1269 pgd_page->freelist = freelist; 1270 freelist = pgd_page; 1271 1272 domain->pgd = NULL; 1273 } 1274 1275 return freelist; 1276 } 1277 1278 static void dma_free_pagelist(struct page *freelist) 1279 { 1280 struct page *pg; 1281 1282 while ((pg = freelist)) { 1283 freelist = pg->freelist; 1284 free_pgtable_page(page_address(pg)); 1285 } 1286 } 1287 1288 /* iommu handling */ 1289 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1290 { 1291 struct root_entry *root; 1292 unsigned long flags; 1293 1294 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1295 if (!root) { 1296 pr_err("Allocating root entry for %s failed\n", 1297 iommu->name); 1298 return -ENOMEM; 1299 } 1300 1301 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1302 1303 spin_lock_irqsave(&iommu->lock, flags); 1304 iommu->root_entry = root; 1305 spin_unlock_irqrestore(&iommu->lock, flags); 1306 1307 return 0; 1308 } 1309 1310 static void iommu_set_root_entry(struct intel_iommu *iommu) 1311 { 1312 u64 addr; 1313 u32 sts; 1314 unsigned long flag; 1315 1316 addr = virt_to_phys(iommu->root_entry); 1317 if (sm_supported(iommu)) 1318 addr |= DMA_RTADDR_SMT; 1319 1320 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1321 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1322 1323 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1324 1325 /* Make sure hardware complete it */ 1326 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1327 readl, (sts & DMA_GSTS_RTPS), sts); 1328 1329 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1330 } 1331 1332 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1333 { 1334 u32 val; 1335 unsigned long flag; 1336 1337 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1338 return; 1339 1340 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1341 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1342 1343 /* Make sure hardware complete it */ 1344 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1345 readl, (!(val & DMA_GSTS_WBFS)), val); 1346 1347 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1348 } 1349 1350 /* return value determine if we need a write buffer flush */ 1351 static void __iommu_flush_context(struct intel_iommu *iommu, 1352 u16 did, u16 source_id, u8 function_mask, 1353 u64 type) 1354 { 1355 u64 val = 0; 1356 unsigned long flag; 1357 1358 switch (type) { 1359 case DMA_CCMD_GLOBAL_INVL: 1360 val = DMA_CCMD_GLOBAL_INVL; 1361 break; 1362 case DMA_CCMD_DOMAIN_INVL: 1363 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1364 break; 1365 case DMA_CCMD_DEVICE_INVL: 1366 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1367 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1368 break; 1369 default: 1370 BUG(); 1371 } 1372 val |= DMA_CCMD_ICC; 1373 1374 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1375 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1376 1377 /* Make sure hardware complete it */ 1378 
IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1379 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1380 1381 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1382 } 1383 1384 /* return value determine if we need a write buffer flush */ 1385 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1386 u64 addr, unsigned int size_order, u64 type) 1387 { 1388 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1389 u64 val = 0, val_iva = 0; 1390 unsigned long flag; 1391 1392 switch (type) { 1393 case DMA_TLB_GLOBAL_FLUSH: 1394 /* global flush doesn't need set IVA_REG */ 1395 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1396 break; 1397 case DMA_TLB_DSI_FLUSH: 1398 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1399 break; 1400 case DMA_TLB_PSI_FLUSH: 1401 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1402 /* IH bit is passed in as part of address */ 1403 val_iva = size_order | addr; 1404 break; 1405 default: 1406 BUG(); 1407 } 1408 /* Note: set drain read/write */ 1409 #if 0 1410 /* 1411 * This is probably to be super secure.. Looks like we can 1412 * ignore it without any impact. 1413 */ 1414 if (cap_read_drain(iommu->cap)) 1415 val |= DMA_TLB_READ_DRAIN; 1416 #endif 1417 if (cap_write_drain(iommu->cap)) 1418 val |= DMA_TLB_WRITE_DRAIN; 1419 1420 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1421 /* Note: Only uses first TLB reg currently */ 1422 if (val_iva) 1423 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1424 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1425 1426 /* Make sure hardware complete it */ 1427 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1428 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1429 1430 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1431 1432 /* check IOTLB invalidation granularity */ 1433 if (DMA_TLB_IAIG(val) == 0) 1434 pr_err("Flush IOTLB failed\n"); 1435 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1436 pr_debug("TLB flush request %Lx, actual %Lx\n", 1437 (unsigned long long)DMA_TLB_IIRG(type), 1438 (unsigned long long)DMA_TLB_IAIG(val)); 1439 } 1440 1441 static struct device_domain_info * 1442 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1443 u8 bus, u8 devfn) 1444 { 1445 struct device_domain_info *info; 1446 1447 assert_spin_locked(&device_domain_lock); 1448 1449 if (!iommu->qi) 1450 return NULL; 1451 1452 list_for_each_entry(info, &domain->devices, link) 1453 if (info->iommu == iommu && info->bus == bus && 1454 info->devfn == devfn) { 1455 if (info->ats_supported && info->dev) 1456 return info; 1457 break; 1458 } 1459 1460 return NULL; 1461 } 1462 1463 static void domain_update_iotlb(struct dmar_domain *domain) 1464 { 1465 struct device_domain_info *info; 1466 bool has_iotlb_device = false; 1467 1468 assert_spin_locked(&device_domain_lock); 1469 1470 list_for_each_entry(info, &domain->devices, link) 1471 if (info->ats_enabled) { 1472 has_iotlb_device = true; 1473 break; 1474 } 1475 1476 if (!has_iotlb_device) { 1477 struct subdev_domain_info *sinfo; 1478 1479 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1480 info = get_domain_info(sinfo->pdev); 1481 if (info && info->ats_enabled) { 1482 has_iotlb_device = true; 1483 break; 1484 } 1485 } 1486 } 1487 1488 domain->has_iotlb_device = has_iotlb_device; 1489 } 1490 1491 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1492 { 1493 struct pci_dev *pdev; 1494 1495 assert_spin_locked(&device_domain_lock); 1496 1497 if (!info || !dev_is_pci(info->dev)) 1498 return; 1499 1500 pdev = to_pci_dev(info->dev); 1501 /* For IOMMU that 
supports device IOTLB throttling (DIT), we assign 1502 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1503 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1504 * reserved, which should be set to 0. 1505 */ 1506 if (!ecap_dit(info->iommu->ecap)) 1507 info->pfsid = 0; 1508 else { 1509 struct pci_dev *pf_pdev; 1510 1511 /* pdev will be returned if device is not a vf */ 1512 pf_pdev = pci_physfn(pdev); 1513 info->pfsid = pci_dev_id(pf_pdev); 1514 } 1515 1516 #ifdef CONFIG_INTEL_IOMMU_SVM 1517 /* The PCIe spec, in its wisdom, declares that the behaviour of 1518 the device if you enable PASID support after ATS support is 1519 undefined. So always enable PASID support on devices which 1520 have it, even if we can't yet know if we're ever going to 1521 use it. */ 1522 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1523 info->pasid_enabled = 1; 1524 1525 if (info->pri_supported && 1526 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1527 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) 1528 info->pri_enabled = 1; 1529 #endif 1530 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1531 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1532 info->ats_enabled = 1; 1533 domain_update_iotlb(info->domain); 1534 info->ats_qdep = pci_ats_queue_depth(pdev); 1535 } 1536 } 1537 1538 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1539 { 1540 struct pci_dev *pdev; 1541 1542 assert_spin_locked(&device_domain_lock); 1543 1544 if (!dev_is_pci(info->dev)) 1545 return; 1546 1547 pdev = to_pci_dev(info->dev); 1548 1549 if (info->ats_enabled) { 1550 pci_disable_ats(pdev); 1551 info->ats_enabled = 0; 1552 domain_update_iotlb(info->domain); 1553 } 1554 #ifdef CONFIG_INTEL_IOMMU_SVM 1555 if (info->pri_enabled) { 1556 pci_disable_pri(pdev); 1557 info->pri_enabled = 0; 1558 } 1559 if (info->pasid_enabled) { 1560 pci_disable_pasid(pdev); 1561 info->pasid_enabled = 0; 1562 } 1563 #endif 1564 } 1565 1566 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1567 u64 addr, unsigned int mask) 1568 { 1569 u16 sid, qdep; 1570 1571 if (!info || !info->ats_enabled) 1572 return; 1573 1574 sid = info->bus << 8 | info->devfn; 1575 qdep = info->ats_qdep; 1576 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1577 qdep, addr, mask); 1578 } 1579 1580 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1581 u64 addr, unsigned mask) 1582 { 1583 unsigned long flags; 1584 struct device_domain_info *info; 1585 struct subdev_domain_info *sinfo; 1586 1587 if (!domain->has_iotlb_device) 1588 return; 1589 1590 spin_lock_irqsave(&device_domain_lock, flags); 1591 list_for_each_entry(info, &domain->devices, link) 1592 __iommu_flush_dev_iotlb(info, addr, mask); 1593 1594 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1595 info = get_domain_info(sinfo->pdev); 1596 __iommu_flush_dev_iotlb(info, addr, mask); 1597 } 1598 spin_unlock_irqrestore(&device_domain_lock, flags); 1599 } 1600 1601 static void domain_flush_piotlb(struct intel_iommu *iommu, 1602 struct dmar_domain *domain, 1603 u64 addr, unsigned long npages, bool ih) 1604 { 1605 u16 did = domain->iommu_did[iommu->seq_id]; 1606 1607 if (domain->default_pasid) 1608 qi_flush_piotlb(iommu, did, domain->default_pasid, 1609 addr, npages, ih); 1610 1611 if (!list_empty(&domain->devices)) 1612 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1613 } 1614 1615 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1616 struct 
dmar_domain *domain, 1617 unsigned long pfn, unsigned int pages, 1618 int ih, int map) 1619 { 1620 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1621 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1622 u16 did = domain->iommu_did[iommu->seq_id]; 1623 1624 BUG_ON(pages == 0); 1625 1626 if (ih) 1627 ih = 1 << 6; 1628 1629 if (domain_use_first_level(domain)) { 1630 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1631 } else { 1632 /* 1633 * Fallback to domain selective flush if no PSI support or 1634 * the size is too big. PSI requires page size to be 2 ^ x, 1635 * and the base address is naturally aligned to the size. 1636 */ 1637 if (!cap_pgsel_inv(iommu->cap) || 1638 mask > cap_max_amask_val(iommu->cap)) 1639 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1640 DMA_TLB_DSI_FLUSH); 1641 else 1642 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1643 DMA_TLB_PSI_FLUSH); 1644 } 1645 1646 /* 1647 * In caching mode, changes of pages from non-present to present require 1648 * flush. However, device IOTLB doesn't need to be flushed in this case. 1649 */ 1650 if (!cap_caching_mode(iommu->cap) || !map) 1651 iommu_flush_dev_iotlb(domain, addr, mask); 1652 } 1653 1654 /* Notification for newly created mappings */ 1655 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1656 struct dmar_domain *domain, 1657 unsigned long pfn, unsigned int pages) 1658 { 1659 /* 1660 * It's a non-present to present mapping. Only flush if caching mode 1661 * and second level. 1662 */ 1663 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1664 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1665 else 1666 iommu_flush_write_buffer(iommu); 1667 } 1668 1669 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1670 { 1671 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1672 int idx; 1673 1674 for_each_domain_iommu(idx, dmar_domain) { 1675 struct intel_iommu *iommu = g_iommus[idx]; 1676 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1677 1678 if (domain_use_first_level(dmar_domain)) 1679 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); 1680 else 1681 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1682 DMA_TLB_DSI_FLUSH); 1683 1684 if (!cap_caching_mode(iommu->cap)) 1685 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1686 0, MAX_AGAW_PFN_WIDTH); 1687 } 1688 } 1689 1690 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1691 { 1692 u32 pmen; 1693 unsigned long flags; 1694 1695 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1696 return; 1697 1698 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1699 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1700 pmen &= ~DMA_PMEN_EPM; 1701 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1702 1703 /* wait for the protected region status bit to clear */ 1704 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1705 readl, !(pmen & DMA_PMEN_PRS), pmen); 1706 1707 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1708 } 1709 1710 static void iommu_enable_translation(struct intel_iommu *iommu) 1711 { 1712 u32 sts; 1713 unsigned long flags; 1714 1715 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1716 iommu->gcmd |= DMA_GCMD_TE; 1717 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1718 1719 /* Make sure hardware complete it */ 1720 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1721 readl, (sts & DMA_GSTS_TES), sts); 1722 1723 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1724 } 1725 1726 static void iommu_disable_translation(struct intel_iommu *iommu) 1727 { 1728 u32 sts; 1729 unsigned long 
flag; 1730 1731 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1732 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1733 return; 1734 1735 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1736 iommu->gcmd &= ~DMA_GCMD_TE; 1737 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1738 1739 /* Make sure hardware complete it */ 1740 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1741 readl, (!(sts & DMA_GSTS_TES)), sts); 1742 1743 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1744 } 1745 1746 static int iommu_init_domains(struct intel_iommu *iommu) 1747 { 1748 u32 ndomains, nlongs; 1749 size_t size; 1750 1751 ndomains = cap_ndoms(iommu->cap); 1752 pr_debug("%s: Number of Domains supported <%d>\n", 1753 iommu->name, ndomains); 1754 nlongs = BITS_TO_LONGS(ndomains); 1755 1756 spin_lock_init(&iommu->lock); 1757 1758 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1759 if (!iommu->domain_ids) { 1760 pr_err("%s: Allocating domain id array failed\n", 1761 iommu->name); 1762 return -ENOMEM; 1763 } 1764 1765 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1766 iommu->domains = kzalloc(size, GFP_KERNEL); 1767 1768 if (iommu->domains) { 1769 size = 256 * sizeof(struct dmar_domain *); 1770 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1771 } 1772 1773 if (!iommu->domains || !iommu->domains[0]) { 1774 pr_err("%s: Allocating domain array failed\n", 1775 iommu->name); 1776 kfree(iommu->domain_ids); 1777 kfree(iommu->domains); 1778 iommu->domain_ids = NULL; 1779 iommu->domains = NULL; 1780 return -ENOMEM; 1781 } 1782 1783 /* 1784 * If Caching mode is set, then invalid translations are tagged 1785 * with domain-id 0, hence we need to pre-allocate it. We also 1786 * use domain-id 0 as a marker for non-allocated domain-id, so 1787 * make sure it is not used for a real domain. 1788 */ 1789 set_bit(0, iommu->domain_ids); 1790 1791 /* 1792 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1793 * entry for first-level or pass-through translation modes should 1794 * be programmed with a domain id different from those used for 1795 * second-level or nested translation. We reserve a domain id for 1796 * this purpose. 
1797 */ 1798 if (sm_supported(iommu)) 1799 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1800 1801 return 0; 1802 } 1803 1804 static void disable_dmar_iommu(struct intel_iommu *iommu) 1805 { 1806 struct device_domain_info *info, *tmp; 1807 unsigned long flags; 1808 1809 if (!iommu->domains || !iommu->domain_ids) 1810 return; 1811 1812 spin_lock_irqsave(&device_domain_lock, flags); 1813 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1814 if (info->iommu != iommu) 1815 continue; 1816 1817 if (!info->dev || !info->domain) 1818 continue; 1819 1820 __dmar_remove_one_dev_info(info); 1821 } 1822 spin_unlock_irqrestore(&device_domain_lock, flags); 1823 1824 if (iommu->gcmd & DMA_GCMD_TE) 1825 iommu_disable_translation(iommu); 1826 } 1827 1828 static void free_dmar_iommu(struct intel_iommu *iommu) 1829 { 1830 if ((iommu->domains) && (iommu->domain_ids)) { 1831 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1832 int i; 1833 1834 for (i = 0; i < elems; i++) 1835 kfree(iommu->domains[i]); 1836 kfree(iommu->domains); 1837 kfree(iommu->domain_ids); 1838 iommu->domains = NULL; 1839 iommu->domain_ids = NULL; 1840 } 1841 1842 g_iommus[iommu->seq_id] = NULL; 1843 1844 /* free context mapping */ 1845 free_context_table(iommu); 1846 1847 #ifdef CONFIG_INTEL_IOMMU_SVM 1848 if (pasid_supported(iommu)) { 1849 if (ecap_prs(iommu->ecap)) 1850 intel_svm_finish_prq(iommu); 1851 } 1852 if (vccap_pasid(iommu->vccap)) 1853 ioasid_unregister_allocator(&iommu->pasid_allocator); 1854 1855 #endif 1856 } 1857 1858 /* 1859 * Check and return whether first level is used by default for 1860 * DMA translation. 1861 */ 1862 static bool first_level_by_default(void) 1863 { 1864 struct dmar_drhd_unit *drhd; 1865 struct intel_iommu *iommu; 1866 static int first_level_support = -1; 1867 1868 if (likely(first_level_support != -1)) 1869 return first_level_support; 1870 1871 first_level_support = 1; 1872 1873 rcu_read_lock(); 1874 for_each_active_iommu(iommu, drhd) { 1875 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { 1876 first_level_support = 0; 1877 break; 1878 } 1879 } 1880 rcu_read_unlock(); 1881 1882 return first_level_support; 1883 } 1884 1885 static struct dmar_domain *alloc_domain(int flags) 1886 { 1887 struct dmar_domain *domain; 1888 1889 domain = alloc_domain_mem(); 1890 if (!domain) 1891 return NULL; 1892 1893 memset(domain, 0, sizeof(*domain)); 1894 domain->nid = NUMA_NO_NODE; 1895 domain->flags = flags; 1896 if (first_level_by_default()) 1897 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1898 domain->has_iotlb_device = false; 1899 INIT_LIST_HEAD(&domain->devices); 1900 INIT_LIST_HEAD(&domain->subdevices); 1901 1902 return domain; 1903 } 1904 1905 /* Must be called with iommu->lock */ 1906 static int domain_attach_iommu(struct dmar_domain *domain, 1907 struct intel_iommu *iommu) 1908 { 1909 unsigned long ndomains; 1910 int num; 1911 1912 assert_spin_locked(&device_domain_lock); 1913 assert_spin_locked(&iommu->lock); 1914 1915 domain->iommu_refcnt[iommu->seq_id] += 1; 1916 domain->iommu_count += 1; 1917 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1918 ndomains = cap_ndoms(iommu->cap); 1919 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1920 1921 if (num >= ndomains) { 1922 pr_err("%s: No free domain ids\n", iommu->name); 1923 domain->iommu_refcnt[iommu->seq_id] -= 1; 1924 domain->iommu_count -= 1; 1925 return -ENOSPC; 1926 } 1927 1928 set_bit(num, iommu->domain_ids); 1929 set_iommu_domain(iommu, num, domain); 1930 1931 domain->iommu_did[iommu->seq_id] = num; 1932 domain->nid = 
iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

static void domain_exit(struct dmar_domain *domain)
{

	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	if (domain->domain.type == IOMMU_DOMAIN_DMA)
		iommu_put_dma_cookie(&domain->domain);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0,
					DOMAIN_MAX_PFN(domain->gaw), NULL);
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}

/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
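
/*
 * Worked example for context_get_sm_pds() above (illustrative; assumes
 * PASID_PDE_SHIFT is 6): with a 20-bit PASID space, max_pasid = 1 << 20,
 * so max_pde = 1 << 14 and find_first_bit() returns 14, giving pds = 7.
 * A PDTS value of 7 advertises a PASID directory of 2^(7 + 7) = 2^14
 * entries, which matches max_pde.
 */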
*/ 2044 #define context_pdts(pds) (((pds) & 0x7) << 9) 2045 2046 static int domain_context_mapping_one(struct dmar_domain *domain, 2047 struct intel_iommu *iommu, 2048 struct pasid_table *table, 2049 u8 bus, u8 devfn) 2050 { 2051 u16 did = domain->iommu_did[iommu->seq_id]; 2052 int translation = CONTEXT_TT_MULTI_LEVEL; 2053 struct device_domain_info *info = NULL; 2054 struct context_entry *context; 2055 unsigned long flags; 2056 int ret; 2057 2058 WARN_ON(did == 0); 2059 2060 if (hw_pass_through && domain_type_is_si(domain)) 2061 translation = CONTEXT_TT_PASS_THROUGH; 2062 2063 pr_debug("Set context mapping for %02x:%02x.%d\n", 2064 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2065 2066 BUG_ON(!domain->pgd); 2067 2068 spin_lock_irqsave(&device_domain_lock, flags); 2069 spin_lock(&iommu->lock); 2070 2071 ret = -ENOMEM; 2072 context = iommu_context_addr(iommu, bus, devfn, 1); 2073 if (!context) 2074 goto out_unlock; 2075 2076 ret = 0; 2077 if (context_present(context)) 2078 goto out_unlock; 2079 2080 /* 2081 * For kdump cases, old valid entries may be cached due to the 2082 * in-flight DMA and copied pgtable, but there is no unmapping 2083 * behaviour for them, thus we need an explicit cache flush for 2084 * the newly-mapped device. For kdump, at this point, the device 2085 * is supposed to finish reset at its driver probe stage, so no 2086 * in-flight DMA will exist, and we don't need to worry anymore 2087 * hereafter. 2088 */ 2089 if (context_copied(context)) { 2090 u16 did_old = context_domain_id(context); 2091 2092 if (did_old < cap_ndoms(iommu->cap)) { 2093 iommu->flush.flush_context(iommu, did_old, 2094 (((u16)bus) << 8) | devfn, 2095 DMA_CCMD_MASK_NOBIT, 2096 DMA_CCMD_DEVICE_INVL); 2097 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2098 DMA_TLB_DSI_FLUSH); 2099 } 2100 } 2101 2102 context_clear_entry(context); 2103 2104 if (sm_supported(iommu)) { 2105 unsigned long pds; 2106 2107 WARN_ON(!table); 2108 2109 /* Setup the PASID DIR pointer: */ 2110 pds = context_get_sm_pds(table); 2111 context->lo = (u64)virt_to_phys(table->table) | 2112 context_pdts(pds); 2113 2114 /* Setup the RID_PASID field: */ 2115 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2116 2117 /* 2118 * Setup the Device-TLB enable bit and Page request 2119 * Enable bit: 2120 */ 2121 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2122 if (info && info->ats_supported) 2123 context_set_sm_dte(context); 2124 if (info && info->pri_supported) 2125 context_set_sm_pre(context); 2126 } else { 2127 struct dma_pte *pgd = domain->pgd; 2128 int agaw; 2129 2130 context_set_domain_id(context, did); 2131 2132 if (translation != CONTEXT_TT_PASS_THROUGH) { 2133 /* 2134 * Skip top levels of page tables for iommu which has 2135 * less agaw than default. Unnecessary for PT mode. 2136 */ 2137 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2138 ret = -ENOMEM; 2139 pgd = phys_to_virt(dma_pte_addr(pgd)); 2140 if (!dma_pte_present(pgd)) 2141 goto out_unlock; 2142 } 2143 2144 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2145 if (info && info->ats_supported) 2146 translation = CONTEXT_TT_DEV_IOTLB; 2147 else 2148 translation = CONTEXT_TT_MULTI_LEVEL; 2149 2150 context_set_address_root(context, virt_to_phys(pgd)); 2151 context_set_address_width(context, agaw); 2152 } else { 2153 /* 2154 * In pass through mode, AW must be programmed to 2155 * indicate the largest AGAW value supported by 2156 * hardware. And ASR is ignored by hardware. 
2157 */ 2158 context_set_address_width(context, iommu->msagaw); 2159 } 2160 2161 context_set_translation_type(context, translation); 2162 } 2163 2164 context_set_fault_enable(context); 2165 context_set_present(context); 2166 if (!ecap_coherent(iommu->ecap)) 2167 clflush_cache_range(context, sizeof(*context)); 2168 2169 /* 2170 * It's a non-present to present mapping. If hardware doesn't cache 2171 * non-present entry we only need to flush the write-buffer. If the 2172 * _does_ cache non-present entries, then it does so in the special 2173 * domain #0, which we have to flush: 2174 */ 2175 if (cap_caching_mode(iommu->cap)) { 2176 iommu->flush.flush_context(iommu, 0, 2177 (((u16)bus) << 8) | devfn, 2178 DMA_CCMD_MASK_NOBIT, 2179 DMA_CCMD_DEVICE_INVL); 2180 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2181 } else { 2182 iommu_flush_write_buffer(iommu); 2183 } 2184 iommu_enable_dev_iotlb(info); 2185 2186 ret = 0; 2187 2188 out_unlock: 2189 spin_unlock(&iommu->lock); 2190 spin_unlock_irqrestore(&device_domain_lock, flags); 2191 2192 return ret; 2193 } 2194 2195 struct domain_context_mapping_data { 2196 struct dmar_domain *domain; 2197 struct intel_iommu *iommu; 2198 struct pasid_table *table; 2199 }; 2200 2201 static int domain_context_mapping_cb(struct pci_dev *pdev, 2202 u16 alias, void *opaque) 2203 { 2204 struct domain_context_mapping_data *data = opaque; 2205 2206 return domain_context_mapping_one(data->domain, data->iommu, 2207 data->table, PCI_BUS_NUM(alias), 2208 alias & 0xff); 2209 } 2210 2211 static int 2212 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2213 { 2214 struct domain_context_mapping_data data; 2215 struct pasid_table *table; 2216 struct intel_iommu *iommu; 2217 u8 bus, devfn; 2218 2219 iommu = device_to_iommu(dev, &bus, &devfn); 2220 if (!iommu) 2221 return -ENODEV; 2222 2223 table = intel_pasid_get_table(dev); 2224 2225 if (!dev_is_pci(dev)) 2226 return domain_context_mapping_one(domain, iommu, table, 2227 bus, devfn); 2228 2229 data.domain = domain; 2230 data.iommu = iommu; 2231 data.table = table; 2232 2233 return pci_for_each_dma_alias(to_pci_dev(dev), 2234 &domain_context_mapping_cb, &data); 2235 } 2236 2237 static int domain_context_mapped_cb(struct pci_dev *pdev, 2238 u16 alias, void *opaque) 2239 { 2240 struct intel_iommu *iommu = opaque; 2241 2242 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2243 } 2244 2245 static int domain_context_mapped(struct device *dev) 2246 { 2247 struct intel_iommu *iommu; 2248 u8 bus, devfn; 2249 2250 iommu = device_to_iommu(dev, &bus, &devfn); 2251 if (!iommu) 2252 return -ENODEV; 2253 2254 if (!dev_is_pci(dev)) 2255 return device_context_mapped(iommu, bus, devfn); 2256 2257 return !pci_for_each_dma_alias(to_pci_dev(dev), 2258 domain_context_mapped_cb, iommu); 2259 } 2260 2261 /* Returns a number of VTD pages, but aligned to MM page size */ 2262 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2263 size_t size) 2264 { 2265 host_addr &= ~PAGE_MASK; 2266 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2267 } 2268 2269 /* Return largest possible superpage level for a given mapping */ 2270 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2271 unsigned long iov_pfn, 2272 unsigned long phy_pfn, 2273 unsigned long pages) 2274 { 2275 int support, level = 1; 2276 unsigned long pfnmerge; 2277 2278 support = domain->iommu_superpage; 2279 2280 /* To use a large page, the virtual *and* physical addresses 2281 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2282 of them will mean we have to use smaller pages. So just 2283 merge them and check both at once. */ 2284 pfnmerge = iov_pfn | phy_pfn; 2285 2286 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2287 pages >>= VTD_STRIDE_SHIFT; 2288 if (!pages) 2289 break; 2290 pfnmerge >>= VTD_STRIDE_SHIFT; 2291 level++; 2292 support--; 2293 } 2294 return level; 2295 } 2296 2297 static int 2298 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2299 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2300 { 2301 struct dma_pte *first_pte = NULL, *pte = NULL; 2302 unsigned int largepage_lvl = 0; 2303 unsigned long lvl_pages = 0; 2304 phys_addr_t pteval; 2305 u64 attr; 2306 2307 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2308 2309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2310 return -EINVAL; 2311 2312 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2313 if (domain_use_first_level(domain)) 2314 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2315 2316 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2317 2318 while (nr_pages > 0) { 2319 uint64_t tmp; 2320 2321 if (!pte) { 2322 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2323 phys_pfn, nr_pages); 2324 2325 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2326 if (!pte) 2327 return -ENOMEM; 2328 /* It is large page*/ 2329 if (largepage_lvl > 1) { 2330 unsigned long nr_superpages, end_pfn; 2331 2332 pteval |= DMA_PTE_LARGE_PAGE; 2333 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2334 2335 nr_superpages = nr_pages / lvl_pages; 2336 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2337 2338 /* 2339 * Ensure that old small page tables are 2340 * removed to make room for superpage(s). 2341 * We're adding new large pages, so make sure 2342 * we don't remove their parent tables. 2343 */ 2344 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2345 largepage_lvl + 1); 2346 } else { 2347 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2348 } 2349 2350 } 2351 /* We don't need lock here, nobody else 2352 * touches the iova range 2353 */ 2354 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2355 if (tmp) { 2356 static int dumps = 5; 2357 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2358 iov_pfn, tmp, (unsigned long long)pteval); 2359 if (dumps) { 2360 dumps--; 2361 debug_dma_dump_mappings(NULL); 2362 } 2363 WARN_ON(1); 2364 } 2365 2366 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2367 2368 BUG_ON(nr_pages < lvl_pages); 2369 2370 nr_pages -= lvl_pages; 2371 iov_pfn += lvl_pages; 2372 phys_pfn += lvl_pages; 2373 pteval += lvl_pages * VTD_PAGE_SIZE; 2374 2375 /* If the next PTE would be the first in a new page, then we 2376 * need to flush the cache on the entries we've just written. 2377 * And then we'll need to recalculate 'pte', so clear it and 2378 * let it get set again in the if (!pte) block above. 2379 * 2380 * If we're done (!nr_pages) we need to flush the cache too. 2381 * 2382 * Also if we've been setting superpages, we may need to 2383 * recalculate 'pte' and switch back to smaller pages for the 2384 * end of the mapping, if the trailing size is not enough to 2385 * use another superpage (i.e. nr_pages < lvl_pages). 
2386 */ 2387 pte++; 2388 if (!nr_pages || first_pte_in_page(pte) || 2389 (largepage_lvl > 1 && nr_pages < lvl_pages)) { 2390 domain_flush_cache(domain, first_pte, 2391 (void *)pte - (void *)first_pte); 2392 pte = NULL; 2393 } 2394 } 2395 2396 return 0; 2397 } 2398 2399 static int 2400 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2401 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2402 { 2403 int iommu_id, ret; 2404 struct intel_iommu *iommu; 2405 2406 /* Do the real mapping first */ 2407 ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot); 2408 if (ret) 2409 return ret; 2410 2411 for_each_domain_iommu(iommu_id, domain) { 2412 iommu = g_iommus[iommu_id]; 2413 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2414 } 2415 2416 return 0; 2417 } 2418 2419 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2420 { 2421 unsigned long flags; 2422 struct context_entry *context; 2423 u16 did_old; 2424 2425 if (!iommu) 2426 return; 2427 2428 spin_lock_irqsave(&iommu->lock, flags); 2429 context = iommu_context_addr(iommu, bus, devfn, 0); 2430 if (!context) { 2431 spin_unlock_irqrestore(&iommu->lock, flags); 2432 return; 2433 } 2434 did_old = context_domain_id(context); 2435 context_clear_entry(context); 2436 __iommu_flush_cache(iommu, context, sizeof(*context)); 2437 spin_unlock_irqrestore(&iommu->lock, flags); 2438 iommu->flush.flush_context(iommu, 2439 did_old, 2440 (((u16)bus) << 8) | devfn, 2441 DMA_CCMD_MASK_NOBIT, 2442 DMA_CCMD_DEVICE_INVL); 2443 iommu->flush.flush_iotlb(iommu, 2444 did_old, 2445 0, 2446 0, 2447 DMA_TLB_DSI_FLUSH); 2448 } 2449 2450 static inline void unlink_domain_info(struct device_domain_info *info) 2451 { 2452 assert_spin_locked(&device_domain_lock); 2453 list_del(&info->link); 2454 list_del(&info->global); 2455 if (info->dev) 2456 dev_iommu_priv_set(info->dev, NULL); 2457 } 2458 2459 static void domain_remove_dev_info(struct dmar_domain *domain) 2460 { 2461 struct device_domain_info *info, *tmp; 2462 unsigned long flags; 2463 2464 spin_lock_irqsave(&device_domain_lock, flags); 2465 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2466 __dmar_remove_one_dev_info(info); 2467 spin_unlock_irqrestore(&device_domain_lock, flags); 2468 } 2469 2470 struct dmar_domain *find_domain(struct device *dev) 2471 { 2472 struct device_domain_info *info; 2473 2474 if (unlikely(!dev || !dev->iommu)) 2475 return NULL; 2476 2477 if (unlikely(attach_deferred(dev))) 2478 return NULL; 2479 2480 /* No lock here, assumes no domain exit in normal case */ 2481 info = get_domain_info(dev); 2482 if (likely(info)) 2483 return info->domain; 2484 2485 return NULL; 2486 } 2487 2488 static inline struct device_domain_info * 2489 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2490 { 2491 struct device_domain_info *info; 2492 2493 list_for_each_entry(info, &device_domain_list, global) 2494 if (info->segment == segment && info->bus == bus && 2495 info->devfn == devfn) 2496 return info; 2497 2498 return NULL; 2499 } 2500 2501 static int domain_setup_first_level(struct intel_iommu *iommu, 2502 struct dmar_domain *domain, 2503 struct device *dev, 2504 u32 pasid) 2505 { 2506 int flags = PASID_FLAG_SUPERVISOR_MODE; 2507 struct dma_pte *pgd = domain->pgd; 2508 int agaw, level; 2509 2510 /* 2511 * Skip top levels of page tables for iommu which has 2512 * less agaw than default. Unnecessary for PT mode. 
2513 */ 2514 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2515 pgd = phys_to_virt(dma_pte_addr(pgd)); 2516 if (!dma_pte_present(pgd)) 2517 return -ENOMEM; 2518 } 2519 2520 level = agaw_to_level(agaw); 2521 if (level != 4 && level != 5) 2522 return -EINVAL; 2523 2524 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; 2525 2526 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2527 domain->iommu_did[iommu->seq_id], 2528 flags); 2529 } 2530 2531 static bool dev_is_real_dma_subdevice(struct device *dev) 2532 { 2533 return dev && dev_is_pci(dev) && 2534 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2535 } 2536 2537 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2538 int bus, int devfn, 2539 struct device *dev, 2540 struct dmar_domain *domain) 2541 { 2542 struct dmar_domain *found = NULL; 2543 struct device_domain_info *info; 2544 unsigned long flags; 2545 int ret; 2546 2547 info = alloc_devinfo_mem(); 2548 if (!info) 2549 return NULL; 2550 2551 if (!dev_is_real_dma_subdevice(dev)) { 2552 info->bus = bus; 2553 info->devfn = devfn; 2554 info->segment = iommu->segment; 2555 } else { 2556 struct pci_dev *pdev = to_pci_dev(dev); 2557 2558 info->bus = pdev->bus->number; 2559 info->devfn = pdev->devfn; 2560 info->segment = pci_domain_nr(pdev->bus); 2561 } 2562 2563 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2564 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2565 info->ats_qdep = 0; 2566 info->dev = dev; 2567 info->domain = domain; 2568 info->iommu = iommu; 2569 info->pasid_table = NULL; 2570 info->auxd_enabled = 0; 2571 INIT_LIST_HEAD(&info->subdevices); 2572 2573 if (dev && dev_is_pci(dev)) { 2574 struct pci_dev *pdev = to_pci_dev(info->dev); 2575 2576 if (ecap_dev_iotlb_support(iommu->ecap) && 2577 pci_ats_supported(pdev) && 2578 dmar_find_matched_atsr_unit(pdev)) 2579 info->ats_supported = 1; 2580 2581 if (sm_supported(iommu)) { 2582 if (pasid_supported(iommu)) { 2583 int features = pci_pasid_features(pdev); 2584 if (features >= 0) 2585 info->pasid_supported = features | 1; 2586 } 2587 2588 if (info->ats_supported && ecap_prs(iommu->ecap) && 2589 pci_pri_supported(pdev)) 2590 info->pri_supported = 1; 2591 } 2592 } 2593 2594 spin_lock_irqsave(&device_domain_lock, flags); 2595 if (dev) 2596 found = find_domain(dev); 2597 2598 if (!found) { 2599 struct device_domain_info *info2; 2600 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2601 info->devfn); 2602 if (info2) { 2603 found = info2->domain; 2604 info2->dev = dev; 2605 } 2606 } 2607 2608 if (found) { 2609 spin_unlock_irqrestore(&device_domain_lock, flags); 2610 free_devinfo_mem(info); 2611 /* Caller must free the original domain */ 2612 return found; 2613 } 2614 2615 spin_lock(&iommu->lock); 2616 ret = domain_attach_iommu(domain, iommu); 2617 spin_unlock(&iommu->lock); 2618 2619 if (ret) { 2620 spin_unlock_irqrestore(&device_domain_lock, flags); 2621 free_devinfo_mem(info); 2622 return NULL; 2623 } 2624 2625 list_add(&info->link, &domain->devices); 2626 list_add(&info->global, &device_domain_list); 2627 if (dev) 2628 dev_iommu_priv_set(dev, info); 2629 spin_unlock_irqrestore(&device_domain_lock, flags); 2630 2631 /* PASID table is mandatory for a PCI device in scalable mode. 
*/ 2632 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2633 ret = intel_pasid_alloc_table(dev); 2634 if (ret) { 2635 dev_err(dev, "PASID table allocation failed\n"); 2636 dmar_remove_one_dev_info(dev); 2637 return NULL; 2638 } 2639 2640 /* Setup the PASID entry for requests without PASID: */ 2641 spin_lock_irqsave(&iommu->lock, flags); 2642 if (hw_pass_through && domain_type_is_si(domain)) 2643 ret = intel_pasid_setup_pass_through(iommu, domain, 2644 dev, PASID_RID2PASID); 2645 else if (domain_use_first_level(domain)) 2646 ret = domain_setup_first_level(iommu, domain, dev, 2647 PASID_RID2PASID); 2648 else 2649 ret = intel_pasid_setup_second_level(iommu, domain, 2650 dev, PASID_RID2PASID); 2651 spin_unlock_irqrestore(&iommu->lock, flags); 2652 if (ret) { 2653 dev_err(dev, "Setup RID2PASID failed\n"); 2654 dmar_remove_one_dev_info(dev); 2655 return NULL; 2656 } 2657 } 2658 2659 if (dev && domain_context_mapping(domain, dev)) { 2660 dev_err(dev, "Domain context map failed\n"); 2661 dmar_remove_one_dev_info(dev); 2662 return NULL; 2663 } 2664 2665 return domain; 2666 } 2667 2668 static int iommu_domain_identity_map(struct dmar_domain *domain, 2669 unsigned long first_vpfn, 2670 unsigned long last_vpfn) 2671 { 2672 /* 2673 * RMRR range might have overlap with physical memory range, 2674 * clear it first 2675 */ 2676 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2677 2678 return __domain_mapping(domain, first_vpfn, 2679 first_vpfn, last_vpfn - first_vpfn + 1, 2680 DMA_PTE_READ|DMA_PTE_WRITE); 2681 } 2682 2683 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2684 2685 static int __init si_domain_init(int hw) 2686 { 2687 struct dmar_rmrr_unit *rmrr; 2688 struct device *dev; 2689 int i, nid, ret; 2690 2691 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2692 if (!si_domain) 2693 return -EFAULT; 2694 2695 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2696 domain_exit(si_domain); 2697 return -EFAULT; 2698 } 2699 2700 if (hw) 2701 return 0; 2702 2703 for_each_online_node(nid) { 2704 unsigned long start_pfn, end_pfn; 2705 int i; 2706 2707 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2708 ret = iommu_domain_identity_map(si_domain, 2709 mm_to_dma_pfn(start_pfn), 2710 mm_to_dma_pfn(end_pfn)); 2711 if (ret) 2712 return ret; 2713 } 2714 } 2715 2716 /* 2717 * Identity map the RMRRs so that devices with RMRRs could also use 2718 * the si_domain. 
2719 */ 2720 for_each_rmrr_units(rmrr) { 2721 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2722 i, dev) { 2723 unsigned long long start = rmrr->base_address; 2724 unsigned long long end = rmrr->end_address; 2725 2726 if (WARN_ON(end < start || 2727 end >> agaw_to_width(si_domain->agaw))) 2728 continue; 2729 2730 ret = iommu_domain_identity_map(si_domain, 2731 mm_to_dma_pfn(start >> PAGE_SHIFT), 2732 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2733 if (ret) 2734 return ret; 2735 } 2736 } 2737 2738 return 0; 2739 } 2740 2741 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2742 { 2743 struct dmar_domain *ndomain; 2744 struct intel_iommu *iommu; 2745 u8 bus, devfn; 2746 2747 iommu = device_to_iommu(dev, &bus, &devfn); 2748 if (!iommu) 2749 return -ENODEV; 2750 2751 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2752 if (ndomain != domain) 2753 return -EBUSY; 2754 2755 return 0; 2756 } 2757 2758 static bool device_has_rmrr(struct device *dev) 2759 { 2760 struct dmar_rmrr_unit *rmrr; 2761 struct device *tmp; 2762 int i; 2763 2764 rcu_read_lock(); 2765 for_each_rmrr_units(rmrr) { 2766 /* 2767 * Return TRUE if this RMRR contains the device that 2768 * is passed in. 2769 */ 2770 for_each_active_dev_scope(rmrr->devices, 2771 rmrr->devices_cnt, i, tmp) 2772 if (tmp == dev || 2773 is_downstream_to_pci_bridge(dev, tmp)) { 2774 rcu_read_unlock(); 2775 return true; 2776 } 2777 } 2778 rcu_read_unlock(); 2779 return false; 2780 } 2781 2782 /** 2783 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2784 * is relaxable (ie. is allowed to be not enforced under some conditions) 2785 * @dev: device handle 2786 * 2787 * We assume that PCI USB devices with RMRRs have them largely 2788 * for historical reasons and that the RMRR space is not actively used post 2789 * boot. This exclusion may change if vendors begin to abuse it. 2790 * 2791 * The same exception is made for graphics devices, with the requirement that 2792 * any use of the RMRR regions will be torn down before assigning the device 2793 * to a guest. 2794 * 2795 * Return: true if the RMRR is relaxable, false otherwise 2796 */ 2797 static bool device_rmrr_is_relaxable(struct device *dev) 2798 { 2799 struct pci_dev *pdev; 2800 2801 if (!dev_is_pci(dev)) 2802 return false; 2803 2804 pdev = to_pci_dev(dev); 2805 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2806 return true; 2807 else 2808 return false; 2809 } 2810 2811 /* 2812 * There are a couple cases where we need to restrict the functionality of 2813 * devices associated with RMRRs. The first is when evaluating a device for 2814 * identity mapping because problems exist when devices are moved in and out 2815 * of domains and their respective RMRR information is lost. This means that 2816 * a device with associated RMRRs will never be in a "passthrough" domain. 2817 * The second is use of the device through the IOMMU API. This interface 2818 * expects to have full control of the IOVA space for the device. We cannot 2819 * satisfy both the requirement that RMRR access is maintained and have an 2820 * unencumbered IOVA space. We also have no ability to quiesce the device's 2821 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2822 * We therefore prevent devices associated with an RMRR from participating in 2823 * the IOMMU API, which eliminates them from device assignment. 2824 * 2825 * In both cases, devices which have relaxable RMRRs are not concerned by this 2826 * restriction. 
See device_rmrr_is_relaxable comment. 2827 */ 2828 static bool device_is_rmrr_locked(struct device *dev) 2829 { 2830 if (!device_has_rmrr(dev)) 2831 return false; 2832 2833 if (device_rmrr_is_relaxable(dev)) 2834 return false; 2835 2836 return true; 2837 } 2838 2839 /* 2840 * Return the required default domain type for a specific device. 2841 * 2842 * @dev: the device in query 2843 * @startup: true if this is during early boot 2844 * 2845 * Returns: 2846 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2847 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2848 * - 0: both identity and dynamic domains work for this device 2849 */ 2850 static int device_def_domain_type(struct device *dev) 2851 { 2852 if (dev_is_pci(dev)) { 2853 struct pci_dev *pdev = to_pci_dev(dev); 2854 2855 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2856 return IOMMU_DOMAIN_IDENTITY; 2857 2858 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2859 return IOMMU_DOMAIN_IDENTITY; 2860 } 2861 2862 return 0; 2863 } 2864 2865 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2866 { 2867 /* 2868 * Start from the sane iommu hardware state. 2869 * If the queued invalidation is already initialized by us 2870 * (for example, while enabling interrupt-remapping) then 2871 * we got the things already rolling from a sane state. 2872 */ 2873 if (!iommu->qi) { 2874 /* 2875 * Clear any previous faults. 2876 */ 2877 dmar_fault(-1, iommu); 2878 /* 2879 * Disable queued invalidation if supported and already enabled 2880 * before OS handover. 2881 */ 2882 dmar_disable_qi(iommu); 2883 } 2884 2885 if (dmar_enable_qi(iommu)) { 2886 /* 2887 * Queued Invalidate not enabled, use Register Based Invalidate 2888 */ 2889 iommu->flush.flush_context = __iommu_flush_context; 2890 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2891 pr_info("%s: Using Register based invalidation\n", 2892 iommu->name); 2893 } else { 2894 iommu->flush.flush_context = qi_flush_context; 2895 iommu->flush.flush_iotlb = qi_flush_iotlb; 2896 pr_info("%s: Using Queued invalidation\n", iommu->name); 2897 } 2898 } 2899 2900 static int copy_context_table(struct intel_iommu *iommu, 2901 struct root_entry *old_re, 2902 struct context_entry **tbl, 2903 int bus, bool ext) 2904 { 2905 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2906 struct context_entry *new_ce = NULL, ce; 2907 struct context_entry *old_ce = NULL; 2908 struct root_entry re; 2909 phys_addr_t old_ce_phys; 2910 2911 tbl_idx = ext ? bus * 2 : bus; 2912 memcpy(&re, old_re, sizeof(re)); 2913 2914 for (devfn = 0; devfn < 256; devfn++) { 2915 /* First calculate the correct index */ 2916 idx = (ext ? 
devfn * 2 : devfn) % 256; 2917 2918 if (idx == 0) { 2919 /* First save what we may have and clean up */ 2920 if (new_ce) { 2921 tbl[tbl_idx] = new_ce; 2922 __iommu_flush_cache(iommu, new_ce, 2923 VTD_PAGE_SIZE); 2924 pos = 1; 2925 } 2926 2927 if (old_ce) 2928 memunmap(old_ce); 2929 2930 ret = 0; 2931 if (devfn < 0x80) 2932 old_ce_phys = root_entry_lctp(&re); 2933 else 2934 old_ce_phys = root_entry_uctp(&re); 2935 2936 if (!old_ce_phys) { 2937 if (ext && devfn == 0) { 2938 /* No LCTP, try UCTP */ 2939 devfn = 0x7f; 2940 continue; 2941 } else { 2942 goto out; 2943 } 2944 } 2945 2946 ret = -ENOMEM; 2947 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2948 MEMREMAP_WB); 2949 if (!old_ce) 2950 goto out; 2951 2952 new_ce = alloc_pgtable_page(iommu->node); 2953 if (!new_ce) 2954 goto out_unmap; 2955 2956 ret = 0; 2957 } 2958 2959 /* Now copy the context entry */ 2960 memcpy(&ce, old_ce + idx, sizeof(ce)); 2961 2962 if (!__context_present(&ce)) 2963 continue; 2964 2965 did = context_domain_id(&ce); 2966 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2967 set_bit(did, iommu->domain_ids); 2968 2969 /* 2970 * We need a marker for copied context entries. This 2971 * marker needs to work for the old format as well as 2972 * for extended context entries. 2973 * 2974 * Bit 67 of the context entry is used. In the old 2975 * format this bit is available to software, in the 2976 * extended format it is the PGE bit, but PGE is ignored 2977 * by HW if PASIDs are disabled (and thus still 2978 * available). 2979 * 2980 * So disable PASIDs first and then mark the entry 2981 * copied. This means that we don't copy PASID 2982 * translations from the old kernel, but this is fine as 2983 * faults there are not fatal. 2984 */ 2985 context_clear_pasid_enable(&ce); 2986 context_set_copied(&ce); 2987 2988 new_ce[idx] = ce; 2989 } 2990 2991 tbl[tbl_idx + pos] = new_ce; 2992 2993 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2994 2995 out_unmap: 2996 memunmap(old_ce); 2997 2998 out: 2999 return ret; 3000 } 3001 3002 static int copy_translation_tables(struct intel_iommu *iommu) 3003 { 3004 struct context_entry **ctxt_tbls; 3005 struct root_entry *old_rt; 3006 phys_addr_t old_rt_phys; 3007 int ctxt_table_entries; 3008 unsigned long flags; 3009 u64 rtaddr_reg; 3010 int bus, ret; 3011 bool new_ext, ext; 3012 3013 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3014 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3015 new_ext = !!ecap_ecs(iommu->ecap); 3016 3017 /* 3018 * The RTT bit can only be changed when translation is disabled, 3019 * but disabling translation means to open a window for data 3020 * corruption. So bail out and don't copy anything if we would 3021 * have to change the bit. 3022 */ 3023 if (new_ext != ext) 3024 return -EINVAL; 3025 3026 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3027 if (!old_rt_phys) 3028 return -EINVAL; 3029 3030 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3031 if (!old_rt) 3032 return -ENOMEM; 3033 3034 /* This is too big for the stack - allocate it from slab */ 3035 ctxt_table_entries = ext ? 
512 : 256; 3036 ret = -ENOMEM; 3037 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3038 if (!ctxt_tbls) 3039 goto out_unmap; 3040 3041 for (bus = 0; bus < 256; bus++) { 3042 ret = copy_context_table(iommu, &old_rt[bus], 3043 ctxt_tbls, bus, ext); 3044 if (ret) { 3045 pr_err("%s: Failed to copy context table for bus %d\n", 3046 iommu->name, bus); 3047 continue; 3048 } 3049 } 3050 3051 spin_lock_irqsave(&iommu->lock, flags); 3052 3053 /* Context tables are copied, now write them to the root_entry table */ 3054 for (bus = 0; bus < 256; bus++) { 3055 int idx = ext ? bus * 2 : bus; 3056 u64 val; 3057 3058 if (ctxt_tbls[idx]) { 3059 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3060 iommu->root_entry[bus].lo = val; 3061 } 3062 3063 if (!ext || !ctxt_tbls[idx + 1]) 3064 continue; 3065 3066 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3067 iommu->root_entry[bus].hi = val; 3068 } 3069 3070 spin_unlock_irqrestore(&iommu->lock, flags); 3071 3072 kfree(ctxt_tbls); 3073 3074 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3075 3076 ret = 0; 3077 3078 out_unmap: 3079 memunmap(old_rt); 3080 3081 return ret; 3082 } 3083 3084 #ifdef CONFIG_INTEL_IOMMU_SVM 3085 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3086 { 3087 struct intel_iommu *iommu = data; 3088 ioasid_t ioasid; 3089 3090 if (!iommu) 3091 return INVALID_IOASID; 3092 /* 3093 * VT-d virtual command interface always uses the full 20 bit 3094 * PASID range. Host can partition guest PASID range based on 3095 * policies but it is out of guest's control. 3096 */ 3097 if (min < PASID_MIN || max > intel_pasid_max_id) 3098 return INVALID_IOASID; 3099 3100 if (vcmd_alloc_pasid(iommu, &ioasid)) 3101 return INVALID_IOASID; 3102 3103 return ioasid; 3104 } 3105 3106 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3107 { 3108 struct intel_iommu *iommu = data; 3109 3110 if (!iommu) 3111 return; 3112 /* 3113 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3114 * We can only free the PASID when all the devices are unbound. 3115 */ 3116 if (ioasid_find(NULL, ioasid, NULL)) { 3117 pr_alert("Cannot free active IOASID %d\n", ioasid); 3118 return; 3119 } 3120 vcmd_free_pasid(iommu, ioasid); 3121 } 3122 3123 static void register_pasid_allocator(struct intel_iommu *iommu) 3124 { 3125 /* 3126 * If we are running in the host, no need for custom allocator 3127 * in that PASIDs are allocated from the host system-wide. 3128 */ 3129 if (!cap_caching_mode(iommu->cap)) 3130 return; 3131 3132 if (!sm_supported(iommu)) { 3133 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3134 return; 3135 } 3136 3137 /* 3138 * Register a custom PASID allocator if we are running in a guest, 3139 * guest PASID must be obtained via virtual command interface. 3140 * There can be multiple vIOMMUs in each guest but only one allocator 3141 * is active. All vIOMMU allocators will eventually be calling the same 3142 * host allocator. 3143 */ 3144 if (!vccap_pasid(iommu->vccap)) 3145 return; 3146 3147 pr_info("Register custom PASID allocator\n"); 3148 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3149 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3150 iommu->pasid_allocator.pdata = (void *)iommu; 3151 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3152 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3153 /* 3154 * Disable scalable mode on this IOMMU if there 3155 * is no custom allocator. 
Mixing SM capable vIOMMU 3156 * and non-SM vIOMMU are not supported. 3157 */ 3158 intel_iommu_sm = 0; 3159 } 3160 } 3161 #endif 3162 3163 static int __init init_dmars(void) 3164 { 3165 struct dmar_drhd_unit *drhd; 3166 struct intel_iommu *iommu; 3167 int ret; 3168 3169 /* 3170 * for each drhd 3171 * allocate root 3172 * initialize and program root entry to not present 3173 * endfor 3174 */ 3175 for_each_drhd_unit(drhd) { 3176 /* 3177 * lock not needed as this is only incremented in the single 3178 * threaded kernel __init code path all other access are read 3179 * only 3180 */ 3181 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3182 g_num_of_iommus++; 3183 continue; 3184 } 3185 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3186 } 3187 3188 /* Preallocate enough resources for IOMMU hot-addition */ 3189 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3190 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3191 3192 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3193 GFP_KERNEL); 3194 if (!g_iommus) { 3195 pr_err("Allocating global iommu array failed\n"); 3196 ret = -ENOMEM; 3197 goto error; 3198 } 3199 3200 for_each_iommu(iommu, drhd) { 3201 if (drhd->ignored) { 3202 iommu_disable_translation(iommu); 3203 continue; 3204 } 3205 3206 /* 3207 * Find the max pasid size of all IOMMU's in the system. 3208 * We need to ensure the system pasid table is no bigger 3209 * than the smallest supported. 3210 */ 3211 if (pasid_supported(iommu)) { 3212 u32 temp = 2 << ecap_pss(iommu->ecap); 3213 3214 intel_pasid_max_id = min_t(u32, temp, 3215 intel_pasid_max_id); 3216 } 3217 3218 g_iommus[iommu->seq_id] = iommu; 3219 3220 intel_iommu_init_qi(iommu); 3221 3222 ret = iommu_init_domains(iommu); 3223 if (ret) 3224 goto free_iommu; 3225 3226 init_translation_status(iommu); 3227 3228 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3229 iommu_disable_translation(iommu); 3230 clear_translation_pre_enabled(iommu); 3231 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3232 iommu->name); 3233 } 3234 3235 /* 3236 * TBD: 3237 * we could share the same root & context tables 3238 * among all IOMMU's. Need to Split it later. 3239 */ 3240 ret = iommu_alloc_root_entry(iommu); 3241 if (ret) 3242 goto free_iommu; 3243 3244 if (translation_pre_enabled(iommu)) { 3245 pr_info("Translation already enabled - trying to copy translation structures\n"); 3246 3247 ret = copy_translation_tables(iommu); 3248 if (ret) { 3249 /* 3250 * We found the IOMMU with translation 3251 * enabled - but failed to copy over the 3252 * old root-entry table. Try to proceed 3253 * by disabling translation now and 3254 * allocating a clean root-entry table. 3255 * This might cause DMAR faults, but 3256 * probably the dump will still succeed. 3257 */ 3258 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3259 iommu->name); 3260 iommu_disable_translation(iommu); 3261 clear_translation_pre_enabled(iommu); 3262 } else { 3263 pr_info("Copied translation tables from previous kernel for %s\n", 3264 iommu->name); 3265 } 3266 } 3267 3268 if (!ecap_pass_through(iommu->ecap)) 3269 hw_pass_through = 0; 3270 intel_svm_check(iommu); 3271 } 3272 3273 /* 3274 * Now that qi is enabled on all iommus, set the root entry and flush 3275 * caches. This is required on some Intel X58 chipsets, otherwise the 3276 * flush_context function will loop forever and the boot hangs. 
3277 */ 3278 for_each_active_iommu(iommu, drhd) { 3279 iommu_flush_write_buffer(iommu); 3280 #ifdef CONFIG_INTEL_IOMMU_SVM 3281 register_pasid_allocator(iommu); 3282 #endif 3283 iommu_set_root_entry(iommu); 3284 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3285 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3286 } 3287 3288 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3289 dmar_map_gfx = 0; 3290 #endif 3291 3292 if (!dmar_map_gfx) 3293 iommu_identity_mapping |= IDENTMAP_GFX; 3294 3295 check_tylersburg_isoch(); 3296 3297 ret = si_domain_init(hw_pass_through); 3298 if (ret) 3299 goto free_iommu; 3300 3301 /* 3302 * for each drhd 3303 * enable fault log 3304 * global invalidate context cache 3305 * global invalidate iotlb 3306 * enable translation 3307 */ 3308 for_each_iommu(iommu, drhd) { 3309 if (drhd->ignored) { 3310 /* 3311 * we always have to disable PMRs or DMA may fail on 3312 * this device 3313 */ 3314 if (force_on) 3315 iommu_disable_protect_mem_regions(iommu); 3316 continue; 3317 } 3318 3319 iommu_flush_write_buffer(iommu); 3320 3321 #ifdef CONFIG_INTEL_IOMMU_SVM 3322 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3323 /* 3324 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3325 * could cause possible lock race condition. 3326 */ 3327 up_write(&dmar_global_lock); 3328 ret = intel_svm_enable_prq(iommu); 3329 down_write(&dmar_global_lock); 3330 if (ret) 3331 goto free_iommu; 3332 } 3333 #endif 3334 ret = dmar_set_interrupt(iommu); 3335 if (ret) 3336 goto free_iommu; 3337 } 3338 3339 return 0; 3340 3341 free_iommu: 3342 for_each_active_iommu(iommu, drhd) { 3343 disable_dmar_iommu(iommu); 3344 free_dmar_iommu(iommu); 3345 } 3346 3347 kfree(g_iommus); 3348 3349 error: 3350 return ret; 3351 } 3352 3353 static inline int iommu_domain_cache_init(void) 3354 { 3355 int ret = 0; 3356 3357 iommu_domain_cache = kmem_cache_create("iommu_domain", 3358 sizeof(struct dmar_domain), 3359 0, 3360 SLAB_HWCACHE_ALIGN, 3361 3362 NULL); 3363 if (!iommu_domain_cache) { 3364 pr_err("Couldn't create iommu_domain cache\n"); 3365 ret = -ENOMEM; 3366 } 3367 3368 return ret; 3369 } 3370 3371 static inline int iommu_devinfo_cache_init(void) 3372 { 3373 int ret = 0; 3374 3375 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3376 sizeof(struct device_domain_info), 3377 0, 3378 SLAB_HWCACHE_ALIGN, 3379 NULL); 3380 if (!iommu_devinfo_cache) { 3381 pr_err("Couldn't create devinfo cache\n"); 3382 ret = -ENOMEM; 3383 } 3384 3385 return ret; 3386 } 3387 3388 static int __init iommu_init_mempool(void) 3389 { 3390 int ret; 3391 ret = iova_cache_get(); 3392 if (ret) 3393 return ret; 3394 3395 ret = iommu_domain_cache_init(); 3396 if (ret) 3397 goto domain_error; 3398 3399 ret = iommu_devinfo_cache_init(); 3400 if (!ret) 3401 return ret; 3402 3403 kmem_cache_destroy(iommu_domain_cache); 3404 domain_error: 3405 iova_cache_put(); 3406 3407 return -ENOMEM; 3408 } 3409 3410 static void __init iommu_exit_mempool(void) 3411 { 3412 kmem_cache_destroy(iommu_devinfo_cache); 3413 kmem_cache_destroy(iommu_domain_cache); 3414 iova_cache_put(); 3415 } 3416 3417 static void __init init_no_remapping_devices(void) 3418 { 3419 struct dmar_drhd_unit *drhd; 3420 struct device *dev; 3421 int i; 3422 3423 for_each_drhd_unit(drhd) { 3424 if (!drhd->include_all) { 3425 for_each_active_dev_scope(drhd->devices, 3426 drhd->devices_cnt, i, dev) 3427 break; 3428 /* ignore DMAR unit if no devices exist */ 3429 if (i == drhd->devices_cnt) 3430 drhd->ignored = 1; 3431 } 3432 } 3433 3434 
for_each_active_drhd_unit(drhd) { 3435 if (drhd->include_all) 3436 continue; 3437 3438 for_each_active_dev_scope(drhd->devices, 3439 drhd->devices_cnt, i, dev) 3440 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3441 break; 3442 if (i < drhd->devices_cnt) 3443 continue; 3444 3445 /* This IOMMU has *only* gfx devices. Either bypass it or 3446 set the gfx_mapped flag, as appropriate */ 3447 drhd->gfx_dedicated = 1; 3448 if (!dmar_map_gfx) 3449 drhd->ignored = 1; 3450 } 3451 } 3452 3453 #ifdef CONFIG_SUSPEND 3454 static int init_iommu_hw(void) 3455 { 3456 struct dmar_drhd_unit *drhd; 3457 struct intel_iommu *iommu = NULL; 3458 3459 for_each_active_iommu(iommu, drhd) 3460 if (iommu->qi) 3461 dmar_reenable_qi(iommu); 3462 3463 for_each_iommu(iommu, drhd) { 3464 if (drhd->ignored) { 3465 /* 3466 * we always have to disable PMRs or DMA may fail on 3467 * this device 3468 */ 3469 if (force_on) 3470 iommu_disable_protect_mem_regions(iommu); 3471 continue; 3472 } 3473 3474 iommu_flush_write_buffer(iommu); 3475 3476 iommu_set_root_entry(iommu); 3477 3478 iommu->flush.flush_context(iommu, 0, 0, 0, 3479 DMA_CCMD_GLOBAL_INVL); 3480 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3481 iommu_enable_translation(iommu); 3482 iommu_disable_protect_mem_regions(iommu); 3483 } 3484 3485 return 0; 3486 } 3487 3488 static void iommu_flush_all(void) 3489 { 3490 struct dmar_drhd_unit *drhd; 3491 struct intel_iommu *iommu; 3492 3493 for_each_active_iommu(iommu, drhd) { 3494 iommu->flush.flush_context(iommu, 0, 0, 0, 3495 DMA_CCMD_GLOBAL_INVL); 3496 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3497 DMA_TLB_GLOBAL_FLUSH); 3498 } 3499 } 3500 3501 static int iommu_suspend(void) 3502 { 3503 struct dmar_drhd_unit *drhd; 3504 struct intel_iommu *iommu = NULL; 3505 unsigned long flag; 3506 3507 for_each_active_iommu(iommu, drhd) { 3508 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3509 GFP_KERNEL); 3510 if (!iommu->iommu_state) 3511 goto nomem; 3512 } 3513 3514 iommu_flush_all(); 3515 3516 for_each_active_iommu(iommu, drhd) { 3517 iommu_disable_translation(iommu); 3518 3519 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3520 3521 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3522 readl(iommu->reg + DMAR_FECTL_REG); 3523 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3524 readl(iommu->reg + DMAR_FEDATA_REG); 3525 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3526 readl(iommu->reg + DMAR_FEADDR_REG); 3527 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3528 readl(iommu->reg + DMAR_FEUADDR_REG); 3529 3530 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3531 } 3532 return 0; 3533 3534 nomem: 3535 for_each_active_iommu(iommu, drhd) 3536 kfree(iommu->iommu_state); 3537 3538 return -ENOMEM; 3539 } 3540 3541 static void iommu_resume(void) 3542 { 3543 struct dmar_drhd_unit *drhd; 3544 struct intel_iommu *iommu = NULL; 3545 unsigned long flag; 3546 3547 if (init_iommu_hw()) { 3548 if (force_on) 3549 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3550 else 3551 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3552 return; 3553 } 3554 3555 for_each_active_iommu(iommu, drhd) { 3556 3557 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3558 3559 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3560 iommu->reg + DMAR_FECTL_REG); 3561 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3562 iommu->reg + DMAR_FEDATA_REG); 3563 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3564 iommu->reg + DMAR_FEADDR_REG); 3565 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3566 iommu->reg + DMAR_FEUADDR_REG); 
3567 3568 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3569 } 3570 3571 for_each_active_iommu(iommu, drhd) 3572 kfree(iommu->iommu_state); 3573 } 3574 3575 static struct syscore_ops iommu_syscore_ops = { 3576 .resume = iommu_resume, 3577 .suspend = iommu_suspend, 3578 }; 3579 3580 static void __init init_iommu_pm_ops(void) 3581 { 3582 register_syscore_ops(&iommu_syscore_ops); 3583 } 3584 3585 #else 3586 static inline void init_iommu_pm_ops(void) {} 3587 #endif /* CONFIG_PM */ 3588 3589 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3590 { 3591 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3592 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3593 rmrr->end_address <= rmrr->base_address || 3594 arch_rmrr_sanity_check(rmrr)) 3595 return -EINVAL; 3596 3597 return 0; 3598 } 3599 3600 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3601 { 3602 struct acpi_dmar_reserved_memory *rmrr; 3603 struct dmar_rmrr_unit *rmrru; 3604 3605 rmrr = (struct acpi_dmar_reserved_memory *)header; 3606 if (rmrr_sanity_check(rmrr)) { 3607 pr_warn(FW_BUG 3608 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3609 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3610 rmrr->base_address, rmrr->end_address, 3611 dmi_get_system_info(DMI_BIOS_VENDOR), 3612 dmi_get_system_info(DMI_BIOS_VERSION), 3613 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3614 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3615 } 3616 3617 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3618 if (!rmrru) 3619 goto out; 3620 3621 rmrru->hdr = header; 3622 3623 rmrru->base_address = rmrr->base_address; 3624 rmrru->end_address = rmrr->end_address; 3625 3626 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3627 ((void *)rmrr) + rmrr->header.length, 3628 &rmrru->devices_cnt); 3629 if (rmrru->devices_cnt && rmrru->devices == NULL) 3630 goto free_rmrru; 3631 3632 list_add(&rmrru->list, &dmar_rmrr_units); 3633 3634 return 0; 3635 free_rmrru: 3636 kfree(rmrru); 3637 out: 3638 return -ENOMEM; 3639 } 3640 3641 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3642 { 3643 struct dmar_atsr_unit *atsru; 3644 struct acpi_dmar_atsr *tmp; 3645 3646 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3647 dmar_rcu_check()) { 3648 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3649 if (atsr->segment != tmp->segment) 3650 continue; 3651 if (atsr->header.length != tmp->header.length) 3652 continue; 3653 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3654 return atsru; 3655 } 3656 3657 return NULL; 3658 } 3659 3660 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3661 { 3662 struct acpi_dmar_atsr *atsr; 3663 struct dmar_atsr_unit *atsru; 3664 3665 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3666 return 0; 3667 3668 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3669 atsru = dmar_find_atsr(atsr); 3670 if (atsru) 3671 return 0; 3672 3673 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3674 if (!atsru) 3675 return -ENOMEM; 3676 3677 /* 3678 * If memory is allocated from slab by ACPI _DSM method, we need to 3679 * copy the memory content because the memory buffer will be freed 3680 * on return. 
3681 */ 3682 atsru->hdr = (void *)(atsru + 1); 3683 memcpy(atsru->hdr, hdr, hdr->length); 3684 atsru->include_all = atsr->flags & 0x1; 3685 if (!atsru->include_all) { 3686 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3687 (void *)atsr + atsr->header.length, 3688 &atsru->devices_cnt); 3689 if (atsru->devices_cnt && atsru->devices == NULL) { 3690 kfree(atsru); 3691 return -ENOMEM; 3692 } 3693 } 3694 3695 list_add_rcu(&atsru->list, &dmar_atsr_units); 3696 3697 return 0; 3698 } 3699 3700 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3701 { 3702 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3703 kfree(atsru); 3704 } 3705 3706 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3707 { 3708 struct acpi_dmar_atsr *atsr; 3709 struct dmar_atsr_unit *atsru; 3710 3711 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3712 atsru = dmar_find_atsr(atsr); 3713 if (atsru) { 3714 list_del_rcu(&atsru->list); 3715 synchronize_rcu(); 3716 intel_iommu_free_atsr(atsru); 3717 } 3718 3719 return 0; 3720 } 3721 3722 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3723 { 3724 int i; 3725 struct device *dev; 3726 struct acpi_dmar_atsr *atsr; 3727 struct dmar_atsr_unit *atsru; 3728 3729 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3730 atsru = dmar_find_atsr(atsr); 3731 if (!atsru) 3732 return 0; 3733 3734 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3735 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3736 i, dev) 3737 return -EBUSY; 3738 } 3739 3740 return 0; 3741 } 3742 3743 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3744 { 3745 int sp, ret; 3746 struct intel_iommu *iommu = dmaru->iommu; 3747 3748 if (g_iommus[iommu->seq_id]) 3749 return 0; 3750 3751 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3752 pr_warn("%s: Doesn't support hardware pass through.\n", 3753 iommu->name); 3754 return -ENXIO; 3755 } 3756 if (!ecap_sc_support(iommu->ecap) && 3757 domain_update_iommu_snooping(iommu)) { 3758 pr_warn("%s: Doesn't support snooping.\n", 3759 iommu->name); 3760 return -ENXIO; 3761 } 3762 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3763 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3764 pr_warn("%s: Doesn't support large page.\n", 3765 iommu->name); 3766 return -ENXIO; 3767 } 3768 3769 /* 3770 * Disable translation if already enabled prior to OS handover. 
3771 */ 3772 if (iommu->gcmd & DMA_GCMD_TE) 3773 iommu_disable_translation(iommu); 3774 3775 g_iommus[iommu->seq_id] = iommu; 3776 ret = iommu_init_domains(iommu); 3777 if (ret == 0) 3778 ret = iommu_alloc_root_entry(iommu); 3779 if (ret) 3780 goto out; 3781 3782 intel_svm_check(iommu); 3783 3784 if (dmaru->ignored) { 3785 /* 3786 * we always have to disable PMRs or DMA may fail on this device 3787 */ 3788 if (force_on) 3789 iommu_disable_protect_mem_regions(iommu); 3790 return 0; 3791 } 3792 3793 intel_iommu_init_qi(iommu); 3794 iommu_flush_write_buffer(iommu); 3795 3796 #ifdef CONFIG_INTEL_IOMMU_SVM 3797 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3798 ret = intel_svm_enable_prq(iommu); 3799 if (ret) 3800 goto disable_iommu; 3801 } 3802 #endif 3803 ret = dmar_set_interrupt(iommu); 3804 if (ret) 3805 goto disable_iommu; 3806 3807 iommu_set_root_entry(iommu); 3808 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3809 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3810 iommu_enable_translation(iommu); 3811 3812 iommu_disable_protect_mem_regions(iommu); 3813 return 0; 3814 3815 disable_iommu: 3816 disable_dmar_iommu(iommu); 3817 out: 3818 free_dmar_iommu(iommu); 3819 return ret; 3820 } 3821 3822 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3823 { 3824 int ret = 0; 3825 struct intel_iommu *iommu = dmaru->iommu; 3826 3827 if (!intel_iommu_enabled) 3828 return 0; 3829 if (iommu == NULL) 3830 return -EINVAL; 3831 3832 if (insert) { 3833 ret = intel_iommu_add(dmaru); 3834 } else { 3835 disable_dmar_iommu(iommu); 3836 free_dmar_iommu(iommu); 3837 } 3838 3839 return ret; 3840 } 3841 3842 static void intel_iommu_free_dmars(void) 3843 { 3844 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3845 struct dmar_atsr_unit *atsru, *atsr_n; 3846 3847 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3848 list_del(&rmrru->list); 3849 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3850 kfree(rmrru); 3851 } 3852 3853 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3854 list_del(&atsru->list); 3855 intel_iommu_free_atsr(atsru); 3856 } 3857 } 3858 3859 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 3860 { 3861 int i, ret = 1; 3862 struct pci_bus *bus; 3863 struct pci_dev *bridge = NULL; 3864 struct device *tmp; 3865 struct acpi_dmar_atsr *atsr; 3866 struct dmar_atsr_unit *atsru; 3867 3868 dev = pci_physfn(dev); 3869 for (bus = dev->bus; bus; bus = bus->parent) { 3870 bridge = bus->self; 3871 /* If it's an integrated device, allow ATS */ 3872 if (!bridge) 3873 return 1; 3874 /* Connected via non-PCIe: no ATS */ 3875 if (!pci_is_pcie(bridge) || 3876 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3877 return 0; 3878 /* If we found the root port, look it up in the ATSR */ 3879 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3880 break; 3881 } 3882 3883 rcu_read_lock(); 3884 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3885 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3886 if (atsr->segment != pci_domain_nr(dev->bus)) 3887 continue; 3888 3889 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3890 if (tmp == &bridge->dev) 3891 goto out; 3892 3893 if (atsru->include_all) 3894 goto out; 3895 } 3896 ret = 0; 3897 out: 3898 rcu_read_unlock(); 3899 3900 return ret; 3901 } 3902 3903 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3904 { 3905 int ret; 3906 struct dmar_rmrr_unit *rmrru; 3907 struct dmar_atsr_unit *atsru; 3908 struct acpi_dmar_atsr 
*atsr; 3909 struct acpi_dmar_reserved_memory *rmrr; 3910 3911 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3912 return 0; 3913 3914 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3915 rmrr = container_of(rmrru->hdr, 3916 struct acpi_dmar_reserved_memory, header); 3917 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3918 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3919 ((void *)rmrr) + rmrr->header.length, 3920 rmrr->segment, rmrru->devices, 3921 rmrru->devices_cnt); 3922 if (ret < 0) 3923 return ret; 3924 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3925 dmar_remove_dev_scope(info, rmrr->segment, 3926 rmrru->devices, rmrru->devices_cnt); 3927 } 3928 } 3929 3930 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3931 if (atsru->include_all) 3932 continue; 3933 3934 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3935 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3936 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3937 (void *)atsr + atsr->header.length, 3938 atsr->segment, atsru->devices, 3939 atsru->devices_cnt); 3940 if (ret > 0) 3941 break; 3942 else if (ret < 0) 3943 return ret; 3944 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3945 if (dmar_remove_dev_scope(info, atsr->segment, 3946 atsru->devices, atsru->devices_cnt)) 3947 break; 3948 } 3949 } 3950 3951 return 0; 3952 } 3953 3954 static int intel_iommu_memory_notifier(struct notifier_block *nb, 3955 unsigned long val, void *v) 3956 { 3957 struct memory_notify *mhp = v; 3958 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 3959 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 3960 mhp->nr_pages - 1); 3961 3962 switch (val) { 3963 case MEM_GOING_ONLINE: 3964 if (iommu_domain_identity_map(si_domain, 3965 start_vpfn, last_vpfn)) { 3966 pr_warn("Failed to build identity map for [%lx-%lx]\n", 3967 start_vpfn, last_vpfn); 3968 return NOTIFY_BAD; 3969 } 3970 break; 3971 3972 case MEM_OFFLINE: 3973 case MEM_CANCEL_ONLINE: 3974 { 3975 struct dmar_drhd_unit *drhd; 3976 struct intel_iommu *iommu; 3977 struct page *freelist; 3978 3979 freelist = domain_unmap(si_domain, 3980 start_vpfn, last_vpfn, 3981 NULL); 3982 3983 rcu_read_lock(); 3984 for_each_active_iommu(iommu, drhd) 3985 iommu_flush_iotlb_psi(iommu, si_domain, 3986 start_vpfn, mhp->nr_pages, 3987 !freelist, 0); 3988 rcu_read_unlock(); 3989 dma_free_pagelist(freelist); 3990 } 3991 break; 3992 } 3993 3994 return NOTIFY_OK; 3995 } 3996 3997 static struct notifier_block intel_iommu_memory_nb = { 3998 .notifier_call = intel_iommu_memory_notifier, 3999 .priority = 0 4000 }; 4001 4002 static void free_all_cpu_cached_iovas(unsigned int cpu) 4003 { 4004 int i; 4005 4006 for (i = 0; i < g_num_of_iommus; i++) { 4007 struct intel_iommu *iommu = g_iommus[i]; 4008 struct dmar_domain *domain; 4009 int did; 4010 4011 if (!iommu) 4012 continue; 4013 4014 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 4015 domain = get_iommu_domain(iommu, (u16)did); 4016 4017 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 4018 continue; 4019 4020 iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); 4021 } 4022 } 4023 } 4024 4025 static int intel_iommu_cpu_dead(unsigned int cpu) 4026 { 4027 free_all_cpu_cached_iovas(cpu); 4028 return 0; 4029 } 4030 4031 static void intel_disable_iommus(void) 4032 { 4033 struct intel_iommu *iommu = NULL; 4034 struct dmar_drhd_unit *drhd; 4035 4036 for_each_iommu(iommu, drhd) 4037 iommu_disable_translation(iommu); 4038 } 4039 4040 void intel_iommu_shutdown(void) 4041 { 4042 struct 
dmar_drhd_unit *drhd; 4043 struct intel_iommu *iommu = NULL; 4044 4045 if (no_iommu || dmar_disabled) 4046 return; 4047 4048 down_write(&dmar_global_lock); 4049 4050 /* Disable PMRs explicitly here. */ 4051 for_each_iommu(iommu, drhd) 4052 iommu_disable_protect_mem_regions(iommu); 4053 4054 /* Make sure the IOMMUs are switched off */ 4055 intel_disable_iommus(); 4056 4057 up_write(&dmar_global_lock); 4058 } 4059 4060 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4061 { 4062 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4063 4064 return container_of(iommu_dev, struct intel_iommu, iommu); 4065 } 4066 4067 static ssize_t intel_iommu_show_version(struct device *dev, 4068 struct device_attribute *attr, 4069 char *buf) 4070 { 4071 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4072 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4073 return sprintf(buf, "%d:%d\n", 4074 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4075 } 4076 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4077 4078 static ssize_t intel_iommu_show_address(struct device *dev, 4079 struct device_attribute *attr, 4080 char *buf) 4081 { 4082 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4083 return sprintf(buf, "%llx\n", iommu->reg_phys); 4084 } 4085 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4086 4087 static ssize_t intel_iommu_show_cap(struct device *dev, 4088 struct device_attribute *attr, 4089 char *buf) 4090 { 4091 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4092 return sprintf(buf, "%llx\n", iommu->cap); 4093 } 4094 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4095 4096 static ssize_t intel_iommu_show_ecap(struct device *dev, 4097 struct device_attribute *attr, 4098 char *buf) 4099 { 4100 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4101 return sprintf(buf, "%llx\n", iommu->ecap); 4102 } 4103 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4104 4105 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4106 struct device_attribute *attr, 4107 char *buf) 4108 { 4109 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4110 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4111 } 4112 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4113 4114 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4115 struct device_attribute *attr, 4116 char *buf) 4117 { 4118 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4119 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4120 cap_ndoms(iommu->cap))); 4121 } 4122 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4123 4124 static struct attribute *intel_iommu_attrs[] = { 4125 &dev_attr_version.attr, 4126 &dev_attr_address.attr, 4127 &dev_attr_cap.attr, 4128 &dev_attr_ecap.attr, 4129 &dev_attr_domains_supported.attr, 4130 &dev_attr_domains_used.attr, 4131 NULL, 4132 }; 4133 4134 static struct attribute_group intel_iommu_group = { 4135 .name = "intel-iommu", 4136 .attrs = intel_iommu_attrs, 4137 }; 4138 4139 const struct attribute_group *intel_iommu_groups[] = { 4140 &intel_iommu_group, 4141 NULL, 4142 }; 4143 4144 static inline bool has_external_pci(void) 4145 { 4146 struct pci_dev *pdev = NULL; 4147 4148 for_each_pci_dev(pdev) 4149 if (pdev->external_facing) 4150 return true; 4151 4152 return false; 4153 } 4154 4155 static int __init platform_optin_force_iommu(void) 4156 { 4157 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4158 return 0; 
4159 4160 if (no_iommu || dmar_disabled) 4161 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4162 4163 /* 4164 * If Intel-IOMMU is disabled by default, we will apply identity 4165 * map for all devices except those marked as being untrusted. 4166 */ 4167 if (dmar_disabled) 4168 iommu_set_default_passthrough(false); 4169 4170 dmar_disabled = 0; 4171 no_iommu = 0; 4172 4173 return 1; 4174 } 4175 4176 static int __init probe_acpi_namespace_devices(void) 4177 { 4178 struct dmar_drhd_unit *drhd; 4179 /* To avoid a -Wunused-but-set-variable warning. */ 4180 struct intel_iommu *iommu __maybe_unused; 4181 struct device *dev; 4182 int i, ret = 0; 4183 4184 for_each_active_iommu(iommu, drhd) { 4185 for_each_active_dev_scope(drhd->devices, 4186 drhd->devices_cnt, i, dev) { 4187 struct acpi_device_physical_node *pn; 4188 struct iommu_group *group; 4189 struct acpi_device *adev; 4190 4191 if (dev->bus != &acpi_bus_type) 4192 continue; 4193 4194 adev = to_acpi_device(dev); 4195 mutex_lock(&adev->physical_node_lock); 4196 list_for_each_entry(pn, 4197 &adev->physical_node_list, node) { 4198 group = iommu_group_get(pn->dev); 4199 if (group) { 4200 iommu_group_put(group); 4201 continue; 4202 } 4203 4204 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4205 ret = iommu_probe_device(pn->dev); 4206 if (ret) 4207 break; 4208 } 4209 mutex_unlock(&adev->physical_node_lock); 4210 4211 if (ret) 4212 return ret; 4213 } 4214 } 4215 4216 return 0; 4217 } 4218 4219 int __init intel_iommu_init(void) 4220 { 4221 int ret = -ENODEV; 4222 struct dmar_drhd_unit *drhd; 4223 struct intel_iommu *iommu; 4224 4225 /* 4226 * Intel IOMMU is required for a TXT/tboot launch or platform 4227 * opt in, so enforce that. 4228 */ 4229 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4230 platform_optin_force_iommu(); 4231 4232 if (iommu_init_mempool()) { 4233 if (force_on) 4234 panic("tboot: Failed to initialize iommu memory\n"); 4235 return -ENOMEM; 4236 } 4237 4238 down_write(&dmar_global_lock); 4239 if (dmar_table_init()) { 4240 if (force_on) 4241 panic("tboot: Failed to initialize DMAR table\n"); 4242 goto out_free_dmar; 4243 } 4244 4245 if (dmar_dev_scope_init() < 0) { 4246 if (force_on) 4247 panic("tboot: Failed to initialize DMAR device scope\n"); 4248 goto out_free_dmar; 4249 } 4250 4251 up_write(&dmar_global_lock); 4252 4253 /* 4254 * The bus notifier takes the dmar_global_lock, so lockdep will 4255 * complain later when we register it under the lock. 4256 */ 4257 dmar_register_bus_notifier(); 4258 4259 down_write(&dmar_global_lock); 4260 4261 if (!no_iommu) 4262 intel_iommu_debugfs_init(); 4263 4264 if (no_iommu || dmar_disabled) { 4265 /* 4266 * We exit the function here to ensure IOMMU's remapping and 4267 * mempool aren't setup, which means that the IOMMU's PMRs 4268 * won't be disabled via the call to init_dmars(). So disable 4269 * it explicitly here. The PMRs were setup by tboot prior to 4270 * calling SENTER, but the kernel is expected to reset/tear 4271 * down the PMRs. 
4272 */ 4273 if (intel_iommu_tboot_noforce) { 4274 for_each_iommu(iommu, drhd) 4275 iommu_disable_protect_mem_regions(iommu); 4276 } 4277 4278 /* 4279 * Make sure the IOMMUs are switched off, even when we 4280 * boot into a kexec kernel and the previous kernel left 4281 * them enabled 4282 */ 4283 intel_disable_iommus(); 4284 goto out_free_dmar; 4285 } 4286 4287 if (list_empty(&dmar_rmrr_units)) 4288 pr_info("No RMRR found\n"); 4289 4290 if (list_empty(&dmar_atsr_units)) 4291 pr_info("No ATSR found\n"); 4292 4293 if (dmar_map_gfx) 4294 intel_iommu_gfx_mapped = 1; 4295 4296 init_no_remapping_devices(); 4297 4298 ret = init_dmars(); 4299 if (ret) { 4300 if (force_on) 4301 panic("tboot: Failed to initialize DMARs\n"); 4302 pr_err("Initialization failed\n"); 4303 goto out_free_dmar; 4304 } 4305 up_write(&dmar_global_lock); 4306 4307 init_iommu_pm_ops(); 4308 4309 down_read(&dmar_global_lock); 4310 for_each_active_iommu(iommu, drhd) { 4311 iommu_device_sysfs_add(&iommu->iommu, NULL, 4312 intel_iommu_groups, 4313 "%s", iommu->name); 4314 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4315 iommu_device_register(&iommu->iommu); 4316 } 4317 up_read(&dmar_global_lock); 4318 4319 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4320 if (si_domain && !hw_pass_through) 4321 register_memory_notifier(&intel_iommu_memory_nb); 4322 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4323 intel_iommu_cpu_dead); 4324 4325 down_read(&dmar_global_lock); 4326 if (probe_acpi_namespace_devices()) 4327 pr_warn("ACPI name space devices didn't probe correctly\n"); 4328 4329 /* Finally, we enable the DMA remapping hardware. */ 4330 for_each_iommu(iommu, drhd) { 4331 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4332 iommu_enable_translation(iommu); 4333 4334 iommu_disable_protect_mem_regions(iommu); 4335 } 4336 up_read(&dmar_global_lock); 4337 4338 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4339 4340 intel_iommu_enabled = 1; 4341 4342 return 0; 4343 4344 out_free_dmar: 4345 intel_iommu_free_dmars(); 4346 up_write(&dmar_global_lock); 4347 iommu_exit_mempool(); 4348 return ret; 4349 } 4350 4351 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4352 { 4353 struct intel_iommu *iommu = opaque; 4354 4355 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 4356 return 0; 4357 } 4358 4359 /* 4360 * NB - intel-iommu lacks any sort of reference counting for the users of 4361 * dependent devices. If multiple endpoints have intersecting dependent 4362 * devices, unbinding the driver from any one of them will possibly leave 4363 * the others unable to operate. 
4364 */ 4365 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 4366 { 4367 if (!iommu || !dev || !dev_is_pci(dev)) 4368 return; 4369 4370 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 4371 } 4372 4373 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4374 { 4375 struct dmar_domain *domain; 4376 struct intel_iommu *iommu; 4377 unsigned long flags; 4378 4379 assert_spin_locked(&device_domain_lock); 4380 4381 if (WARN_ON(!info)) 4382 return; 4383 4384 iommu = info->iommu; 4385 domain = info->domain; 4386 4387 if (info->dev) { 4388 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4389 intel_pasid_tear_down_entry(iommu, info->dev, 4390 PASID_RID2PASID, false); 4391 4392 iommu_disable_dev_iotlb(info); 4393 if (!dev_is_real_dma_subdevice(info->dev)) 4394 domain_context_clear(iommu, info->dev); 4395 intel_pasid_free_table(info->dev); 4396 } 4397 4398 unlink_domain_info(info); 4399 4400 spin_lock_irqsave(&iommu->lock, flags); 4401 domain_detach_iommu(domain, iommu); 4402 spin_unlock_irqrestore(&iommu->lock, flags); 4403 4404 free_devinfo_mem(info); 4405 } 4406 4407 static void dmar_remove_one_dev_info(struct device *dev) 4408 { 4409 struct device_domain_info *info; 4410 unsigned long flags; 4411 4412 spin_lock_irqsave(&device_domain_lock, flags); 4413 info = get_domain_info(dev); 4414 if (info) 4415 __dmar_remove_one_dev_info(info); 4416 spin_unlock_irqrestore(&device_domain_lock, flags); 4417 } 4418 4419 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4420 { 4421 int adjust_width; 4422 4423 /* calculate AGAW */ 4424 domain->gaw = guest_width; 4425 adjust_width = guestwidth_to_adjustwidth(guest_width); 4426 domain->agaw = width_to_agaw(adjust_width); 4427 4428 domain->iommu_coherency = 0; 4429 domain->iommu_snooping = 0; 4430 domain->iommu_superpage = 0; 4431 domain->max_addr = 0; 4432 4433 /* always allocate the top pgd */ 4434 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 4435 if (!domain->pgd) 4436 return -ENOMEM; 4437 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4438 return 0; 4439 } 4440 4441 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4442 { 4443 struct dmar_domain *dmar_domain; 4444 struct iommu_domain *domain; 4445 4446 switch (type) { 4447 case IOMMU_DOMAIN_DMA: 4448 case IOMMU_DOMAIN_UNMANAGED: 4449 dmar_domain = alloc_domain(0); 4450 if (!dmar_domain) { 4451 pr_err("Can't allocate dmar_domain\n"); 4452 return NULL; 4453 } 4454 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4455 pr_err("Domain initialization failed\n"); 4456 domain_exit(dmar_domain); 4457 return NULL; 4458 } 4459 4460 if (type == IOMMU_DOMAIN_DMA && 4461 iommu_get_dma_cookie(&dmar_domain->domain)) 4462 return NULL; 4463 4464 domain = &dmar_domain->domain; 4465 domain->geometry.aperture_start = 0; 4466 domain->geometry.aperture_end = 4467 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4468 domain->geometry.force_aperture = true; 4469 4470 return domain; 4471 case IOMMU_DOMAIN_IDENTITY: 4472 return &si_domain->domain; 4473 default: 4474 return NULL; 4475 } 4476 4477 return NULL; 4478 } 4479 4480 static void intel_iommu_domain_free(struct iommu_domain *domain) 4481 { 4482 if (domain != &si_domain->domain) 4483 domain_exit(to_dmar_domain(domain)); 4484 } 4485 4486 /* 4487 * Check whether a @domain could be attached to the @dev through the 4488 * aux-domain attach/detach APIs. 
 */
static inline bool
is_aux_domain(struct device *dev, struct iommu_domain *domain)
{
	struct device_domain_info *info = get_domain_info(dev);

	return info && info->auxd_enabled &&
			domain->type == IOMMU_DOMAIN_UNMANAGED;
}

static inline struct subdev_domain_info *
lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
{
	struct subdev_domain_info *sinfo;

	if (!list_empty(&domain->subdevices)) {
		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
			if (sinfo->pdev == dev)
				return sinfo;
		}
	}

	return NULL;
}

static int auxiliary_link_device(struct dmar_domain *domain,
				 struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);
	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return -EINVAL;

	if (!sinfo) {
		/* GFP_ATOMIC allocation can fail; don't dereference NULL below. */
		sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
		if (!sinfo)
			return -ENOMEM;
		sinfo->domain = domain;
		sinfo->pdev = dev;
		list_add(&sinfo->link_phys, &info->subdevices);
		list_add(&sinfo->link_domain, &domain->subdevices);
	}

	return ++sinfo->users;
}

static int auxiliary_unlink_device(struct dmar_domain *domain,
				   struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);
	struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
	int ret;

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
		return -EINVAL;

	ret = --sinfo->users;
	if (!ret) {
		list_del(&sinfo->link_phys);
		list_del(&sinfo->link_domain);
		kfree(sinfo);
	}

	return ret;
}

static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		u32 pasid;

		/* No private data needed for the default pasid */
		pasid = ioasid_alloc(NULL, PASID_MIN,
				     pci_max_pasids(to_pci_dev(dev)) - 1,
				     NULL);
		if (pasid == INVALID_IOASID) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	ret = auxiliary_link_device(domain, dev);
	if (ret <= 0)
		goto link_failed;

	/*
	 * Subdevices from the same physical device can be attached to the
	 * same domain. For such cases, only the first subdevice attachment
	 * needs to go through the full steps in this function. So if ret >
	 * 1, just goto out.
	 */
	if (ret > 1)
		goto out;

	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
4598 */ 4599 spin_lock(&iommu->lock); 4600 ret = domain_attach_iommu(domain, iommu); 4601 if (ret) 4602 goto attach_failed; 4603 4604 /* Setup the PASID entry for mediated devices: */ 4605 if (domain_use_first_level(domain)) 4606 ret = domain_setup_first_level(iommu, domain, dev, 4607 domain->default_pasid); 4608 else 4609 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4610 domain->default_pasid); 4611 if (ret) 4612 goto table_failed; 4613 4614 spin_unlock(&iommu->lock); 4615 out: 4616 spin_unlock_irqrestore(&device_domain_lock, flags); 4617 4618 return 0; 4619 4620 table_failed: 4621 domain_detach_iommu(domain, iommu); 4622 attach_failed: 4623 spin_unlock(&iommu->lock); 4624 auxiliary_unlink_device(domain, dev); 4625 link_failed: 4626 spin_unlock_irqrestore(&device_domain_lock, flags); 4627 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4628 ioasid_put(domain->default_pasid); 4629 4630 return ret; 4631 } 4632 4633 static void aux_domain_remove_dev(struct dmar_domain *domain, 4634 struct device *dev) 4635 { 4636 struct device_domain_info *info; 4637 struct intel_iommu *iommu; 4638 unsigned long flags; 4639 4640 if (!is_aux_domain(dev, &domain->domain)) 4641 return; 4642 4643 spin_lock_irqsave(&device_domain_lock, flags); 4644 info = get_domain_info(dev); 4645 iommu = info->iommu; 4646 4647 if (!auxiliary_unlink_device(domain, dev)) { 4648 spin_lock(&iommu->lock); 4649 intel_pasid_tear_down_entry(iommu, dev, 4650 domain->default_pasid, false); 4651 domain_detach_iommu(domain, iommu); 4652 spin_unlock(&iommu->lock); 4653 } 4654 4655 spin_unlock_irqrestore(&device_domain_lock, flags); 4656 4657 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4658 ioasid_put(domain->default_pasid); 4659 } 4660 4661 static int prepare_domain_attach_device(struct iommu_domain *domain, 4662 struct device *dev) 4663 { 4664 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4665 struct intel_iommu *iommu; 4666 int addr_width; 4667 4668 iommu = device_to_iommu(dev, NULL, NULL); 4669 if (!iommu) 4670 return -ENODEV; 4671 4672 /* check if this iommu agaw is sufficient for max mapped address */ 4673 addr_width = agaw_to_width(iommu->agaw); 4674 if (addr_width > cap_mgaw(iommu->cap)) 4675 addr_width = cap_mgaw(iommu->cap); 4676 4677 if (dmar_domain->max_addr > (1LL << addr_width)) { 4678 dev_err(dev, "%s: iommu width (%d) is not " 4679 "sufficient for the mapped address (%llx)\n", 4680 __func__, addr_width, dmar_domain->max_addr); 4681 return -EFAULT; 4682 } 4683 dmar_domain->gaw = addr_width; 4684 4685 /* 4686 * Knock out extra levels of page tables if necessary 4687 */ 4688 while (iommu->agaw < dmar_domain->agaw) { 4689 struct dma_pte *pte; 4690 4691 pte = dmar_domain->pgd; 4692 if (dma_pte_present(pte)) { 4693 dmar_domain->pgd = (struct dma_pte *) 4694 phys_to_virt(dma_pte_addr(pte)); 4695 free_pgtable_page(pte); 4696 } 4697 dmar_domain->agaw--; 4698 } 4699 4700 return 0; 4701 } 4702 4703 static int intel_iommu_attach_device(struct iommu_domain *domain, 4704 struct device *dev) 4705 { 4706 int ret; 4707 4708 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4709 device_is_rmrr_locked(dev)) { 4710 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 4711 return -EPERM; 4712 } 4713 4714 if (is_aux_domain(dev, domain)) 4715 return -EPERM; 4716 4717 /* normally dev is not mapped */ 4718 if (unlikely(domain_context_mapped(dev))) { 4719 struct dmar_domain *old_domain; 4720 4721 old_domain = find_domain(dev); 4722 if (old_domain) 4723 dmar_remove_one_dev_info(dev); 4724 } 4725 4726 ret = prepare_domain_attach_device(domain, dev); 4727 if (ret) 4728 return ret; 4729 4730 return domain_add_dev_info(to_dmar_domain(domain), dev); 4731 } 4732 4733 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4734 struct device *dev) 4735 { 4736 int ret; 4737 4738 if (!is_aux_domain(dev, domain)) 4739 return -EPERM; 4740 4741 ret = prepare_domain_attach_device(domain, dev); 4742 if (ret) 4743 return ret; 4744 4745 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4746 } 4747 4748 static void intel_iommu_detach_device(struct iommu_domain *domain, 4749 struct device *dev) 4750 { 4751 dmar_remove_one_dev_info(dev); 4752 } 4753 4754 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4755 struct device *dev) 4756 { 4757 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4758 } 4759 4760 #ifdef CONFIG_INTEL_IOMMU_SVM 4761 /* 4762 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4763 * VT-d granularity. Invalidation is typically included in the unmap operation 4764 * as a result of DMA or VFIO unmap. However, for assigned devices guest 4765 * owns the first level page tables. Invalidations of translation caches in the 4766 * guest are trapped and passed down to the host. 4767 * 4768 * vIOMMU in the guest will only expose first level page tables, therefore 4769 * we do not support IOTLB granularity for request without PASID (second level). 4770 * 4771 * For example, to find the VT-d granularity encoding for IOTLB 4772 * type and page selective granularity within PASID: 4773 * X: indexed by iommu cache type 4774 * Y: indexed by enum iommu_inv_granularity 4775 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4776 */ 4777 4778 static const int 4779 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4780 /* 4781 * PASID based IOTLB invalidation: PASID selective (per PASID), 4782 * page selective (address granularity) 4783 */ 4784 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4785 /* PASID based dev TLBs */ 4786 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4787 /* PASID cache */ 4788 {-EINVAL, -EINVAL, -EINVAL} 4789 }; 4790 4791 static inline int to_vtd_granularity(int type, int granu) 4792 { 4793 return inv_type_granu_table[type][granu]; 4794 } 4795 4796 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4797 { 4798 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 4799 4800 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 4801 * IOMMU cache invalidate API passes granu_size in bytes, and number of 4802 * granu size in contiguous memory. 
4803 */ 4804 return order_base_2(nr_pages); 4805 } 4806 4807 static int 4808 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 4809 struct iommu_cache_invalidate_info *inv_info) 4810 { 4811 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4812 struct device_domain_info *info; 4813 struct intel_iommu *iommu; 4814 unsigned long flags; 4815 int cache_type; 4816 u8 bus, devfn; 4817 u16 did, sid; 4818 int ret = 0; 4819 u64 size = 0; 4820 4821 if (!inv_info || !dmar_domain) 4822 return -EINVAL; 4823 4824 if (!dev || !dev_is_pci(dev)) 4825 return -ENODEV; 4826 4827 iommu = device_to_iommu(dev, &bus, &devfn); 4828 if (!iommu) 4829 return -ENODEV; 4830 4831 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 4832 return -EINVAL; 4833 4834 spin_lock_irqsave(&device_domain_lock, flags); 4835 spin_lock(&iommu->lock); 4836 info = get_domain_info(dev); 4837 if (!info) { 4838 ret = -EINVAL; 4839 goto out_unlock; 4840 } 4841 did = dmar_domain->iommu_did[iommu->seq_id]; 4842 sid = PCI_DEVID(bus, devfn); 4843 4844 /* Size is only valid in address selective invalidation */ 4845 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 4846 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 4847 inv_info->granu.addr_info.nb_granules); 4848 4849 for_each_set_bit(cache_type, 4850 (unsigned long *)&inv_info->cache, 4851 IOMMU_CACHE_INV_TYPE_NR) { 4852 int granu = 0; 4853 u64 pasid = 0; 4854 u64 addr = 0; 4855 4856 granu = to_vtd_granularity(cache_type, inv_info->granularity); 4857 if (granu == -EINVAL) { 4858 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 4859 cache_type, inv_info->granularity); 4860 break; 4861 } 4862 4863 /* 4864 * PASID is stored in different locations based on the 4865 * granularity. 4866 */ 4867 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 4868 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 4869 pasid = inv_info->granu.pasid_info.pasid; 4870 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4871 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 4872 pasid = inv_info->granu.addr_info.pasid; 4873 4874 switch (BIT(cache_type)) { 4875 case IOMMU_CACHE_INV_TYPE_IOTLB: 4876 /* HW will ignore LSB bits based on address mask */ 4877 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4878 size && 4879 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 4880 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 4881 inv_info->granu.addr_info.addr, size); 4882 } 4883 4884 /* 4885 * If granu is PASID-selective, address is ignored. 4886 * We use npages = -1 to indicate that. 4887 */ 4888 qi_flush_piotlb(iommu, did, pasid, 4889 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 4890 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 4891 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 4892 4893 if (!info->ats_enabled) 4894 break; 4895 /* 4896 * Always flush device IOTLB if ATS is enabled. vIOMMU 4897 * in the guest may assume IOTLB flush is inclusive, 4898 * which is more efficient. 4899 */ 4900 fallthrough; 4901 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 4902 /* 4903 * PASID based device TLB invalidation does not support 4904 * IOMMU_INV_GRANU_PASID granularity but only supports 4905 * IOMMU_INV_GRANU_ADDR. 4906 * The equivalent of that is we set the size to be the 4907 * entire range of 64 bit. User only provides PASID info 4908 * without address info. So we set addr to 0. 
4909 */ 4910 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 4911 size = 64 - VTD_PAGE_SHIFT; 4912 addr = 0; 4913 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 4914 addr = inv_info->granu.addr_info.addr; 4915 } 4916 4917 if (info->ats_enabled) 4918 qi_flush_dev_iotlb_pasid(iommu, sid, 4919 info->pfsid, pasid, 4920 info->ats_qdep, addr, 4921 size); 4922 else 4923 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 4924 break; 4925 default: 4926 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 4927 cache_type); 4928 ret = -EINVAL; 4929 } 4930 } 4931 out_unlock: 4932 spin_unlock(&iommu->lock); 4933 spin_unlock_irqrestore(&device_domain_lock, flags); 4934 4935 return ret; 4936 } 4937 #endif 4938 4939 static int intel_iommu_map(struct iommu_domain *domain, 4940 unsigned long iova, phys_addr_t hpa, 4941 size_t size, int iommu_prot, gfp_t gfp) 4942 { 4943 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4944 u64 max_addr; 4945 int prot = 0; 4946 int ret; 4947 4948 if (iommu_prot & IOMMU_READ) 4949 prot |= DMA_PTE_READ; 4950 if (iommu_prot & IOMMU_WRITE) 4951 prot |= DMA_PTE_WRITE; 4952 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 4953 prot |= DMA_PTE_SNP; 4954 4955 max_addr = iova + size; 4956 if (dmar_domain->max_addr < max_addr) { 4957 u64 end; 4958 4959 /* check if minimum agaw is sufficient for mapped address */ 4960 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 4961 if (end < max_addr) { 4962 pr_err("%s: iommu width (%d) is not " 4963 "sufficient for the mapped address (%llx)\n", 4964 __func__, dmar_domain->gaw, max_addr); 4965 return -EFAULT; 4966 } 4967 dmar_domain->max_addr = max_addr; 4968 } 4969 /* Round up size to next multiple of PAGE_SIZE, if it and 4970 the low bits of hpa would take us onto the next page */ 4971 size = aligned_nrpages(hpa, size); 4972 ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 4973 hpa >> VTD_PAGE_SHIFT, size, prot); 4974 return ret; 4975 } 4976 4977 static size_t intel_iommu_unmap(struct iommu_domain *domain, 4978 unsigned long iova, size_t size, 4979 struct iommu_iotlb_gather *gather) 4980 { 4981 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4982 unsigned long start_pfn, last_pfn; 4983 int level = 0; 4984 4985 /* Cope with horrid API which requires us to unmap more than the 4986 size argument if it happens to be a large-page mapping. 
*/ 4987 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 4988 4989 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 4990 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 4991 4992 start_pfn = iova >> VTD_PAGE_SHIFT; 4993 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 4994 4995 gather->freelist = domain_unmap(dmar_domain, start_pfn, 4996 last_pfn, gather->freelist); 4997 4998 if (dmar_domain->max_addr == iova + size) 4999 dmar_domain->max_addr = iova; 5000 5001 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5002 5003 return size; 5004 } 5005 5006 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5007 struct iommu_iotlb_gather *gather) 5008 { 5009 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5010 unsigned long iova_pfn = IOVA_PFN(gather->start); 5011 size_t size = gather->end - gather->start; 5012 unsigned long start_pfn; 5013 unsigned long nrpages; 5014 int iommu_id; 5015 5016 nrpages = aligned_nrpages(gather->start, size); 5017 start_pfn = mm_to_dma_pfn(iova_pfn); 5018 5019 for_each_domain_iommu(iommu_id, dmar_domain) 5020 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5021 start_pfn, nrpages, !gather->freelist, 0); 5022 5023 dma_free_pagelist(gather->freelist); 5024 } 5025 5026 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5027 dma_addr_t iova) 5028 { 5029 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5030 struct dma_pte *pte; 5031 int level = 0; 5032 u64 phys = 0; 5033 5034 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5035 if (pte && dma_pte_present(pte)) 5036 phys = dma_pte_addr(pte) + 5037 (iova & (BIT_MASK(level_to_offset_bits(level) + 5038 VTD_PAGE_SHIFT) - 1)); 5039 5040 return phys; 5041 } 5042 5043 static inline bool scalable_mode_support(void) 5044 { 5045 struct dmar_drhd_unit *drhd; 5046 struct intel_iommu *iommu; 5047 bool ret = true; 5048 5049 rcu_read_lock(); 5050 for_each_active_iommu(iommu, drhd) { 5051 if (!sm_supported(iommu)) { 5052 ret = false; 5053 break; 5054 } 5055 } 5056 rcu_read_unlock(); 5057 5058 return ret; 5059 } 5060 5061 static inline bool iommu_pasid_support(void) 5062 { 5063 struct dmar_drhd_unit *drhd; 5064 struct intel_iommu *iommu; 5065 bool ret = true; 5066 5067 rcu_read_lock(); 5068 for_each_active_iommu(iommu, drhd) { 5069 if (!pasid_supported(iommu)) { 5070 ret = false; 5071 break; 5072 } 5073 } 5074 rcu_read_unlock(); 5075 5076 return ret; 5077 } 5078 5079 static inline bool nested_mode_support(void) 5080 { 5081 struct dmar_drhd_unit *drhd; 5082 struct intel_iommu *iommu; 5083 bool ret = true; 5084 5085 rcu_read_lock(); 5086 for_each_active_iommu(iommu, drhd) { 5087 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5088 ret = false; 5089 break; 5090 } 5091 } 5092 rcu_read_unlock(); 5093 5094 return ret; 5095 } 5096 5097 static bool intel_iommu_capable(enum iommu_cap cap) 5098 { 5099 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5100 return domain_update_iommu_snooping(NULL) == 1; 5101 if (cap == IOMMU_CAP_INTR_REMAP) 5102 return irq_remapping_enabled == 1; 5103 5104 return false; 5105 } 5106 5107 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5108 { 5109 struct intel_iommu *iommu; 5110 5111 iommu = device_to_iommu(dev, NULL, NULL); 5112 if (!iommu) 5113 return ERR_PTR(-ENODEV); 5114 5115 if (translation_pre_enabled(iommu)) 5116 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5117 5118 return &iommu->iommu; 5119 } 5120 5121 static void intel_iommu_release_device(struct device *dev) 5122 { 
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return;

	dmar_remove_one_dev_info(dev);

	set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	if (domain && domain->type == IOMMU_DOMAIN_DMA)
		iommu_setup_dma_ops(dev, base,
				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
	else
		set_dma_ops(dev, NULL);
}

static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

static struct iommu_group
*intel_iommu_device_group(struct device *dev) 5256 { 5257 if (dev_is_pci(dev)) 5258 return pci_device_group(dev); 5259 return generic_device_group(dev); 5260 } 5261 5262 static int intel_iommu_enable_auxd(struct device *dev) 5263 { 5264 struct device_domain_info *info; 5265 struct intel_iommu *iommu; 5266 unsigned long flags; 5267 int ret; 5268 5269 iommu = device_to_iommu(dev, NULL, NULL); 5270 if (!iommu || dmar_disabled) 5271 return -EINVAL; 5272 5273 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5274 return -EINVAL; 5275 5276 ret = intel_iommu_enable_pasid(iommu, dev); 5277 if (ret) 5278 return -ENODEV; 5279 5280 spin_lock_irqsave(&device_domain_lock, flags); 5281 info = get_domain_info(dev); 5282 info->auxd_enabled = 1; 5283 spin_unlock_irqrestore(&device_domain_lock, flags); 5284 5285 return 0; 5286 } 5287 5288 static int intel_iommu_disable_auxd(struct device *dev) 5289 { 5290 struct device_domain_info *info; 5291 unsigned long flags; 5292 5293 spin_lock_irqsave(&device_domain_lock, flags); 5294 info = get_domain_info(dev); 5295 if (!WARN_ON(!info)) 5296 info->auxd_enabled = 0; 5297 spin_unlock_irqrestore(&device_domain_lock, flags); 5298 5299 return 0; 5300 } 5301 5302 /* 5303 * A PCI express designated vendor specific extended capability is defined 5304 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5305 * for system software and tools to detect endpoint devices supporting the 5306 * Intel scalable IO virtualization without host driver dependency. 5307 * 5308 * Returns the address of the matching extended capability structure within 5309 * the device's PCI configuration space or 0 if the device does not support 5310 * it. 5311 */ 5312 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5313 { 5314 int pos; 5315 u16 vendor, id; 5316 5317 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5318 while (pos) { 5319 pci_read_config_word(pdev, pos + 4, &vendor); 5320 pci_read_config_word(pdev, pos + 8, &id); 5321 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5322 return pos; 5323 5324 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5325 } 5326 5327 return 0; 5328 } 5329 5330 static bool 5331 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5332 { 5333 if (feat == IOMMU_DEV_FEAT_AUX) { 5334 int ret; 5335 5336 if (!dev_is_pci(dev) || dmar_disabled || 5337 !scalable_mode_support() || !iommu_pasid_support()) 5338 return false; 5339 5340 ret = pci_pasid_features(to_pci_dev(dev)); 5341 if (ret < 0) 5342 return false; 5343 5344 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5345 } 5346 5347 if (feat == IOMMU_DEV_FEAT_SVA) { 5348 struct device_domain_info *info = get_domain_info(dev); 5349 5350 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5351 info->pasid_supported && info->pri_supported && 5352 info->ats_supported; 5353 } 5354 5355 return false; 5356 } 5357 5358 static int 5359 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5360 { 5361 if (feat == IOMMU_DEV_FEAT_AUX) 5362 return intel_iommu_enable_auxd(dev); 5363 5364 if (feat == IOMMU_DEV_FEAT_SVA) { 5365 struct device_domain_info *info = get_domain_info(dev); 5366 5367 if (!info) 5368 return -EINVAL; 5369 5370 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5371 return 0; 5372 } 5373 5374 return -ENODEV; 5375 } 5376 5377 static int 5378 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5379 { 5380 if (feat == IOMMU_DEV_FEAT_AUX) 5381 return intel_iommu_disable_auxd(dev); 5382 5383 return -ENODEV; 5384 } 
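/*
 * Editor's sketch (illustrative only, kept out of the build with #if 0):
 * how the Scalable IOV DVSEC lookup above could be combined with the PASID
 * capability check, mirroring the IOMMU_DEV_FEAT_AUX test in
 * intel_iommu_dev_has_feat(). The helper name siov_capable() is hypothetical
 * and not part of this driver; pci_pasid_features() and siov_find_pci_dvsec()
 * are the interfaces already used above.
 */
#if 0
static bool siov_capable(struct pci_dev *pdev)
{
	/* The device must advertise PASID support ... */
	if (pci_pasid_features(pdev) < 0)
		return false;

	/* ... and carry the Intel Scalable IOV DVSEC (DVSEC ID 5). */
	return siov_find_pci_dvsec(pdev) != 0;
}
#endif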
5385 5386 static bool 5387 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5388 { 5389 struct device_domain_info *info = get_domain_info(dev); 5390 5391 if (feat == IOMMU_DEV_FEAT_AUX) 5392 return scalable_mode_support() && info && info->auxd_enabled; 5393 5394 return false; 5395 } 5396 5397 static int 5398 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5399 { 5400 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5401 5402 return dmar_domain->default_pasid > 0 ? 5403 dmar_domain->default_pasid : -EINVAL; 5404 } 5405 5406 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5407 struct device *dev) 5408 { 5409 return attach_deferred(dev); 5410 } 5411 5412 static int 5413 intel_iommu_domain_set_attr(struct iommu_domain *domain, 5414 enum iommu_attr attr, void *data) 5415 { 5416 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5417 unsigned long flags; 5418 int ret = 0; 5419 5420 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 5421 return -EINVAL; 5422 5423 switch (attr) { 5424 case DOMAIN_ATTR_NESTING: 5425 spin_lock_irqsave(&device_domain_lock, flags); 5426 if (nested_mode_support() && 5427 list_empty(&dmar_domain->devices)) { 5428 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5429 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5430 } else { 5431 ret = -ENODEV; 5432 } 5433 spin_unlock_irqrestore(&device_domain_lock, flags); 5434 break; 5435 default: 5436 ret = -EINVAL; 5437 break; 5438 } 5439 5440 return ret; 5441 } 5442 5443 static bool domain_use_flush_queue(void) 5444 { 5445 struct dmar_drhd_unit *drhd; 5446 struct intel_iommu *iommu; 5447 bool r = true; 5448 5449 if (intel_iommu_strict) 5450 return false; 5451 5452 /* 5453 * The flush queue implementation does not perform page-selective 5454 * invalidations that are required for efficient TLB flushes in virtual 5455 * environments. The benefit of batching is likely to be much lower than 5456 * the overhead of synchronizing the virtual and physical IOMMU 5457 * page-tables. 5458 */ 5459 rcu_read_lock(); 5460 for_each_active_iommu(iommu, drhd) { 5461 if (!cap_caching_mode(iommu->cap)) 5462 continue; 5463 5464 pr_warn_once("IOMMU batching is disabled due to virtualization"); 5465 r = false; 5466 break; 5467 } 5468 rcu_read_unlock(); 5469 5470 return r; 5471 } 5472 5473 static int 5474 intel_iommu_domain_get_attr(struct iommu_domain *domain, 5475 enum iommu_attr attr, void *data) 5476 { 5477 switch (domain->type) { 5478 case IOMMU_DOMAIN_UNMANAGED: 5479 return -ENODEV; 5480 case IOMMU_DOMAIN_DMA: 5481 switch (attr) { 5482 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: 5483 *(int *)data = domain_use_flush_queue(); 5484 return 0; 5485 default: 5486 return -ENODEV; 5487 } 5488 break; 5489 default: 5490 return -EINVAL; 5491 } 5492 } 5493 5494 /* 5495 * Check that the device does not live on an external facing PCI port that is 5496 * marked as untrusted. Such devices should not be able to apply quirks and 5497 * thus not be able to bypass the IOMMU restrictions. 
5498 */ 5499 static bool risky_device(struct pci_dev *pdev) 5500 { 5501 if (pdev->untrusted) { 5502 pci_info(pdev, 5503 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5504 pdev->vendor, pdev->device); 5505 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5506 return true; 5507 } 5508 return false; 5509 } 5510 5511 const struct iommu_ops intel_iommu_ops = { 5512 .capable = intel_iommu_capable, 5513 .domain_alloc = intel_iommu_domain_alloc, 5514 .domain_free = intel_iommu_domain_free, 5515 .domain_get_attr = intel_iommu_domain_get_attr, 5516 .domain_set_attr = intel_iommu_domain_set_attr, 5517 .attach_dev = intel_iommu_attach_device, 5518 .detach_dev = intel_iommu_detach_device, 5519 .aux_attach_dev = intel_iommu_aux_attach_device, 5520 .aux_detach_dev = intel_iommu_aux_detach_device, 5521 .aux_get_pasid = intel_iommu_aux_get_pasid, 5522 .map = intel_iommu_map, 5523 .unmap = intel_iommu_unmap, 5524 .flush_iotlb_all = intel_flush_iotlb_all, 5525 .iotlb_sync = intel_iommu_tlb_sync, 5526 .iova_to_phys = intel_iommu_iova_to_phys, 5527 .probe_device = intel_iommu_probe_device, 5528 .probe_finalize = intel_iommu_probe_finalize, 5529 .release_device = intel_iommu_release_device, 5530 .get_resv_regions = intel_iommu_get_resv_regions, 5531 .put_resv_regions = generic_iommu_put_resv_regions, 5532 .device_group = intel_iommu_device_group, 5533 .dev_has_feat = intel_iommu_dev_has_feat, 5534 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5535 .dev_enable_feat = intel_iommu_dev_enable_feat, 5536 .dev_disable_feat = intel_iommu_dev_disable_feat, 5537 .is_attach_deferred = intel_iommu_is_attach_deferred, 5538 .def_domain_type = device_def_domain_type, 5539 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 5540 #ifdef CONFIG_INTEL_IOMMU_SVM 5541 .cache_invalidate = intel_iommu_sva_invalidate, 5542 .sva_bind_gpasid = intel_svm_bind_gpasid, 5543 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5544 .sva_bind = intel_svm_bind, 5545 .sva_unbind = intel_svm_unbind, 5546 .sva_get_pasid = intel_svm_get_pasid, 5547 .page_response = intel_svm_page_response, 5548 #endif 5549 }; 5550 5551 static void quirk_iommu_igfx(struct pci_dev *dev) 5552 { 5553 if (risky_device(dev)) 5554 return; 5555 5556 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5557 dmar_map_gfx = 0; 5558 } 5559 5560 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 5561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5562 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5563 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5564 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5565 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5566 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5567 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5568 5569 /* Broadwell igfx malfunctions with dmar */ 5570 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5571 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5573 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5574 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5575 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5576 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5577 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5578 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5582 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5583 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5584 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5585 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5586 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5587 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5588 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5589 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5590 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5591 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5592 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5593 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5594 5595 static void quirk_iommu_rwbf(struct pci_dev *dev) 5596 { 5597 if (risky_device(dev)) 5598 return; 5599 5600 /* 5601 * Mobile 4 Series Chipset neglects to set RWBF capability, 5602 * but needs it. Same seems to hold for the desktop versions. 
5603 */ 5604 pci_info(dev, "Forcing write-buffer flush capability\n"); 5605 rwbf_quirk = 1; 5606 } 5607 5608 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5609 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5610 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5611 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5612 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5613 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5615 5616 #define GGC 0x52 5617 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5618 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5619 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5620 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5621 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5622 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5623 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5624 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5625 5626 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5627 { 5628 unsigned short ggc; 5629 5630 if (risky_device(dev)) 5631 return; 5632 5633 if (pci_read_config_word(dev, GGC, &ggc)) 5634 return; 5635 5636 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5637 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5638 dmar_map_gfx = 0; 5639 } else if (dmar_map_gfx) { 5640 /* we have to ensure the gfx device is idle before we flush */ 5641 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5642 intel_iommu_strict = 1; 5643 } 5644 } 5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5649 5650 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5651 { 5652 unsigned short ver; 5653 5654 if (!IS_GFX_DEVICE(dev)) 5655 return; 5656 5657 ver = (dev->device >> 8) & 0xff; 5658 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5659 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5660 ver != 0x9a) 5661 return; 5662 5663 if (risky_device(dev)) 5664 return; 5665 5666 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5667 iommu_skip_te_disable = 1; 5668 } 5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5670 5671 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5672 ISOCH DMAR unit for the Azalia sound device, but not give it any 5673 TLB entries, which causes it to deadlock. Check for that. We do 5674 this in a function called from init_dmars(), instead of in a PCI 5675 quirk, because we don't want to print the obnoxious "BIOS broken" 5676 message if VT-d is actually disabled. 5677 */ 5678 static void __init check_tylersburg_isoch(void) 5679 { 5680 struct pci_dev *pdev; 5681 uint32_t vtisochctrl; 5682 5683 /* If there's no Azalia in the system anyway, forget it. */ 5684 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5685 if (!pdev) 5686 return; 5687 5688 if (risky_device(pdev)) { 5689 pci_dev_put(pdev); 5690 return; 5691 } 5692 5693 pci_dev_put(pdev); 5694 5695 /* System Management Registers. Might be hidden, in which case 5696 we can't do the sanity check. But that's OK, because the 5697 known-broken BIOSes _don't_ actually hide it, so far. 
*/ 5698 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5699 if (!pdev) 5700 return; 5701 5702 if (risky_device(pdev)) { 5703 pci_dev_put(pdev); 5704 return; 5705 } 5706 5707 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5708 pci_dev_put(pdev); 5709 return; 5710 } 5711 5712 pci_dev_put(pdev); 5713 5714 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5715 if (vtisochctrl & 1) 5716 return; 5717 5718 /* Drop all bits other than the number of TLB entries */ 5719 vtisochctrl &= 0x1c; 5720 5721 /* If we have the recommended number of TLB entries (16), fine. */ 5722 if (vtisochctrl == 0x10) 5723 return; 5724 5725 /* Zero TLB entries? You get to ride the short bus to school. */ 5726 if (!vtisochctrl) { 5727 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5728 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5729 dmi_get_system_info(DMI_BIOS_VENDOR), 5730 dmi_get_system_info(DMI_BIOS_VERSION), 5731 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5732 iommu_identity_mapping |= IDENTMAP_AZALIA; 5733 return; 5734 } 5735 5736 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5737 vtisochctrl); 5738 } 5739
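/*
 * Editor's sketch (illustrative only, kept out of the build with #if 0):
 * the VTISOCHCTRL test performed by check_tylersburg_isoch() above, pulled
 * out as a standalone predicate. The helper name is hypothetical; the bit
 * meanings are assumed from the checks in that function (bit 0 set means
 * Azalia DMA is routed to the non-isoch DMAR unit, and the value masked
 * with 0x1c is treated as the number of TLB entries, 16 being the
 * recommended count).
 */
#if 0
static bool tylersburg_azalia_isoch_ok(u32 vtisochctrl)
{
	/* Azalia routed to the non-isoch DMAR unit: nothing to check. */
	if (vtisochctrl & 1)
		return true;

	/* Drop all bits other than the TLB entry count, as above. */
	vtisochctrl &= 0x1c;

	/* Only the recommended 16 TLB entries is known to be safe. */
	return vtisochctrl == 0x10;
}
#endif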