// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "../irq_remapping.h"
#include "pasid.h"
#include "cap_audit.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
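
/*
 * For illustration: with INTEL_IOMMU_PGSIZES == ~0xFFFUL every bit above
 * bit 11 is set, so from the IOMMU core's point of view any power-of-two
 * size >= 4KiB is a "supported" page size.  A physically contiguous 2MiB
 * region can therefore arrive as a single map request of 2MiB; whether it
 * is installed as a real 2MiB superpage or as 512 4KiB leaves is decided
 * later against the hardware capabilities (see hardware_largepage_caps()).
 */

/*
 * Worked example for the AGAW helpers below: the VT-d "adjusted guest
 * address width" grows in LEVEL_STRIDE (9-bit) steps from a 30-bit base,
 * and each step adds one page-table level:
 *
 *	agaw 1 -> 39-bit width -> 3-level table
 *	agaw 2 -> 48-bit width -> 4-level table
 *	agaw 3 -> 57-bit width -> 5-level table
 *
 * e.g. width_to_agaw(48) == DIV_ROUND_UP(18, 9) == 2 and
 * agaw_to_level(2) == 4.  pfn_level_offset() then extracts the 9-bit
 * table index for a level: bits 0-8 of the VTD pfn at level 1, bits 9-17
 * at level 2, and so on.
 */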

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
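
/*
 * A rough sketch of the lookup that iommu_context_addr() implements
 * further below: the root table has 256 entries indexed by bus number.
 * In legacy mode root_entry.lo points to a 256-entry context table
 * indexed by devfn.  In scalable mode the lower and upper halves of the
 * root entry point to two context tables covering devfn 0x00-0x7f and
 * 0x80-0xff respectively, with two 128-bit context_entry slots per devfn
 * (hence the "devfn *= 2" there).
 */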

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
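
/*
 * For illustration, a command line such as
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * is handled one comma-separated token at a time by the loop above:
 * "on" clears dmar_disabled, "sm_on" sets intel_iommu_sm, and "igfx_off"
 * clears dmar_map_gfx; strcspn() plus the comma skip then advance to the
 * next token.  Unrecognized tokens are silently ignored.
 */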

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}

/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here.
*/ 604 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) 605 return NULL; 606 607 for_each_domain_iommu(iommu_id, domain) 608 break; 609 610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 611 return NULL; 612 613 return g_iommus[iommu_id]; 614 } 615 616 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 617 { 618 return sm_supported(iommu) ? 619 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 620 } 621 622 static void domain_update_iommu_coherency(struct dmar_domain *domain) 623 { 624 struct dmar_drhd_unit *drhd; 625 struct intel_iommu *iommu; 626 bool found = false; 627 int i; 628 629 domain->iommu_coherency = 1; 630 631 for_each_domain_iommu(i, domain) { 632 found = true; 633 if (!iommu_paging_structure_coherency(g_iommus[i])) { 634 domain->iommu_coherency = 0; 635 break; 636 } 637 } 638 if (found) 639 return; 640 641 /* No hardware attached; use lowest common denominator */ 642 rcu_read_lock(); 643 for_each_active_iommu(iommu, drhd) { 644 if (!iommu_paging_structure_coherency(iommu)) { 645 domain->iommu_coherency = 0; 646 break; 647 } 648 } 649 rcu_read_unlock(); 650 } 651 652 static int domain_update_iommu_snooping(struct intel_iommu *skip) 653 { 654 struct dmar_drhd_unit *drhd; 655 struct intel_iommu *iommu; 656 int ret = 1; 657 658 rcu_read_lock(); 659 for_each_active_iommu(iommu, drhd) { 660 if (iommu != skip) { 661 if (!ecap_sc_support(iommu->ecap)) { 662 ret = 0; 663 break; 664 } 665 } 666 } 667 rcu_read_unlock(); 668 669 return ret; 670 } 671 672 static int domain_update_iommu_superpage(struct dmar_domain *domain, 673 struct intel_iommu *skip) 674 { 675 struct dmar_drhd_unit *drhd; 676 struct intel_iommu *iommu; 677 int mask = 0x3; 678 679 if (!intel_iommu_superpage) { 680 return 0; 681 } 682 683 /* set iommu_superpage to the smallest common denominator */ 684 rcu_read_lock(); 685 for_each_active_iommu(iommu, drhd) { 686 if (iommu != skip) { 687 if (domain && domain_use_first_level(domain)) { 688 if (!cap_fl1gp_support(iommu->cap)) 689 mask = 0x1; 690 } else { 691 mask &= cap_super_page_val(iommu->cap); 692 } 693 694 if (!mask) 695 break; 696 } 697 } 698 rcu_read_unlock(); 699 700 return fls(mask); 701 } 702 703 static int domain_update_device_node(struct dmar_domain *domain) 704 { 705 struct device_domain_info *info; 706 int nid = NUMA_NO_NODE; 707 708 assert_spin_locked(&device_domain_lock); 709 710 if (list_empty(&domain->devices)) 711 return NUMA_NO_NODE; 712 713 list_for_each_entry(info, &domain->devices, link) { 714 if (!info->dev) 715 continue; 716 717 /* 718 * There could possibly be multiple device numa nodes as devices 719 * within the same domain may sit behind different IOMMUs. There 720 * isn't perfect answer in such situation, so we select first 721 * come first served policy. 722 */ 723 nid = dev_to_node(info->dev); 724 if (nid != NUMA_NO_NODE) 725 break; 726 } 727 728 return nid; 729 } 730 731 static void domain_update_iotlb(struct dmar_domain *domain); 732 733 /* Some capabilities may be different across iommus */ 734 static void domain_update_iommu_cap(struct dmar_domain *domain) 735 { 736 domain_update_iommu_coherency(domain); 737 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 738 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 739 740 /* 741 * If RHSA is missing, we should default to the device numa domain 742 * as fall back. 
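 * (domain_update_device_node() above picks the node of the first device
 * in the domain that reports one.)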
743 */ 744 if (domain->nid == NUMA_NO_NODE) 745 domain->nid = domain_update_device_node(domain); 746 747 /* 748 * First-level translation restricts the input-address to a 749 * canonical address (i.e., address bits 63:N have the same 750 * value as address bit [N-1], where N is 48-bits with 4-level 751 * paging and 57-bits with 5-level paging). Hence, skip bit 752 * [N-1]. 753 */ 754 if (domain_use_first_level(domain)) 755 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); 756 else 757 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); 758 759 domain_update_iotlb(domain); 760 } 761 762 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 763 u8 devfn, int alloc) 764 { 765 struct root_entry *root = &iommu->root_entry[bus]; 766 struct context_entry *context; 767 u64 *entry; 768 769 entry = &root->lo; 770 if (sm_supported(iommu)) { 771 if (devfn >= 0x80) { 772 devfn -= 0x80; 773 entry = &root->hi; 774 } 775 devfn *= 2; 776 } 777 if (*entry & 1) 778 context = phys_to_virt(*entry & VTD_PAGE_MASK); 779 else { 780 unsigned long phy_addr; 781 if (!alloc) 782 return NULL; 783 784 context = alloc_pgtable_page(iommu->node); 785 if (!context) 786 return NULL; 787 788 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 789 phy_addr = virt_to_phys((void *)context); 790 *entry = phy_addr | 1; 791 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 792 } 793 return &context[devfn]; 794 } 795 796 static bool attach_deferred(struct device *dev) 797 { 798 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 799 } 800 801 /** 802 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 803 * sub-hierarchy of a candidate PCI-PCI bridge 804 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 805 * @bridge: the candidate PCI-PCI bridge 806 * 807 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 808 */ 809 static bool 810 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 811 { 812 struct pci_dev *pdev, *pbridge; 813 814 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 815 return false; 816 817 pdev = to_pci_dev(dev); 818 pbridge = to_pci_dev(bridge); 819 820 if (pbridge->subordinate && 821 pbridge->subordinate->number <= pdev->bus->number && 822 pbridge->subordinate->busn_res.end >= pdev->bus->number) 823 return true; 824 825 return false; 826 } 827 828 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 829 { 830 struct dmar_drhd_unit *drhd; 831 u32 vtbar; 832 int rc; 833 834 /* We know that this device on this chipset has its own IOMMU. 835 * If we find it under a different IOMMU, then the BIOS is lying 836 * to us. Hope that the IOMMU for this device is actually 837 * disabled, and it needs no translation... 
838 */ 839 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 840 if (rc) { 841 /* "can't" happen */ 842 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 843 return false; 844 } 845 vtbar &= 0xffff0000; 846 847 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 848 drhd = dmar_find_matched_drhd_unit(pdev); 849 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 850 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 851 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 852 return true; 853 } 854 855 return false; 856 } 857 858 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 859 { 860 if (!iommu || iommu->drhd->ignored) 861 return true; 862 863 if (dev_is_pci(dev)) { 864 struct pci_dev *pdev = to_pci_dev(dev); 865 866 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 867 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 868 quirk_ioat_snb_local_iommu(pdev)) 869 return true; 870 } 871 872 return false; 873 } 874 875 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 876 { 877 struct dmar_drhd_unit *drhd = NULL; 878 struct pci_dev *pdev = NULL; 879 struct intel_iommu *iommu; 880 struct device *tmp; 881 u16 segment = 0; 882 int i; 883 884 if (!dev) 885 return NULL; 886 887 if (dev_is_pci(dev)) { 888 struct pci_dev *pf_pdev; 889 890 pdev = pci_real_dma_dev(to_pci_dev(dev)); 891 892 /* VFs aren't listed in scope tables; we need to look up 893 * the PF instead to find the IOMMU. */ 894 pf_pdev = pci_physfn(pdev); 895 dev = &pf_pdev->dev; 896 segment = pci_domain_nr(pdev->bus); 897 } else if (has_acpi_companion(dev)) 898 dev = &ACPI_COMPANION(dev)->dev; 899 900 rcu_read_lock(); 901 for_each_iommu(iommu, drhd) { 902 if (pdev && segment != drhd->segment) 903 continue; 904 905 for_each_active_dev_scope(drhd->devices, 906 drhd->devices_cnt, i, tmp) { 907 if (tmp == dev) { 908 /* For a VF use its original BDF# not that of the PF 909 * which we used for the IOMMU lookup. Strictly speaking 910 * we could do this for all PCI devices; we only need to 911 * get the BDF# from the scope table for ACPI matches. 
*/ 912 if (pdev && pdev->is_virtfn) 913 goto got_pdev; 914 915 if (bus && devfn) { 916 *bus = drhd->devices[i].bus; 917 *devfn = drhd->devices[i].devfn; 918 } 919 goto out; 920 } 921 922 if (is_downstream_to_pci_bridge(dev, tmp)) 923 goto got_pdev; 924 } 925 926 if (pdev && drhd->include_all) { 927 got_pdev: 928 if (bus && devfn) { 929 *bus = pdev->bus->number; 930 *devfn = pdev->devfn; 931 } 932 goto out; 933 } 934 } 935 iommu = NULL; 936 out: 937 if (iommu_is_dummy(iommu, dev)) 938 iommu = NULL; 939 940 rcu_read_unlock(); 941 942 return iommu; 943 } 944 945 static void domain_flush_cache(struct dmar_domain *domain, 946 void *addr, int size) 947 { 948 if (!domain->iommu_coherency) 949 clflush_cache_range(addr, size); 950 } 951 952 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 953 { 954 struct context_entry *context; 955 int ret = 0; 956 unsigned long flags; 957 958 spin_lock_irqsave(&iommu->lock, flags); 959 context = iommu_context_addr(iommu, bus, devfn, 0); 960 if (context) 961 ret = context_present(context); 962 spin_unlock_irqrestore(&iommu->lock, flags); 963 return ret; 964 } 965 966 static void free_context_table(struct intel_iommu *iommu) 967 { 968 int i; 969 unsigned long flags; 970 struct context_entry *context; 971 972 spin_lock_irqsave(&iommu->lock, flags); 973 if (!iommu->root_entry) { 974 goto out; 975 } 976 for (i = 0; i < ROOT_ENTRY_NR; i++) { 977 context = iommu_context_addr(iommu, i, 0, 0); 978 if (context) 979 free_pgtable_page(context); 980 981 if (!sm_supported(iommu)) 982 continue; 983 984 context = iommu_context_addr(iommu, i, 0x80, 0); 985 if (context) 986 free_pgtable_page(context); 987 988 } 989 free_pgtable_page(iommu->root_entry); 990 iommu->root_entry = NULL; 991 out: 992 spin_unlock_irqrestore(&iommu->lock, flags); 993 } 994 995 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 996 unsigned long pfn, int *target_level) 997 { 998 struct dma_pte *parent, *pte; 999 int level = agaw_to_level(domain->agaw); 1000 int offset; 1001 1002 BUG_ON(!domain->pgd); 1003 1004 if (!domain_pfn_supported(domain, pfn)) 1005 /* Address beyond IOMMU's addressing capabilities. */ 1006 return NULL; 1007 1008 parent = domain->pgd; 1009 1010 while (1) { 1011 void *tmp_page; 1012 1013 offset = pfn_level_offset(pfn, level); 1014 pte = &parent[offset]; 1015 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 1016 break; 1017 if (level == *target_level) 1018 break; 1019 1020 if (!dma_pte_present(pte)) { 1021 uint64_t pteval; 1022 1023 tmp_page = alloc_pgtable_page(domain->nid); 1024 1025 if (!tmp_page) 1026 return NULL; 1027 1028 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 1029 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 1030 if (domain_use_first_level(domain)) { 1031 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 1032 if (domain->domain.type == IOMMU_DOMAIN_DMA) 1033 pteval |= DMA_FL_PTE_ACCESS; 1034 } 1035 if (cmpxchg64(&pte->val, 0ULL, pteval)) 1036 /* Someone else set it while we were thinking; use theirs. 
*/ 1037 free_pgtable_page(tmp_page); 1038 else 1039 domain_flush_cache(domain, pte, sizeof(*pte)); 1040 } 1041 if (level == 1) 1042 break; 1043 1044 parent = phys_to_virt(dma_pte_addr(pte)); 1045 level--; 1046 } 1047 1048 if (!*target_level) 1049 *target_level = level; 1050 1051 return pte; 1052 } 1053 1054 /* return address's pte at specific level */ 1055 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1056 unsigned long pfn, 1057 int level, int *large_page) 1058 { 1059 struct dma_pte *parent, *pte; 1060 int total = agaw_to_level(domain->agaw); 1061 int offset; 1062 1063 parent = domain->pgd; 1064 while (level <= total) { 1065 offset = pfn_level_offset(pfn, total); 1066 pte = &parent[offset]; 1067 if (level == total) 1068 return pte; 1069 1070 if (!dma_pte_present(pte)) { 1071 *large_page = total; 1072 break; 1073 } 1074 1075 if (dma_pte_superpage(pte)) { 1076 *large_page = total; 1077 return pte; 1078 } 1079 1080 parent = phys_to_virt(dma_pte_addr(pte)); 1081 total--; 1082 } 1083 return NULL; 1084 } 1085 1086 /* clear last level pte, a tlb flush should be followed */ 1087 static void dma_pte_clear_range(struct dmar_domain *domain, 1088 unsigned long start_pfn, 1089 unsigned long last_pfn) 1090 { 1091 unsigned int large_page; 1092 struct dma_pte *first_pte, *pte; 1093 1094 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1095 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1096 BUG_ON(start_pfn > last_pfn); 1097 1098 /* we don't need lock here; nobody else touches the iova range */ 1099 do { 1100 large_page = 1; 1101 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1102 if (!pte) { 1103 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1104 continue; 1105 } 1106 do { 1107 dma_clear_pte(pte); 1108 start_pfn += lvl_to_nr_pages(large_page); 1109 pte++; 1110 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1111 1112 domain_flush_cache(domain, first_pte, 1113 (void *)pte - (void *)first_pte); 1114 1115 } while (start_pfn && start_pfn <= last_pfn); 1116 } 1117 1118 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1119 int retain_level, struct dma_pte *pte, 1120 unsigned long pfn, unsigned long start_pfn, 1121 unsigned long last_pfn) 1122 { 1123 pfn = max(start_pfn, pfn); 1124 pte = &pte[pfn_level_offset(pfn, level)]; 1125 1126 do { 1127 unsigned long level_pfn; 1128 struct dma_pte *level_pte; 1129 1130 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1131 goto next; 1132 1133 level_pfn = pfn & level_mask(level); 1134 level_pte = phys_to_virt(dma_pte_addr(pte)); 1135 1136 if (level > 2) { 1137 dma_pte_free_level(domain, level - 1, retain_level, 1138 level_pte, level_pfn, start_pfn, 1139 last_pfn); 1140 } 1141 1142 /* 1143 * Free the page table if we're below the level we want to 1144 * retain and the range covers the entire table. 1145 */ 1146 if (level < retain_level && !(start_pfn > level_pfn || 1147 last_pfn < level_pfn + level_size(level) - 1)) { 1148 dma_clear_pte(pte); 1149 domain_flush_cache(domain, pte, sizeof(*pte)); 1150 free_pgtable_page(level_pte); 1151 } 1152 next: 1153 pfn += level_size(level); 1154 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1155 } 1156 1157 /* 1158 * clear last level (leaf) ptes and free page table pages below the 1159 * level we wish to keep intact. 
1160 */ 1161 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1162 unsigned long start_pfn, 1163 unsigned long last_pfn, 1164 int retain_level) 1165 { 1166 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1167 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1168 BUG_ON(start_pfn > last_pfn); 1169 1170 dma_pte_clear_range(domain, start_pfn, last_pfn); 1171 1172 /* We don't need lock here; nobody else touches the iova range */ 1173 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1174 domain->pgd, 0, start_pfn, last_pfn); 1175 1176 /* free pgd */ 1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1178 free_pgtable_page(domain->pgd); 1179 domain->pgd = NULL; 1180 } 1181 } 1182 1183 /* When a page at a given level is being unlinked from its parent, we don't 1184 need to *modify* it at all. All we need to do is make a list of all the 1185 pages which can be freed just as soon as we've flushed the IOTLB and we 1186 know the hardware page-walk will no longer touch them. 1187 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1188 be freed. */ 1189 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1190 int level, struct dma_pte *pte, 1191 struct page *freelist) 1192 { 1193 struct page *pg; 1194 1195 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1196 pg->freelist = freelist; 1197 freelist = pg; 1198 1199 if (level == 1) 1200 return freelist; 1201 1202 pte = page_address(pg); 1203 do { 1204 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1205 freelist = dma_pte_list_pagetables(domain, level - 1, 1206 pte, freelist); 1207 pte++; 1208 } while (!first_pte_in_page(pte)); 1209 1210 return freelist; 1211 } 1212 1213 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1214 struct dma_pte *pte, unsigned long pfn, 1215 unsigned long start_pfn, 1216 unsigned long last_pfn, 1217 struct page *freelist) 1218 { 1219 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1220 1221 pfn = max(start_pfn, pfn); 1222 pte = &pte[pfn_level_offset(pfn, level)]; 1223 1224 do { 1225 unsigned long level_pfn; 1226 1227 if (!dma_pte_present(pte)) 1228 goto next; 1229 1230 level_pfn = pfn & level_mask(level); 1231 1232 /* If range covers entire pagetable, free it */ 1233 if (start_pfn <= level_pfn && 1234 last_pfn >= level_pfn + level_size(level) - 1) { 1235 /* These suborbinate page tables are going away entirely. Don't 1236 bother to clear them; we're just going to *free* them. */ 1237 if (level > 1 && !dma_pte_superpage(pte)) 1238 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1239 1240 dma_clear_pte(pte); 1241 if (!first_pte) 1242 first_pte = pte; 1243 last_pte = pte; 1244 } else if (level > 1) { 1245 /* Recurse down into a level that isn't *entirely* obsolete */ 1246 freelist = dma_pte_clear_level(domain, level - 1, 1247 phys_to_virt(dma_pte_addr(pte)), 1248 level_pfn, start_pfn, last_pfn, 1249 freelist); 1250 } 1251 next: 1252 pfn += level_size(level); 1253 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1254 1255 if (first_pte) 1256 domain_flush_cache(domain, first_pte, 1257 (void *)++last_pte - (void *)first_pte); 1258 1259 return freelist; 1260 } 1261 1262 /* We can't just free the pages because the IOMMU may still be walking 1263 the page tables, and may have cached the intermediate levels. The 1264 pages can only be freed after the IOTLB flush has been done. 
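   In this file that means: collect the no-longer-referenced page-table
   pages on a freelist via domain_unmap()/dma_pte_clear_level(), perform
   the IOTLB flush, and only then hand the list to dma_free_pagelist().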
*/ 1265 static struct page *domain_unmap(struct dmar_domain *domain, 1266 unsigned long start_pfn, 1267 unsigned long last_pfn, 1268 struct page *freelist) 1269 { 1270 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1271 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1272 BUG_ON(start_pfn > last_pfn); 1273 1274 /* we don't need lock here; nobody else touches the iova range */ 1275 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1276 domain->pgd, 0, start_pfn, last_pfn, 1277 freelist); 1278 1279 /* free pgd */ 1280 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1281 struct page *pgd_page = virt_to_page(domain->pgd); 1282 pgd_page->freelist = freelist; 1283 freelist = pgd_page; 1284 1285 domain->pgd = NULL; 1286 } 1287 1288 return freelist; 1289 } 1290 1291 static void dma_free_pagelist(struct page *freelist) 1292 { 1293 struct page *pg; 1294 1295 while ((pg = freelist)) { 1296 freelist = pg->freelist; 1297 free_pgtable_page(page_address(pg)); 1298 } 1299 } 1300 1301 /* iommu handling */ 1302 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1303 { 1304 struct root_entry *root; 1305 unsigned long flags; 1306 1307 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1308 if (!root) { 1309 pr_err("Allocating root entry for %s failed\n", 1310 iommu->name); 1311 return -ENOMEM; 1312 } 1313 1314 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1315 1316 spin_lock_irqsave(&iommu->lock, flags); 1317 iommu->root_entry = root; 1318 spin_unlock_irqrestore(&iommu->lock, flags); 1319 1320 return 0; 1321 } 1322 1323 static void iommu_set_root_entry(struct intel_iommu *iommu) 1324 { 1325 u64 addr; 1326 u32 sts; 1327 unsigned long flag; 1328 1329 addr = virt_to_phys(iommu->root_entry); 1330 if (sm_supported(iommu)) 1331 addr |= DMA_RTADDR_SMT; 1332 1333 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1334 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1335 1336 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1337 1338 /* Make sure hardware complete it */ 1339 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1340 readl, (sts & DMA_GSTS_RTPS), sts); 1341 1342 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1343 } 1344 1345 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1346 { 1347 u32 val; 1348 unsigned long flag; 1349 1350 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1351 return; 1352 1353 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1354 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1355 1356 /* Make sure hardware complete it */ 1357 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1358 readl, (!(val & DMA_GSTS_WBFS)), val); 1359 1360 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1361 } 1362 1363 /* return value determine if we need a write buffer flush */ 1364 static void __iommu_flush_context(struct intel_iommu *iommu, 1365 u16 did, u16 source_id, u8 function_mask, 1366 u64 type) 1367 { 1368 u64 val = 0; 1369 unsigned long flag; 1370 1371 switch (type) { 1372 case DMA_CCMD_GLOBAL_INVL: 1373 val = DMA_CCMD_GLOBAL_INVL; 1374 break; 1375 case DMA_CCMD_DOMAIN_INVL: 1376 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1377 break; 1378 case DMA_CCMD_DEVICE_INVL: 1379 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1380 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1381 break; 1382 default: 1383 BUG(); 1384 } 1385 val |= DMA_CCMD_ICC; 1386 1387 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1388 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1389 1390 /* Make sure hardware complete it */ 1391 
IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1392 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1393 1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1395 } 1396 1397 /* return value determine if we need a write buffer flush */ 1398 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1399 u64 addr, unsigned int size_order, u64 type) 1400 { 1401 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1402 u64 val = 0, val_iva = 0; 1403 unsigned long flag; 1404 1405 switch (type) { 1406 case DMA_TLB_GLOBAL_FLUSH: 1407 /* global flush doesn't need set IVA_REG */ 1408 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1409 break; 1410 case DMA_TLB_DSI_FLUSH: 1411 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1412 break; 1413 case DMA_TLB_PSI_FLUSH: 1414 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1415 /* IH bit is passed in as part of address */ 1416 val_iva = size_order | addr; 1417 break; 1418 default: 1419 BUG(); 1420 } 1421 /* Note: set drain read/write */ 1422 #if 0 1423 /* 1424 * This is probably to be super secure.. Looks like we can 1425 * ignore it without any impact. 1426 */ 1427 if (cap_read_drain(iommu->cap)) 1428 val |= DMA_TLB_READ_DRAIN; 1429 #endif 1430 if (cap_write_drain(iommu->cap)) 1431 val |= DMA_TLB_WRITE_DRAIN; 1432 1433 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1434 /* Note: Only uses first TLB reg currently */ 1435 if (val_iva) 1436 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1437 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1438 1439 /* Make sure hardware complete it */ 1440 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1441 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1442 1443 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1444 1445 /* check IOTLB invalidation granularity */ 1446 if (DMA_TLB_IAIG(val) == 0) 1447 pr_err("Flush IOTLB failed\n"); 1448 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1449 pr_debug("TLB flush request %Lx, actual %Lx\n", 1450 (unsigned long long)DMA_TLB_IIRG(type), 1451 (unsigned long long)DMA_TLB_IAIG(val)); 1452 } 1453 1454 static struct device_domain_info * 1455 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1456 u8 bus, u8 devfn) 1457 { 1458 struct device_domain_info *info; 1459 1460 assert_spin_locked(&device_domain_lock); 1461 1462 if (!iommu->qi) 1463 return NULL; 1464 1465 list_for_each_entry(info, &domain->devices, link) 1466 if (info->iommu == iommu && info->bus == bus && 1467 info->devfn == devfn) { 1468 if (info->ats_supported && info->dev) 1469 return info; 1470 break; 1471 } 1472 1473 return NULL; 1474 } 1475 1476 static void domain_update_iotlb(struct dmar_domain *domain) 1477 { 1478 struct device_domain_info *info; 1479 bool has_iotlb_device = false; 1480 1481 assert_spin_locked(&device_domain_lock); 1482 1483 list_for_each_entry(info, &domain->devices, link) 1484 if (info->ats_enabled) { 1485 has_iotlb_device = true; 1486 break; 1487 } 1488 1489 if (!has_iotlb_device) { 1490 struct subdev_domain_info *sinfo; 1491 1492 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1493 info = get_domain_info(sinfo->pdev); 1494 if (info && info->ats_enabled) { 1495 has_iotlb_device = true; 1496 break; 1497 } 1498 } 1499 } 1500 1501 domain->has_iotlb_device = has_iotlb_device; 1502 } 1503 1504 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1505 { 1506 struct pci_dev *pdev; 1507 1508 assert_spin_locked(&device_domain_lock); 1509 1510 if (!info || !dev_is_pci(info->dev)) 1511 return; 1512 1513 pdev = to_pci_dev(info->dev); 1514 /* For IOMMU that 
supports device IOTLB throttling (DIT), we assign 1515 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1516 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1517 * reserved, which should be set to 0. 1518 */ 1519 if (!ecap_dit(info->iommu->ecap)) 1520 info->pfsid = 0; 1521 else { 1522 struct pci_dev *pf_pdev; 1523 1524 /* pdev will be returned if device is not a vf */ 1525 pf_pdev = pci_physfn(pdev); 1526 info->pfsid = pci_dev_id(pf_pdev); 1527 } 1528 1529 #ifdef CONFIG_INTEL_IOMMU_SVM 1530 /* The PCIe spec, in its wisdom, declares that the behaviour of 1531 the device if you enable PASID support after ATS support is 1532 undefined. So always enable PASID support on devices which 1533 have it, even if we can't yet know if we're ever going to 1534 use it. */ 1535 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1536 info->pasid_enabled = 1; 1537 1538 if (info->pri_supported && 1539 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1540 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) 1541 info->pri_enabled = 1; 1542 #endif 1543 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1544 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1545 info->ats_enabled = 1; 1546 domain_update_iotlb(info->domain); 1547 info->ats_qdep = pci_ats_queue_depth(pdev); 1548 } 1549 } 1550 1551 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1552 { 1553 struct pci_dev *pdev; 1554 1555 assert_spin_locked(&device_domain_lock); 1556 1557 if (!dev_is_pci(info->dev)) 1558 return; 1559 1560 pdev = to_pci_dev(info->dev); 1561 1562 if (info->ats_enabled) { 1563 pci_disable_ats(pdev); 1564 info->ats_enabled = 0; 1565 domain_update_iotlb(info->domain); 1566 } 1567 #ifdef CONFIG_INTEL_IOMMU_SVM 1568 if (info->pri_enabled) { 1569 pci_disable_pri(pdev); 1570 info->pri_enabled = 0; 1571 } 1572 if (info->pasid_enabled) { 1573 pci_disable_pasid(pdev); 1574 info->pasid_enabled = 0; 1575 } 1576 #endif 1577 } 1578 1579 static void __iommu_flush_dev_iotlb(struct device_domain_info *info, 1580 u64 addr, unsigned int mask) 1581 { 1582 u16 sid, qdep; 1583 1584 if (!info || !info->ats_enabled) 1585 return; 1586 1587 sid = info->bus << 8 | info->devfn; 1588 qdep = info->ats_qdep; 1589 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1590 qdep, addr, mask); 1591 } 1592 1593 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1594 u64 addr, unsigned mask) 1595 { 1596 unsigned long flags; 1597 struct device_domain_info *info; 1598 struct subdev_domain_info *sinfo; 1599 1600 if (!domain->has_iotlb_device) 1601 return; 1602 1603 spin_lock_irqsave(&device_domain_lock, flags); 1604 list_for_each_entry(info, &domain->devices, link) 1605 __iommu_flush_dev_iotlb(info, addr, mask); 1606 1607 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 1608 info = get_domain_info(sinfo->pdev); 1609 __iommu_flush_dev_iotlb(info, addr, mask); 1610 } 1611 spin_unlock_irqrestore(&device_domain_lock, flags); 1612 } 1613 1614 static void domain_flush_piotlb(struct intel_iommu *iommu, 1615 struct dmar_domain *domain, 1616 u64 addr, unsigned long npages, bool ih) 1617 { 1618 u16 did = domain->iommu_did[iommu->seq_id]; 1619 1620 if (domain->default_pasid) 1621 qi_flush_piotlb(iommu, did, domain->default_pasid, 1622 addr, npages, ih); 1623 1624 if (!list_empty(&domain->devices)) 1625 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1626 } 1627 1628 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1629 struct 
dmar_domain *domain, 1630 unsigned long pfn, unsigned int pages, 1631 int ih, int map) 1632 { 1633 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1634 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1635 u16 did = domain->iommu_did[iommu->seq_id]; 1636 1637 BUG_ON(pages == 0); 1638 1639 if (ih) 1640 ih = 1 << 6; 1641 1642 if (domain_use_first_level(domain)) { 1643 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1644 } else { 1645 /* 1646 * Fallback to domain selective flush if no PSI support or 1647 * the size is too big. PSI requires page size to be 2 ^ x, 1648 * and the base address is naturally aligned to the size. 1649 */ 1650 if (!cap_pgsel_inv(iommu->cap) || 1651 mask > cap_max_amask_val(iommu->cap)) 1652 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1653 DMA_TLB_DSI_FLUSH); 1654 else 1655 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1656 DMA_TLB_PSI_FLUSH); 1657 } 1658 1659 /* 1660 * In caching mode, changes of pages from non-present to present require 1661 * flush. However, device IOTLB doesn't need to be flushed in this case. 1662 */ 1663 if (!cap_caching_mode(iommu->cap) || !map) 1664 iommu_flush_dev_iotlb(domain, addr, mask); 1665 } 1666 1667 /* Notification for newly created mappings */ 1668 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1669 struct dmar_domain *domain, 1670 unsigned long pfn, unsigned int pages) 1671 { 1672 /* 1673 * It's a non-present to present mapping. Only flush if caching mode 1674 * and second level. 1675 */ 1676 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1677 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1678 else 1679 iommu_flush_write_buffer(iommu); 1680 } 1681 1682 static void intel_flush_iotlb_all(struct iommu_domain *domain) 1683 { 1684 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 1685 int idx; 1686 1687 for_each_domain_iommu(idx, dmar_domain) { 1688 struct intel_iommu *iommu = g_iommus[idx]; 1689 u16 did = dmar_domain->iommu_did[iommu->seq_id]; 1690 1691 if (domain_use_first_level(dmar_domain)) 1692 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0); 1693 else 1694 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1695 DMA_TLB_DSI_FLUSH); 1696 1697 if (!cap_caching_mode(iommu->cap)) 1698 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1699 0, MAX_AGAW_PFN_WIDTH); 1700 } 1701 } 1702 1703 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1704 { 1705 u32 pmen; 1706 unsigned long flags; 1707 1708 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1709 return; 1710 1711 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1712 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1713 pmen &= ~DMA_PMEN_EPM; 1714 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1715 1716 /* wait for the protected region status bit to clear */ 1717 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1718 readl, !(pmen & DMA_PMEN_PRS), pmen); 1719 1720 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1721 } 1722 1723 static void iommu_enable_translation(struct intel_iommu *iommu) 1724 { 1725 u32 sts; 1726 unsigned long flags; 1727 1728 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1729 iommu->gcmd |= DMA_GCMD_TE; 1730 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1731 1732 /* Make sure hardware complete it */ 1733 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1734 readl, (sts & DMA_GSTS_TES), sts); 1735 1736 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1737 } 1738 1739 static void iommu_disable_translation(struct intel_iommu *iommu) 1740 { 1741 u32 sts; 1742 unsigned long 
flag; 1743 1744 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1745 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1746 return; 1747 1748 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1749 iommu->gcmd &= ~DMA_GCMD_TE; 1750 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1751 1752 /* Make sure hardware complete it */ 1753 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1754 readl, (!(sts & DMA_GSTS_TES)), sts); 1755 1756 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1757 } 1758 1759 static int iommu_init_domains(struct intel_iommu *iommu) 1760 { 1761 u32 ndomains, nlongs; 1762 size_t size; 1763 1764 ndomains = cap_ndoms(iommu->cap); 1765 pr_debug("%s: Number of Domains supported <%d>\n", 1766 iommu->name, ndomains); 1767 nlongs = BITS_TO_LONGS(ndomains); 1768 1769 spin_lock_init(&iommu->lock); 1770 1771 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1772 if (!iommu->domain_ids) { 1773 pr_err("%s: Allocating domain id array failed\n", 1774 iommu->name); 1775 return -ENOMEM; 1776 } 1777 1778 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1779 iommu->domains = kzalloc(size, GFP_KERNEL); 1780 1781 if (iommu->domains) { 1782 size = 256 * sizeof(struct dmar_domain *); 1783 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1784 } 1785 1786 if (!iommu->domains || !iommu->domains[0]) { 1787 pr_err("%s: Allocating domain array failed\n", 1788 iommu->name); 1789 kfree(iommu->domain_ids); 1790 kfree(iommu->domains); 1791 iommu->domain_ids = NULL; 1792 iommu->domains = NULL; 1793 return -ENOMEM; 1794 } 1795 1796 /* 1797 * If Caching mode is set, then invalid translations are tagged 1798 * with domain-id 0, hence we need to pre-allocate it. We also 1799 * use domain-id 0 as a marker for non-allocated domain-id, so 1800 * make sure it is not used for a real domain. 1801 */ 1802 set_bit(0, iommu->domain_ids); 1803 1804 /* 1805 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1806 * entry for first-level or pass-through translation modes should 1807 * be programmed with a domain id different from those used for 1808 * second-level or nested translation. We reserve a domain id for 1809 * this purpose. 
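	 * That reserved id is FLPT_DEFAULT_DID, set aside below alongside
	 * domain id 0.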
1810 */ 1811 if (sm_supported(iommu)) 1812 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1813 1814 return 0; 1815 } 1816 1817 static void disable_dmar_iommu(struct intel_iommu *iommu) 1818 { 1819 struct device_domain_info *info, *tmp; 1820 unsigned long flags; 1821 1822 if (!iommu->domains || !iommu->domain_ids) 1823 return; 1824 1825 spin_lock_irqsave(&device_domain_lock, flags); 1826 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1827 if (info->iommu != iommu) 1828 continue; 1829 1830 if (!info->dev || !info->domain) 1831 continue; 1832 1833 __dmar_remove_one_dev_info(info); 1834 } 1835 spin_unlock_irqrestore(&device_domain_lock, flags); 1836 1837 if (iommu->gcmd & DMA_GCMD_TE) 1838 iommu_disable_translation(iommu); 1839 } 1840 1841 static void free_dmar_iommu(struct intel_iommu *iommu) 1842 { 1843 if ((iommu->domains) && (iommu->domain_ids)) { 1844 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1845 int i; 1846 1847 for (i = 0; i < elems; i++) 1848 kfree(iommu->domains[i]); 1849 kfree(iommu->domains); 1850 kfree(iommu->domain_ids); 1851 iommu->domains = NULL; 1852 iommu->domain_ids = NULL; 1853 } 1854 1855 g_iommus[iommu->seq_id] = NULL; 1856 1857 /* free context mapping */ 1858 free_context_table(iommu); 1859 1860 #ifdef CONFIG_INTEL_IOMMU_SVM 1861 if (pasid_supported(iommu)) { 1862 if (ecap_prs(iommu->ecap)) 1863 intel_svm_finish_prq(iommu); 1864 } 1865 if (vccap_pasid(iommu->vccap)) 1866 ioasid_unregister_allocator(&iommu->pasid_allocator); 1867 1868 #endif 1869 } 1870 1871 /* 1872 * Check and return whether first level is used by default for 1873 * DMA translation. 1874 */ 1875 static bool first_level_by_default(void) 1876 { 1877 return scalable_mode_support() && intel_cap_flts_sanity(); 1878 } 1879 1880 static struct dmar_domain *alloc_domain(int flags) 1881 { 1882 struct dmar_domain *domain; 1883 1884 domain = alloc_domain_mem(); 1885 if (!domain) 1886 return NULL; 1887 1888 memset(domain, 0, sizeof(*domain)); 1889 domain->nid = NUMA_NO_NODE; 1890 domain->flags = flags; 1891 if (first_level_by_default()) 1892 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1893 domain->has_iotlb_device = false; 1894 INIT_LIST_HEAD(&domain->devices); 1895 INIT_LIST_HEAD(&domain->subdevices); 1896 1897 return domain; 1898 } 1899 1900 /* Must be called with iommu->lock */ 1901 static int domain_attach_iommu(struct dmar_domain *domain, 1902 struct intel_iommu *iommu) 1903 { 1904 unsigned long ndomains; 1905 int num; 1906 1907 assert_spin_locked(&device_domain_lock); 1908 assert_spin_locked(&iommu->lock); 1909 1910 domain->iommu_refcnt[iommu->seq_id] += 1; 1911 domain->iommu_count += 1; 1912 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1913 ndomains = cap_ndoms(iommu->cap); 1914 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1915 1916 if (num >= ndomains) { 1917 pr_err("%s: No free domain ids\n", iommu->name); 1918 domain->iommu_refcnt[iommu->seq_id] -= 1; 1919 domain->iommu_count -= 1; 1920 return -ENOSPC; 1921 } 1922 1923 set_bit(num, iommu->domain_ids); 1924 set_iommu_domain(iommu, num, domain); 1925 1926 domain->iommu_did[iommu->seq_id] = num; 1927 domain->nid = iommu->node; 1928 1929 domain_update_iommu_cap(domain); 1930 } 1931 1932 return 0; 1933 } 1934 1935 static int domain_detach_iommu(struct dmar_domain *domain, 1936 struct intel_iommu *iommu) 1937 { 1938 int num, count; 1939 1940 assert_spin_locked(&device_domain_lock); 1941 assert_spin_locked(&iommu->lock); 1942 1943 domain->iommu_refcnt[iommu->seq_id] -= 1; 1944 count = --domain->iommu_count; 1945 if 
(domain->iommu_refcnt[iommu->seq_id] == 0) { 1946 num = domain->iommu_did[iommu->seq_id]; 1947 clear_bit(num, iommu->domain_ids); 1948 set_iommu_domain(iommu, num, NULL); 1949 1950 domain_update_iommu_cap(domain); 1951 domain->iommu_did[iommu->seq_id] = 0; 1952 } 1953 1954 return count; 1955 } 1956 1957 static inline int guestwidth_to_adjustwidth(int gaw) 1958 { 1959 int agaw; 1960 int r = (gaw - 12) % 9; 1961 1962 if (r == 0) 1963 agaw = gaw; 1964 else 1965 agaw = gaw + 9 - r; 1966 if (agaw > 64) 1967 agaw = 64; 1968 return agaw; 1969 } 1970 1971 static void domain_exit(struct dmar_domain *domain) 1972 { 1973 1974 /* Remove associated devices and clear attached or cached domains */ 1975 domain_remove_dev_info(domain); 1976 1977 /* destroy iovas */ 1978 if (domain->domain.type == IOMMU_DOMAIN_DMA) 1979 iommu_put_dma_cookie(&domain->domain); 1980 1981 if (domain->pgd) { 1982 struct page *freelist; 1983 1984 freelist = domain_unmap(domain, 0, 1985 DOMAIN_MAX_PFN(domain->gaw), NULL); 1986 dma_free_pagelist(freelist); 1987 } 1988 1989 free_domain_mem(domain); 1990 } 1991 1992 /* 1993 * Get the PASID directory size for scalable mode context entry. 1994 * Value of X in the PDTS field of a scalable mode context entry 1995 * indicates PASID directory with 2^(X + 7) entries. 1996 */ 1997 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1998 { 1999 int pds, max_pde; 2000 2001 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2002 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 2003 if (pds < 7) 2004 return 0; 2005 2006 return pds - 7; 2007 } 2008 2009 /* 2010 * Set the RID_PASID field of a scalable mode context entry. The 2011 * IOMMU hardware will use the PASID value set in this field for 2012 * DMA translations of DMA requests without PASID. 2013 */ 2014 static inline void 2015 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2016 { 2017 context->hi |= pasid & ((1 << 20) - 1); 2018 } 2019 2020 /* 2021 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2022 * entry. 2023 */ 2024 static inline void context_set_sm_dte(struct context_entry *context) 2025 { 2026 context->lo |= (1 << 2); 2027 } 2028 2029 /* 2030 * Set the PRE(Page Request Enable) field of a scalable mode context 2031 * entry. 2032 */ 2033 static inline void context_set_sm_pre(struct context_entry *context) 2034 { 2035 context->lo |= (1 << 4); 2036 } 2037 2038 /* Convert value to context PASID directory size field coding. 
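 * For example, pds == 1 encodes a PASID directory with 2^(1 + 7) = 256
 * entries (see the comment above context_get_sm_pds()).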
*/ 2039 #define context_pdts(pds) (((pds) & 0x7) << 9) 2040 2041 static int domain_context_mapping_one(struct dmar_domain *domain, 2042 struct intel_iommu *iommu, 2043 struct pasid_table *table, 2044 u8 bus, u8 devfn) 2045 { 2046 u16 did = domain->iommu_did[iommu->seq_id]; 2047 int translation = CONTEXT_TT_MULTI_LEVEL; 2048 struct device_domain_info *info = NULL; 2049 struct context_entry *context; 2050 unsigned long flags; 2051 int ret; 2052 2053 WARN_ON(did == 0); 2054 2055 if (hw_pass_through && domain_type_is_si(domain)) 2056 translation = CONTEXT_TT_PASS_THROUGH; 2057 2058 pr_debug("Set context mapping for %02x:%02x.%d\n", 2059 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2060 2061 BUG_ON(!domain->pgd); 2062 2063 spin_lock_irqsave(&device_domain_lock, flags); 2064 spin_lock(&iommu->lock); 2065 2066 ret = -ENOMEM; 2067 context = iommu_context_addr(iommu, bus, devfn, 1); 2068 if (!context) 2069 goto out_unlock; 2070 2071 ret = 0; 2072 if (context_present(context)) 2073 goto out_unlock; 2074 2075 /* 2076 * For kdump cases, old valid entries may be cached due to the 2077 * in-flight DMA and copied pgtable, but there is no unmapping 2078 * behaviour for them, thus we need an explicit cache flush for 2079 * the newly-mapped device. For kdump, at this point, the device 2080 * is supposed to finish reset at its driver probe stage, so no 2081 * in-flight DMA will exist, and we don't need to worry anymore 2082 * hereafter. 2083 */ 2084 if (context_copied(context)) { 2085 u16 did_old = context_domain_id(context); 2086 2087 if (did_old < cap_ndoms(iommu->cap)) { 2088 iommu->flush.flush_context(iommu, did_old, 2089 (((u16)bus) << 8) | devfn, 2090 DMA_CCMD_MASK_NOBIT, 2091 DMA_CCMD_DEVICE_INVL); 2092 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2093 DMA_TLB_DSI_FLUSH); 2094 } 2095 } 2096 2097 context_clear_entry(context); 2098 2099 if (sm_supported(iommu)) { 2100 unsigned long pds; 2101 2102 WARN_ON(!table); 2103 2104 /* Setup the PASID DIR pointer: */ 2105 pds = context_get_sm_pds(table); 2106 context->lo = (u64)virt_to_phys(table->table) | 2107 context_pdts(pds); 2108 2109 /* Setup the RID_PASID field: */ 2110 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2111 2112 /* 2113 * Setup the Device-TLB enable bit and Page request 2114 * Enable bit: 2115 */ 2116 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2117 if (info && info->ats_supported) 2118 context_set_sm_dte(context); 2119 if (info && info->pri_supported) 2120 context_set_sm_pre(context); 2121 } else { 2122 struct dma_pte *pgd = domain->pgd; 2123 int agaw; 2124 2125 context_set_domain_id(context, did); 2126 2127 if (translation != CONTEXT_TT_PASS_THROUGH) { 2128 /* 2129 * Skip top levels of page tables for iommu which has 2130 * less agaw than default. Unnecessary for PT mode. 2131 */ 2132 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2133 ret = -ENOMEM; 2134 pgd = phys_to_virt(dma_pte_addr(pgd)); 2135 if (!dma_pte_present(pgd)) 2136 goto out_unlock; 2137 } 2138 2139 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2140 if (info && info->ats_supported) 2141 translation = CONTEXT_TT_DEV_IOTLB; 2142 else 2143 translation = CONTEXT_TT_MULTI_LEVEL; 2144 2145 context_set_address_root(context, virt_to_phys(pgd)); 2146 context_set_address_width(context, agaw); 2147 } else { 2148 /* 2149 * In pass through mode, AW must be programmed to 2150 * indicate the largest AGAW value supported by 2151 * hardware. And ASR is ignored by hardware. 
2152 */ 2153 context_set_address_width(context, iommu->msagaw); 2154 } 2155 2156 context_set_translation_type(context, translation); 2157 } 2158 2159 context_set_fault_enable(context); 2160 context_set_present(context); 2161 if (!ecap_coherent(iommu->ecap)) 2162 clflush_cache_range(context, sizeof(*context)); 2163 2164 /* 2165 * It's a non-present to present mapping. If hardware doesn't cache 2166 * non-present entry we only need to flush the write-buffer. If the 2167 * _does_ cache non-present entries, then it does so in the special 2168 * domain #0, which we have to flush: 2169 */ 2170 if (cap_caching_mode(iommu->cap)) { 2171 iommu->flush.flush_context(iommu, 0, 2172 (((u16)bus) << 8) | devfn, 2173 DMA_CCMD_MASK_NOBIT, 2174 DMA_CCMD_DEVICE_INVL); 2175 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2176 } else { 2177 iommu_flush_write_buffer(iommu); 2178 } 2179 iommu_enable_dev_iotlb(info); 2180 2181 ret = 0; 2182 2183 out_unlock: 2184 spin_unlock(&iommu->lock); 2185 spin_unlock_irqrestore(&device_domain_lock, flags); 2186 2187 return ret; 2188 } 2189 2190 struct domain_context_mapping_data { 2191 struct dmar_domain *domain; 2192 struct intel_iommu *iommu; 2193 struct pasid_table *table; 2194 }; 2195 2196 static int domain_context_mapping_cb(struct pci_dev *pdev, 2197 u16 alias, void *opaque) 2198 { 2199 struct domain_context_mapping_data *data = opaque; 2200 2201 return domain_context_mapping_one(data->domain, data->iommu, 2202 data->table, PCI_BUS_NUM(alias), 2203 alias & 0xff); 2204 } 2205 2206 static int 2207 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2208 { 2209 struct domain_context_mapping_data data; 2210 struct pasid_table *table; 2211 struct intel_iommu *iommu; 2212 u8 bus, devfn; 2213 2214 iommu = device_to_iommu(dev, &bus, &devfn); 2215 if (!iommu) 2216 return -ENODEV; 2217 2218 table = intel_pasid_get_table(dev); 2219 2220 if (!dev_is_pci(dev)) 2221 return domain_context_mapping_one(domain, iommu, table, 2222 bus, devfn); 2223 2224 data.domain = domain; 2225 data.iommu = iommu; 2226 data.table = table; 2227 2228 return pci_for_each_dma_alias(to_pci_dev(dev), 2229 &domain_context_mapping_cb, &data); 2230 } 2231 2232 static int domain_context_mapped_cb(struct pci_dev *pdev, 2233 u16 alias, void *opaque) 2234 { 2235 struct intel_iommu *iommu = opaque; 2236 2237 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2238 } 2239 2240 static int domain_context_mapped(struct device *dev) 2241 { 2242 struct intel_iommu *iommu; 2243 u8 bus, devfn; 2244 2245 iommu = device_to_iommu(dev, &bus, &devfn); 2246 if (!iommu) 2247 return -ENODEV; 2248 2249 if (!dev_is_pci(dev)) 2250 return device_context_mapped(iommu, bus, devfn); 2251 2252 return !pci_for_each_dma_alias(to_pci_dev(dev), 2253 domain_context_mapped_cb, iommu); 2254 } 2255 2256 /* Returns a number of VTD pages, but aligned to MM page size */ 2257 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2258 size_t size) 2259 { 2260 host_addr &= ~PAGE_MASK; 2261 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2262 } 2263 2264 /* Return largest possible superpage level for a given mapping */ 2265 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2266 unsigned long iov_pfn, 2267 unsigned long phy_pfn, 2268 unsigned long pages) 2269 { 2270 int support, level = 1; 2271 unsigned long pfnmerge; 2272 2273 support = domain->iommu_superpage; 2274 2275 /* To use a large page, the virtual *and* physical addresses 2276 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2277 of them will mean we have to use smaller pages. So just 2278 merge them and check both at once. */ 2279 pfnmerge = iov_pfn | phy_pfn; 2280 2281 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2282 pages >>= VTD_STRIDE_SHIFT; 2283 if (!pages) 2284 break; 2285 pfnmerge >>= VTD_STRIDE_SHIFT; 2286 level++; 2287 support--; 2288 } 2289 return level; 2290 } 2291 2292 static int 2293 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2294 unsigned long phys_pfn, unsigned long nr_pages, int prot) 2295 { 2296 unsigned int largepage_lvl = 0; 2297 unsigned long lvl_pages = 0; 2298 struct dma_pte *pte = NULL; 2299 phys_addr_t pteval; 2300 u64 attr; 2301 2302 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2303 2304 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2305 return -EINVAL; 2306 2307 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2308 if (domain_use_first_level(domain)) { 2309 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2310 2311 if (domain->domain.type == IOMMU_DOMAIN_DMA) { 2312 attr |= DMA_FL_PTE_ACCESS; 2313 if (prot & DMA_PTE_WRITE) 2314 attr |= DMA_FL_PTE_DIRTY; 2315 } 2316 } 2317 2318 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2319 2320 while (nr_pages > 0) { 2321 uint64_t tmp; 2322 2323 if (!pte) { 2324 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, 2325 phys_pfn, nr_pages); 2326 2327 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2328 if (!pte) 2329 return -ENOMEM; 2330 /* It is large page*/ 2331 if (largepage_lvl > 1) { 2332 unsigned long nr_superpages, end_pfn; 2333 2334 pteval |= DMA_PTE_LARGE_PAGE; 2335 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2336 2337 nr_superpages = nr_pages / lvl_pages; 2338 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2339 2340 /* 2341 * Ensure that old small page tables are 2342 * removed to make room for superpage(s). 2343 * We're adding new large pages, so make sure 2344 * we don't remove their parent tables. 2345 */ 2346 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2347 largepage_lvl + 1); 2348 } else { 2349 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2350 } 2351 2352 } 2353 /* We don't need lock here, nobody else 2354 * touches the iova range 2355 */ 2356 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2357 if (tmp) { 2358 static int dumps = 5; 2359 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2360 iov_pfn, tmp, (unsigned long long)pteval); 2361 if (dumps) { 2362 dumps--; 2363 debug_dma_dump_mappings(NULL); 2364 } 2365 WARN_ON(1); 2366 } 2367 2368 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2369 2370 BUG_ON(nr_pages < lvl_pages); 2371 2372 nr_pages -= lvl_pages; 2373 iov_pfn += lvl_pages; 2374 phys_pfn += lvl_pages; 2375 pteval += lvl_pages * VTD_PAGE_SIZE; 2376 2377 /* If the next PTE would be the first in a new page, then we 2378 * need to flush the cache on the entries we've just written. 2379 * And then we'll need to recalculate 'pte', so clear it and 2380 * let it get set again in the if (!pte) block above. 2381 * 2382 * If we're done (!nr_pages) we need to flush the cache too. 2383 * 2384 * Also if we've been setting superpages, we may need to 2385 * recalculate 'pte' and switch back to smaller pages for the 2386 * end of the mapping, if the trailing size is not enough to 2387 * use another superpage (i.e. nr_pages < lvl_pages). 2388 * 2389 * We leave clflush for the leaf pte changes to iotlb_sync_map() 2390 * callback. 
2391 */ 2392 pte++; 2393 if (!nr_pages || first_pte_in_page(pte) || 2394 (largepage_lvl > 1 && nr_pages < lvl_pages)) 2395 pte = NULL; 2396 } 2397 2398 return 0; 2399 } 2400 2401 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2402 { 2403 unsigned long flags; 2404 struct context_entry *context; 2405 u16 did_old; 2406 2407 if (!iommu) 2408 return; 2409 2410 spin_lock_irqsave(&iommu->lock, flags); 2411 context = iommu_context_addr(iommu, bus, devfn, 0); 2412 if (!context) { 2413 spin_unlock_irqrestore(&iommu->lock, flags); 2414 return; 2415 } 2416 did_old = context_domain_id(context); 2417 context_clear_entry(context); 2418 __iommu_flush_cache(iommu, context, sizeof(*context)); 2419 spin_unlock_irqrestore(&iommu->lock, flags); 2420 iommu->flush.flush_context(iommu, 2421 did_old, 2422 (((u16)bus) << 8) | devfn, 2423 DMA_CCMD_MASK_NOBIT, 2424 DMA_CCMD_DEVICE_INVL); 2425 iommu->flush.flush_iotlb(iommu, 2426 did_old, 2427 0, 2428 0, 2429 DMA_TLB_DSI_FLUSH); 2430 } 2431 2432 static inline void unlink_domain_info(struct device_domain_info *info) 2433 { 2434 assert_spin_locked(&device_domain_lock); 2435 list_del(&info->link); 2436 list_del(&info->global); 2437 if (info->dev) 2438 dev_iommu_priv_set(info->dev, NULL); 2439 } 2440 2441 static void domain_remove_dev_info(struct dmar_domain *domain) 2442 { 2443 struct device_domain_info *info, *tmp; 2444 unsigned long flags; 2445 2446 spin_lock_irqsave(&device_domain_lock, flags); 2447 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2448 __dmar_remove_one_dev_info(info); 2449 spin_unlock_irqrestore(&device_domain_lock, flags); 2450 } 2451 2452 struct dmar_domain *find_domain(struct device *dev) 2453 { 2454 struct device_domain_info *info; 2455 2456 if (unlikely(!dev || !dev->iommu)) 2457 return NULL; 2458 2459 if (unlikely(attach_deferred(dev))) 2460 return NULL; 2461 2462 /* No lock here, assumes no domain exit in normal case */ 2463 info = get_domain_info(dev); 2464 if (likely(info)) 2465 return info->domain; 2466 2467 return NULL; 2468 } 2469 2470 static inline struct device_domain_info * 2471 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2472 { 2473 struct device_domain_info *info; 2474 2475 list_for_each_entry(info, &device_domain_list, global) 2476 if (info->segment == segment && info->bus == bus && 2477 info->devfn == devfn) 2478 return info; 2479 2480 return NULL; 2481 } 2482 2483 static int domain_setup_first_level(struct intel_iommu *iommu, 2484 struct dmar_domain *domain, 2485 struct device *dev, 2486 u32 pasid) 2487 { 2488 int flags = PASID_FLAG_SUPERVISOR_MODE; 2489 struct dma_pte *pgd = domain->pgd; 2490 int agaw, level; 2491 2492 /* 2493 * Skip top levels of page tables for iommu which has 2494 * less agaw than default. Unnecessary for PT mode. 2495 */ 2496 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2497 pgd = phys_to_virt(dma_pte_addr(pgd)); 2498 if (!dma_pte_present(pgd)) 2499 return -ENOMEM; 2500 } 2501 2502 level = agaw_to_level(agaw); 2503 if (level != 4 && level != 5) 2504 return -EINVAL; 2505 2506 flags |= (level == 5) ? 
PASID_FLAG_FL5LP : 0; 2507 2508 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2509 domain->iommu_did[iommu->seq_id], 2510 flags); 2511 } 2512 2513 static bool dev_is_real_dma_subdevice(struct device *dev) 2514 { 2515 return dev && dev_is_pci(dev) && 2516 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2517 } 2518 2519 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2520 int bus, int devfn, 2521 struct device *dev, 2522 struct dmar_domain *domain) 2523 { 2524 struct dmar_domain *found = NULL; 2525 struct device_domain_info *info; 2526 unsigned long flags; 2527 int ret; 2528 2529 info = alloc_devinfo_mem(); 2530 if (!info) 2531 return NULL; 2532 2533 if (!dev_is_real_dma_subdevice(dev)) { 2534 info->bus = bus; 2535 info->devfn = devfn; 2536 info->segment = iommu->segment; 2537 } else { 2538 struct pci_dev *pdev = to_pci_dev(dev); 2539 2540 info->bus = pdev->bus->number; 2541 info->devfn = pdev->devfn; 2542 info->segment = pci_domain_nr(pdev->bus); 2543 } 2544 2545 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2546 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2547 info->ats_qdep = 0; 2548 info->dev = dev; 2549 info->domain = domain; 2550 info->iommu = iommu; 2551 info->pasid_table = NULL; 2552 info->auxd_enabled = 0; 2553 INIT_LIST_HEAD(&info->subdevices); 2554 2555 if (dev && dev_is_pci(dev)) { 2556 struct pci_dev *pdev = to_pci_dev(info->dev); 2557 2558 if (ecap_dev_iotlb_support(iommu->ecap) && 2559 pci_ats_supported(pdev) && 2560 dmar_find_matched_atsr_unit(pdev)) 2561 info->ats_supported = 1; 2562 2563 if (sm_supported(iommu)) { 2564 if (pasid_supported(iommu)) { 2565 int features = pci_pasid_features(pdev); 2566 if (features >= 0) 2567 info->pasid_supported = features | 1; 2568 } 2569 2570 if (info->ats_supported && ecap_prs(iommu->ecap) && 2571 pci_pri_supported(pdev)) 2572 info->pri_supported = 1; 2573 } 2574 } 2575 2576 spin_lock_irqsave(&device_domain_lock, flags); 2577 if (dev) 2578 found = find_domain(dev); 2579 2580 if (!found) { 2581 struct device_domain_info *info2; 2582 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2583 info->devfn); 2584 if (info2) { 2585 found = info2->domain; 2586 info2->dev = dev; 2587 } 2588 } 2589 2590 if (found) { 2591 spin_unlock_irqrestore(&device_domain_lock, flags); 2592 free_devinfo_mem(info); 2593 /* Caller must free the original domain */ 2594 return found; 2595 } 2596 2597 spin_lock(&iommu->lock); 2598 ret = domain_attach_iommu(domain, iommu); 2599 spin_unlock(&iommu->lock); 2600 2601 if (ret) { 2602 spin_unlock_irqrestore(&device_domain_lock, flags); 2603 free_devinfo_mem(info); 2604 return NULL; 2605 } 2606 2607 list_add(&info->link, &domain->devices); 2608 list_add(&info->global, &device_domain_list); 2609 if (dev) 2610 dev_iommu_priv_set(dev, info); 2611 spin_unlock_irqrestore(&device_domain_lock, flags); 2612 2613 /* PASID table is mandatory for a PCI device in scalable mode. 
*/ 2614 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2615 ret = intel_pasid_alloc_table(dev); 2616 if (ret) { 2617 dev_err(dev, "PASID table allocation failed\n"); 2618 dmar_remove_one_dev_info(dev); 2619 return NULL; 2620 } 2621 2622 /* Setup the PASID entry for requests without PASID: */ 2623 spin_lock_irqsave(&iommu->lock, flags); 2624 if (hw_pass_through && domain_type_is_si(domain)) 2625 ret = intel_pasid_setup_pass_through(iommu, domain, 2626 dev, PASID_RID2PASID); 2627 else if (domain_use_first_level(domain)) 2628 ret = domain_setup_first_level(iommu, domain, dev, 2629 PASID_RID2PASID); 2630 else 2631 ret = intel_pasid_setup_second_level(iommu, domain, 2632 dev, PASID_RID2PASID); 2633 spin_unlock_irqrestore(&iommu->lock, flags); 2634 if (ret) { 2635 dev_err(dev, "Setup RID2PASID failed\n"); 2636 dmar_remove_one_dev_info(dev); 2637 return NULL; 2638 } 2639 } 2640 2641 if (dev && domain_context_mapping(domain, dev)) { 2642 dev_err(dev, "Domain context map failed\n"); 2643 dmar_remove_one_dev_info(dev); 2644 return NULL; 2645 } 2646 2647 return domain; 2648 } 2649 2650 static int iommu_domain_identity_map(struct dmar_domain *domain, 2651 unsigned long first_vpfn, 2652 unsigned long last_vpfn) 2653 { 2654 /* 2655 * RMRR range might have overlap with physical memory range, 2656 * clear it first 2657 */ 2658 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2659 2660 return __domain_mapping(domain, first_vpfn, 2661 first_vpfn, last_vpfn - first_vpfn + 1, 2662 DMA_PTE_READ|DMA_PTE_WRITE); 2663 } 2664 2665 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2666 2667 static int __init si_domain_init(int hw) 2668 { 2669 struct dmar_rmrr_unit *rmrr; 2670 struct device *dev; 2671 int i, nid, ret; 2672 2673 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2674 if (!si_domain) 2675 return -EFAULT; 2676 2677 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2678 domain_exit(si_domain); 2679 return -EFAULT; 2680 } 2681 2682 if (hw) 2683 return 0; 2684 2685 for_each_online_node(nid) { 2686 unsigned long start_pfn, end_pfn; 2687 int i; 2688 2689 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2690 ret = iommu_domain_identity_map(si_domain, 2691 mm_to_dma_pfn(start_pfn), 2692 mm_to_dma_pfn(end_pfn)); 2693 if (ret) 2694 return ret; 2695 } 2696 } 2697 2698 /* 2699 * Identity map the RMRRs so that devices with RMRRs could also use 2700 * the si_domain. 
2701 */ 2702 for_each_rmrr_units(rmrr) { 2703 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2704 i, dev) { 2705 unsigned long long start = rmrr->base_address; 2706 unsigned long long end = rmrr->end_address; 2707 2708 if (WARN_ON(end < start || 2709 end >> agaw_to_width(si_domain->agaw))) 2710 continue; 2711 2712 ret = iommu_domain_identity_map(si_domain, 2713 mm_to_dma_pfn(start >> PAGE_SHIFT), 2714 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2715 if (ret) 2716 return ret; 2717 } 2718 } 2719 2720 return 0; 2721 } 2722 2723 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2724 { 2725 struct dmar_domain *ndomain; 2726 struct intel_iommu *iommu; 2727 u8 bus, devfn; 2728 2729 iommu = device_to_iommu(dev, &bus, &devfn); 2730 if (!iommu) 2731 return -ENODEV; 2732 2733 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2734 if (ndomain != domain) 2735 return -EBUSY; 2736 2737 return 0; 2738 } 2739 2740 static bool device_has_rmrr(struct device *dev) 2741 { 2742 struct dmar_rmrr_unit *rmrr; 2743 struct device *tmp; 2744 int i; 2745 2746 rcu_read_lock(); 2747 for_each_rmrr_units(rmrr) { 2748 /* 2749 * Return TRUE if this RMRR contains the device that 2750 * is passed in. 2751 */ 2752 for_each_active_dev_scope(rmrr->devices, 2753 rmrr->devices_cnt, i, tmp) 2754 if (tmp == dev || 2755 is_downstream_to_pci_bridge(dev, tmp)) { 2756 rcu_read_unlock(); 2757 return true; 2758 } 2759 } 2760 rcu_read_unlock(); 2761 return false; 2762 } 2763 2764 /** 2765 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2766 * is relaxable (ie. is allowed to be not enforced under some conditions) 2767 * @dev: device handle 2768 * 2769 * We assume that PCI USB devices with RMRRs have them largely 2770 * for historical reasons and that the RMRR space is not actively used post 2771 * boot. This exclusion may change if vendors begin to abuse it. 2772 * 2773 * The same exception is made for graphics devices, with the requirement that 2774 * any use of the RMRR regions will be torn down before assigning the device 2775 * to a guest. 2776 * 2777 * Return: true if the RMRR is relaxable, false otherwise 2778 */ 2779 static bool device_rmrr_is_relaxable(struct device *dev) 2780 { 2781 struct pci_dev *pdev; 2782 2783 if (!dev_is_pci(dev)) 2784 return false; 2785 2786 pdev = to_pci_dev(dev); 2787 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2788 return true; 2789 else 2790 return false; 2791 } 2792 2793 /* 2794 * There are a couple cases where we need to restrict the functionality of 2795 * devices associated with RMRRs. The first is when evaluating a device for 2796 * identity mapping because problems exist when devices are moved in and out 2797 * of domains and their respective RMRR information is lost. This means that 2798 * a device with associated RMRRs will never be in a "passthrough" domain. 2799 * The second is use of the device through the IOMMU API. This interface 2800 * expects to have full control of the IOVA space for the device. We cannot 2801 * satisfy both the requirement that RMRR access is maintained and have an 2802 * unencumbered IOVA space. We also have no ability to quiesce the device's 2803 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2804 * We therefore prevent devices associated with an RMRR from participating in 2805 * the IOMMU API, which eliminates them from device assignment. 2806 * 2807 * In both cases, devices which have relaxable RMRRs are not concerned by this 2808 * restriction. 
See device_rmrr_is_relaxable comment. 2809 */ 2810 static bool device_is_rmrr_locked(struct device *dev) 2811 { 2812 if (!device_has_rmrr(dev)) 2813 return false; 2814 2815 if (device_rmrr_is_relaxable(dev)) 2816 return false; 2817 2818 return true; 2819 } 2820 2821 /* 2822 * Return the required default domain type for a specific device. 2823 * 2824 * @dev: the device in query 2825 * @startup: true if this is during early boot 2826 * 2827 * Returns: 2828 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2829 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2830 * - 0: both identity and dynamic domains work for this device 2831 */ 2832 static int device_def_domain_type(struct device *dev) 2833 { 2834 if (dev_is_pci(dev)) { 2835 struct pci_dev *pdev = to_pci_dev(dev); 2836 2837 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2838 return IOMMU_DOMAIN_IDENTITY; 2839 2840 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2841 return IOMMU_DOMAIN_IDENTITY; 2842 } 2843 2844 return 0; 2845 } 2846 2847 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2848 { 2849 /* 2850 * Start from the sane iommu hardware state. 2851 * If the queued invalidation is already initialized by us 2852 * (for example, while enabling interrupt-remapping) then 2853 * we got the things already rolling from a sane state. 2854 */ 2855 if (!iommu->qi) { 2856 /* 2857 * Clear any previous faults. 2858 */ 2859 dmar_fault(-1, iommu); 2860 /* 2861 * Disable queued invalidation if supported and already enabled 2862 * before OS handover. 2863 */ 2864 dmar_disable_qi(iommu); 2865 } 2866 2867 if (dmar_enable_qi(iommu)) { 2868 /* 2869 * Queued Invalidate not enabled, use Register Based Invalidate 2870 */ 2871 iommu->flush.flush_context = __iommu_flush_context; 2872 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2873 pr_info("%s: Using Register based invalidation\n", 2874 iommu->name); 2875 } else { 2876 iommu->flush.flush_context = qi_flush_context; 2877 iommu->flush.flush_iotlb = qi_flush_iotlb; 2878 pr_info("%s: Using Queued invalidation\n", iommu->name); 2879 } 2880 } 2881 2882 static int copy_context_table(struct intel_iommu *iommu, 2883 struct root_entry *old_re, 2884 struct context_entry **tbl, 2885 int bus, bool ext) 2886 { 2887 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2888 struct context_entry *new_ce = NULL, ce; 2889 struct context_entry *old_ce = NULL; 2890 struct root_entry re; 2891 phys_addr_t old_ce_phys; 2892 2893 tbl_idx = ext ? bus * 2 : bus; 2894 memcpy(&re, old_re, sizeof(re)); 2895 2896 for (devfn = 0; devfn < 256; devfn++) { 2897 /* First calculate the correct index */ 2898 idx = (ext ? 
devfn * 2 : devfn) % 256; 2899 2900 if (idx == 0) { 2901 /* First save what we may have and clean up */ 2902 if (new_ce) { 2903 tbl[tbl_idx] = new_ce; 2904 __iommu_flush_cache(iommu, new_ce, 2905 VTD_PAGE_SIZE); 2906 pos = 1; 2907 } 2908 2909 if (old_ce) 2910 memunmap(old_ce); 2911 2912 ret = 0; 2913 if (devfn < 0x80) 2914 old_ce_phys = root_entry_lctp(&re); 2915 else 2916 old_ce_phys = root_entry_uctp(&re); 2917 2918 if (!old_ce_phys) { 2919 if (ext && devfn == 0) { 2920 /* No LCTP, try UCTP */ 2921 devfn = 0x7f; 2922 continue; 2923 } else { 2924 goto out; 2925 } 2926 } 2927 2928 ret = -ENOMEM; 2929 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2930 MEMREMAP_WB); 2931 if (!old_ce) 2932 goto out; 2933 2934 new_ce = alloc_pgtable_page(iommu->node); 2935 if (!new_ce) 2936 goto out_unmap; 2937 2938 ret = 0; 2939 } 2940 2941 /* Now copy the context entry */ 2942 memcpy(&ce, old_ce + idx, sizeof(ce)); 2943 2944 if (!__context_present(&ce)) 2945 continue; 2946 2947 did = context_domain_id(&ce); 2948 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2949 set_bit(did, iommu->domain_ids); 2950 2951 /* 2952 * We need a marker for copied context entries. This 2953 * marker needs to work for the old format as well as 2954 * for extended context entries. 2955 * 2956 * Bit 67 of the context entry is used. In the old 2957 * format this bit is available to software, in the 2958 * extended format it is the PGE bit, but PGE is ignored 2959 * by HW if PASIDs are disabled (and thus still 2960 * available). 2961 * 2962 * So disable PASIDs first and then mark the entry 2963 * copied. This means that we don't copy PASID 2964 * translations from the old kernel, but this is fine as 2965 * faults there are not fatal. 2966 */ 2967 context_clear_pasid_enable(&ce); 2968 context_set_copied(&ce); 2969 2970 new_ce[idx] = ce; 2971 } 2972 2973 tbl[tbl_idx + pos] = new_ce; 2974 2975 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2976 2977 out_unmap: 2978 memunmap(old_ce); 2979 2980 out: 2981 return ret; 2982 } 2983 2984 static int copy_translation_tables(struct intel_iommu *iommu) 2985 { 2986 struct context_entry **ctxt_tbls; 2987 struct root_entry *old_rt; 2988 phys_addr_t old_rt_phys; 2989 int ctxt_table_entries; 2990 unsigned long flags; 2991 u64 rtaddr_reg; 2992 int bus, ret; 2993 bool new_ext, ext; 2994 2995 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2996 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2997 new_ext = !!ecap_ecs(iommu->ecap); 2998 2999 /* 3000 * The RTT bit can only be changed when translation is disabled, 3001 * but disabling translation means to open a window for data 3002 * corruption. So bail out and don't copy anything if we would 3003 * have to change the bit. 3004 */ 3005 if (new_ext != ext) 3006 return -EINVAL; 3007 3008 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3009 if (!old_rt_phys) 3010 return -EINVAL; 3011 3012 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3013 if (!old_rt) 3014 return -ENOMEM; 3015 3016 /* This is too big for the stack - allocate it from slab */ 3017 ctxt_table_entries = ext ? 
512 : 256; 3018 ret = -ENOMEM; 3019 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3020 if (!ctxt_tbls) 3021 goto out_unmap; 3022 3023 for (bus = 0; bus < 256; bus++) { 3024 ret = copy_context_table(iommu, &old_rt[bus], 3025 ctxt_tbls, bus, ext); 3026 if (ret) { 3027 pr_err("%s: Failed to copy context table for bus %d\n", 3028 iommu->name, bus); 3029 continue; 3030 } 3031 } 3032 3033 spin_lock_irqsave(&iommu->lock, flags); 3034 3035 /* Context tables are copied, now write them to the root_entry table */ 3036 for (bus = 0; bus < 256; bus++) { 3037 int idx = ext ? bus * 2 : bus; 3038 u64 val; 3039 3040 if (ctxt_tbls[idx]) { 3041 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3042 iommu->root_entry[bus].lo = val; 3043 } 3044 3045 if (!ext || !ctxt_tbls[idx + 1]) 3046 continue; 3047 3048 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3049 iommu->root_entry[bus].hi = val; 3050 } 3051 3052 spin_unlock_irqrestore(&iommu->lock, flags); 3053 3054 kfree(ctxt_tbls); 3055 3056 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3057 3058 ret = 0; 3059 3060 out_unmap: 3061 memunmap(old_rt); 3062 3063 return ret; 3064 } 3065 3066 #ifdef CONFIG_INTEL_IOMMU_SVM 3067 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3068 { 3069 struct intel_iommu *iommu = data; 3070 ioasid_t ioasid; 3071 3072 if (!iommu) 3073 return INVALID_IOASID; 3074 /* 3075 * VT-d virtual command interface always uses the full 20 bit 3076 * PASID range. Host can partition guest PASID range based on 3077 * policies but it is out of guest's control. 3078 */ 3079 if (min < PASID_MIN || max > intel_pasid_max_id) 3080 return INVALID_IOASID; 3081 3082 if (vcmd_alloc_pasid(iommu, &ioasid)) 3083 return INVALID_IOASID; 3084 3085 return ioasid; 3086 } 3087 3088 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3089 { 3090 struct intel_iommu *iommu = data; 3091 3092 if (!iommu) 3093 return; 3094 /* 3095 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3096 * We can only free the PASID when all the devices are unbound. 3097 */ 3098 if (ioasid_find(NULL, ioasid, NULL)) { 3099 pr_alert("Cannot free active IOASID %d\n", ioasid); 3100 return; 3101 } 3102 vcmd_free_pasid(iommu, ioasid); 3103 } 3104 3105 static void register_pasid_allocator(struct intel_iommu *iommu) 3106 { 3107 /* 3108 * If we are running in the host, no need for custom allocator 3109 * in that PASIDs are allocated from the host system-wide. 3110 */ 3111 if (!cap_caching_mode(iommu->cap)) 3112 return; 3113 3114 if (!sm_supported(iommu)) { 3115 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3116 return; 3117 } 3118 3119 /* 3120 * Register a custom PASID allocator if we are running in a guest, 3121 * guest PASID must be obtained via virtual command interface. 3122 * There can be multiple vIOMMUs in each guest but only one allocator 3123 * is active. All vIOMMU allocators will eventually be calling the same 3124 * host allocator. 3125 */ 3126 if (!vccap_pasid(iommu->vccap)) 3127 return; 3128 3129 pr_info("Register custom PASID allocator\n"); 3130 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3131 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3132 iommu->pasid_allocator.pdata = (void *)iommu; 3133 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3134 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3135 /* 3136 * Disable scalable mode on this IOMMU if there 3137 * is no custom allocator. 
Mixing SM capable vIOMMU 3138 * and non-SM vIOMMU are not supported. 3139 */ 3140 intel_iommu_sm = 0; 3141 } 3142 } 3143 #endif 3144 3145 static int __init init_dmars(void) 3146 { 3147 struct dmar_drhd_unit *drhd; 3148 struct intel_iommu *iommu; 3149 int ret; 3150 3151 /* 3152 * for each drhd 3153 * allocate root 3154 * initialize and program root entry to not present 3155 * endfor 3156 */ 3157 for_each_drhd_unit(drhd) { 3158 /* 3159 * lock not needed as this is only incremented in the single 3160 * threaded kernel __init code path all other access are read 3161 * only 3162 */ 3163 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3164 g_num_of_iommus++; 3165 continue; 3166 } 3167 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3168 } 3169 3170 /* Preallocate enough resources for IOMMU hot-addition */ 3171 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3172 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3173 3174 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3175 GFP_KERNEL); 3176 if (!g_iommus) { 3177 pr_err("Allocating global iommu array failed\n"); 3178 ret = -ENOMEM; 3179 goto error; 3180 } 3181 3182 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); 3183 if (ret) 3184 goto free_iommu; 3185 3186 for_each_iommu(iommu, drhd) { 3187 if (drhd->ignored) { 3188 iommu_disable_translation(iommu); 3189 continue; 3190 } 3191 3192 /* 3193 * Find the max pasid size of all IOMMU's in the system. 3194 * We need to ensure the system pasid table is no bigger 3195 * than the smallest supported. 3196 */ 3197 if (pasid_supported(iommu)) { 3198 u32 temp = 2 << ecap_pss(iommu->ecap); 3199 3200 intel_pasid_max_id = min_t(u32, temp, 3201 intel_pasid_max_id); 3202 } 3203 3204 g_iommus[iommu->seq_id] = iommu; 3205 3206 intel_iommu_init_qi(iommu); 3207 3208 ret = iommu_init_domains(iommu); 3209 if (ret) 3210 goto free_iommu; 3211 3212 init_translation_status(iommu); 3213 3214 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3215 iommu_disable_translation(iommu); 3216 clear_translation_pre_enabled(iommu); 3217 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3218 iommu->name); 3219 } 3220 3221 /* 3222 * TBD: 3223 * we could share the same root & context tables 3224 * among all IOMMU's. Need to Split it later. 3225 */ 3226 ret = iommu_alloc_root_entry(iommu); 3227 if (ret) 3228 goto free_iommu; 3229 3230 if (translation_pre_enabled(iommu)) { 3231 pr_info("Translation already enabled - trying to copy translation structures\n"); 3232 3233 ret = copy_translation_tables(iommu); 3234 if (ret) { 3235 /* 3236 * We found the IOMMU with translation 3237 * enabled - but failed to copy over the 3238 * old root-entry table. Try to proceed 3239 * by disabling translation now and 3240 * allocating a clean root-entry table. 3241 * This might cause DMAR faults, but 3242 * probably the dump will still succeed. 3243 */ 3244 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3245 iommu->name); 3246 iommu_disable_translation(iommu); 3247 clear_translation_pre_enabled(iommu); 3248 } else { 3249 pr_info("Copied translation tables from previous kernel for %s\n", 3250 iommu->name); 3251 } 3252 } 3253 3254 if (!ecap_pass_through(iommu->ecap)) 3255 hw_pass_through = 0; 3256 intel_svm_check(iommu); 3257 } 3258 3259 /* 3260 * Now that qi is enabled on all iommus, set the root entry and flush 3261 * caches. This is required on some Intel X58 chipsets, otherwise the 3262 * flush_context function will loop forever and the boot hangs. 
3263 */ 3264 for_each_active_iommu(iommu, drhd) { 3265 iommu_flush_write_buffer(iommu); 3266 #ifdef CONFIG_INTEL_IOMMU_SVM 3267 register_pasid_allocator(iommu); 3268 #endif 3269 iommu_set_root_entry(iommu); 3270 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3271 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3272 } 3273 3274 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3275 dmar_map_gfx = 0; 3276 #endif 3277 3278 if (!dmar_map_gfx) 3279 iommu_identity_mapping |= IDENTMAP_GFX; 3280 3281 check_tylersburg_isoch(); 3282 3283 ret = si_domain_init(hw_pass_through); 3284 if (ret) 3285 goto free_iommu; 3286 3287 /* 3288 * for each drhd 3289 * enable fault log 3290 * global invalidate context cache 3291 * global invalidate iotlb 3292 * enable translation 3293 */ 3294 for_each_iommu(iommu, drhd) { 3295 if (drhd->ignored) { 3296 /* 3297 * we always have to disable PMRs or DMA may fail on 3298 * this device 3299 */ 3300 if (force_on) 3301 iommu_disable_protect_mem_regions(iommu); 3302 continue; 3303 } 3304 3305 iommu_flush_write_buffer(iommu); 3306 3307 #ifdef CONFIG_INTEL_IOMMU_SVM 3308 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3309 /* 3310 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3311 * could cause possible lock race condition. 3312 */ 3313 up_write(&dmar_global_lock); 3314 ret = intel_svm_enable_prq(iommu); 3315 down_write(&dmar_global_lock); 3316 if (ret) 3317 goto free_iommu; 3318 } 3319 #endif 3320 ret = dmar_set_interrupt(iommu); 3321 if (ret) 3322 goto free_iommu; 3323 } 3324 3325 return 0; 3326 3327 free_iommu: 3328 for_each_active_iommu(iommu, drhd) { 3329 disable_dmar_iommu(iommu); 3330 free_dmar_iommu(iommu); 3331 } 3332 3333 kfree(g_iommus); 3334 3335 error: 3336 return ret; 3337 } 3338 3339 static inline int iommu_domain_cache_init(void) 3340 { 3341 int ret = 0; 3342 3343 iommu_domain_cache = kmem_cache_create("iommu_domain", 3344 sizeof(struct dmar_domain), 3345 0, 3346 SLAB_HWCACHE_ALIGN, 3347 3348 NULL); 3349 if (!iommu_domain_cache) { 3350 pr_err("Couldn't create iommu_domain cache\n"); 3351 ret = -ENOMEM; 3352 } 3353 3354 return ret; 3355 } 3356 3357 static inline int iommu_devinfo_cache_init(void) 3358 { 3359 int ret = 0; 3360 3361 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3362 sizeof(struct device_domain_info), 3363 0, 3364 SLAB_HWCACHE_ALIGN, 3365 NULL); 3366 if (!iommu_devinfo_cache) { 3367 pr_err("Couldn't create devinfo cache\n"); 3368 ret = -ENOMEM; 3369 } 3370 3371 return ret; 3372 } 3373 3374 static int __init iommu_init_mempool(void) 3375 { 3376 int ret; 3377 ret = iova_cache_get(); 3378 if (ret) 3379 return ret; 3380 3381 ret = iommu_domain_cache_init(); 3382 if (ret) 3383 goto domain_error; 3384 3385 ret = iommu_devinfo_cache_init(); 3386 if (!ret) 3387 return ret; 3388 3389 kmem_cache_destroy(iommu_domain_cache); 3390 domain_error: 3391 iova_cache_put(); 3392 3393 return -ENOMEM; 3394 } 3395 3396 static void __init iommu_exit_mempool(void) 3397 { 3398 kmem_cache_destroy(iommu_devinfo_cache); 3399 kmem_cache_destroy(iommu_domain_cache); 3400 iova_cache_put(); 3401 } 3402 3403 static void __init init_no_remapping_devices(void) 3404 { 3405 struct dmar_drhd_unit *drhd; 3406 struct device *dev; 3407 int i; 3408 3409 for_each_drhd_unit(drhd) { 3410 if (!drhd->include_all) { 3411 for_each_active_dev_scope(drhd->devices, 3412 drhd->devices_cnt, i, dev) 3413 break; 3414 /* ignore DMAR unit if no devices exist */ 3415 if (i == drhd->devices_cnt) 3416 drhd->ignored = 1; 3417 } 3418 } 3419 3420 
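	/*
	 * Second pass: mark DMAR units whose device scope consists solely of
	 * graphics devices as gfx-dedicated, and bypass them entirely when
	 * graphics mapping is disabled (dmar_map_gfx == 0).
	 */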
for_each_active_drhd_unit(drhd) { 3421 if (drhd->include_all) 3422 continue; 3423 3424 for_each_active_dev_scope(drhd->devices, 3425 drhd->devices_cnt, i, dev) 3426 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 3427 break; 3428 if (i < drhd->devices_cnt) 3429 continue; 3430 3431 /* This IOMMU has *only* gfx devices. Either bypass it or 3432 set the gfx_mapped flag, as appropriate */ 3433 drhd->gfx_dedicated = 1; 3434 if (!dmar_map_gfx) 3435 drhd->ignored = 1; 3436 } 3437 } 3438 3439 #ifdef CONFIG_SUSPEND 3440 static int init_iommu_hw(void) 3441 { 3442 struct dmar_drhd_unit *drhd; 3443 struct intel_iommu *iommu = NULL; 3444 3445 for_each_active_iommu(iommu, drhd) 3446 if (iommu->qi) 3447 dmar_reenable_qi(iommu); 3448 3449 for_each_iommu(iommu, drhd) { 3450 if (drhd->ignored) { 3451 /* 3452 * we always have to disable PMRs or DMA may fail on 3453 * this device 3454 */ 3455 if (force_on) 3456 iommu_disable_protect_mem_regions(iommu); 3457 continue; 3458 } 3459 3460 iommu_flush_write_buffer(iommu); 3461 3462 iommu_set_root_entry(iommu); 3463 3464 iommu->flush.flush_context(iommu, 0, 0, 0, 3465 DMA_CCMD_GLOBAL_INVL); 3466 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3467 iommu_enable_translation(iommu); 3468 iommu_disable_protect_mem_regions(iommu); 3469 } 3470 3471 return 0; 3472 } 3473 3474 static void iommu_flush_all(void) 3475 { 3476 struct dmar_drhd_unit *drhd; 3477 struct intel_iommu *iommu; 3478 3479 for_each_active_iommu(iommu, drhd) { 3480 iommu->flush.flush_context(iommu, 0, 0, 0, 3481 DMA_CCMD_GLOBAL_INVL); 3482 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 3483 DMA_TLB_GLOBAL_FLUSH); 3484 } 3485 } 3486 3487 static int iommu_suspend(void) 3488 { 3489 struct dmar_drhd_unit *drhd; 3490 struct intel_iommu *iommu = NULL; 3491 unsigned long flag; 3492 3493 for_each_active_iommu(iommu, drhd) { 3494 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 3495 GFP_KERNEL); 3496 if (!iommu->iommu_state) 3497 goto nomem; 3498 } 3499 3500 iommu_flush_all(); 3501 3502 for_each_active_iommu(iommu, drhd) { 3503 iommu_disable_translation(iommu); 3504 3505 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3506 3507 iommu->iommu_state[SR_DMAR_FECTL_REG] = 3508 readl(iommu->reg + DMAR_FECTL_REG); 3509 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 3510 readl(iommu->reg + DMAR_FEDATA_REG); 3511 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 3512 readl(iommu->reg + DMAR_FEADDR_REG); 3513 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 3514 readl(iommu->reg + DMAR_FEUADDR_REG); 3515 3516 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3517 } 3518 return 0; 3519 3520 nomem: 3521 for_each_active_iommu(iommu, drhd) 3522 kfree(iommu->iommu_state); 3523 3524 return -ENOMEM; 3525 } 3526 3527 static void iommu_resume(void) 3528 { 3529 struct dmar_drhd_unit *drhd; 3530 struct intel_iommu *iommu = NULL; 3531 unsigned long flag; 3532 3533 if (init_iommu_hw()) { 3534 if (force_on) 3535 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 3536 else 3537 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 3538 return; 3539 } 3540 3541 for_each_active_iommu(iommu, drhd) { 3542 3543 raw_spin_lock_irqsave(&iommu->register_lock, flag); 3544 3545 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 3546 iommu->reg + DMAR_FECTL_REG); 3547 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 3548 iommu->reg + DMAR_FEDATA_REG); 3549 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 3550 iommu->reg + DMAR_FEADDR_REG); 3551 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 3552 iommu->reg + DMAR_FEUADDR_REG); 
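		/*
		 * Fault-event reporting registers (FECTL/FEDATA/FEADDR/FEUADDR)
		 * are now restored from the state saved in iommu_suspend().
		 */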
3553 3554 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 3555 } 3556 3557 for_each_active_iommu(iommu, drhd) 3558 kfree(iommu->iommu_state); 3559 } 3560 3561 static struct syscore_ops iommu_syscore_ops = { 3562 .resume = iommu_resume, 3563 .suspend = iommu_suspend, 3564 }; 3565 3566 static void __init init_iommu_pm_ops(void) 3567 { 3568 register_syscore_ops(&iommu_syscore_ops); 3569 } 3570 3571 #else 3572 static inline void init_iommu_pm_ops(void) {} 3573 #endif /* CONFIG_PM */ 3574 3575 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 3576 { 3577 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 3578 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 3579 rmrr->end_address <= rmrr->base_address || 3580 arch_rmrr_sanity_check(rmrr)) 3581 return -EINVAL; 3582 3583 return 0; 3584 } 3585 3586 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 3587 { 3588 struct acpi_dmar_reserved_memory *rmrr; 3589 struct dmar_rmrr_unit *rmrru; 3590 3591 rmrr = (struct acpi_dmar_reserved_memory *)header; 3592 if (rmrr_sanity_check(rmrr)) { 3593 pr_warn(FW_BUG 3594 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 3595 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 3596 rmrr->base_address, rmrr->end_address, 3597 dmi_get_system_info(DMI_BIOS_VENDOR), 3598 dmi_get_system_info(DMI_BIOS_VERSION), 3599 dmi_get_system_info(DMI_PRODUCT_VERSION)); 3600 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 3601 } 3602 3603 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 3604 if (!rmrru) 3605 goto out; 3606 3607 rmrru->hdr = header; 3608 3609 rmrru->base_address = rmrr->base_address; 3610 rmrru->end_address = rmrr->end_address; 3611 3612 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 3613 ((void *)rmrr) + rmrr->header.length, 3614 &rmrru->devices_cnt); 3615 if (rmrru->devices_cnt && rmrru->devices == NULL) 3616 goto free_rmrru; 3617 3618 list_add(&rmrru->list, &dmar_rmrr_units); 3619 3620 return 0; 3621 free_rmrru: 3622 kfree(rmrru); 3623 out: 3624 return -ENOMEM; 3625 } 3626 3627 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 3628 { 3629 struct dmar_atsr_unit *atsru; 3630 struct acpi_dmar_atsr *tmp; 3631 3632 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 3633 dmar_rcu_check()) { 3634 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 3635 if (atsr->segment != tmp->segment) 3636 continue; 3637 if (atsr->header.length != tmp->header.length) 3638 continue; 3639 if (memcmp(atsr, tmp, atsr->header.length) == 0) 3640 return atsru; 3641 } 3642 3643 return NULL; 3644 } 3645 3646 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3647 { 3648 struct acpi_dmar_atsr *atsr; 3649 struct dmar_atsr_unit *atsru; 3650 3651 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3652 return 0; 3653 3654 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3655 atsru = dmar_find_atsr(atsr); 3656 if (atsru) 3657 return 0; 3658 3659 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 3660 if (!atsru) 3661 return -ENOMEM; 3662 3663 /* 3664 * If memory is allocated from slab by ACPI _DSM method, we need to 3665 * copy the memory content because the memory buffer will be freed 3666 * on return. 
3667 */ 3668 atsru->hdr = (void *)(atsru + 1); 3669 memcpy(atsru->hdr, hdr, hdr->length); 3670 atsru->include_all = atsr->flags & 0x1; 3671 if (!atsru->include_all) { 3672 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 3673 (void *)atsr + atsr->header.length, 3674 &atsru->devices_cnt); 3675 if (atsru->devices_cnt && atsru->devices == NULL) { 3676 kfree(atsru); 3677 return -ENOMEM; 3678 } 3679 } 3680 3681 list_add_rcu(&atsru->list, &dmar_atsr_units); 3682 3683 return 0; 3684 } 3685 3686 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 3687 { 3688 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 3689 kfree(atsru); 3690 } 3691 3692 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3693 { 3694 struct acpi_dmar_atsr *atsr; 3695 struct dmar_atsr_unit *atsru; 3696 3697 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3698 atsru = dmar_find_atsr(atsr); 3699 if (atsru) { 3700 list_del_rcu(&atsru->list); 3701 synchronize_rcu(); 3702 intel_iommu_free_atsr(atsru); 3703 } 3704 3705 return 0; 3706 } 3707 3708 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 3709 { 3710 int i; 3711 struct device *dev; 3712 struct acpi_dmar_atsr *atsr; 3713 struct dmar_atsr_unit *atsru; 3714 3715 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 3716 atsru = dmar_find_atsr(atsr); 3717 if (!atsru) 3718 return 0; 3719 3720 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 3721 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 3722 i, dev) 3723 return -EBUSY; 3724 } 3725 3726 return 0; 3727 } 3728 3729 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) 3730 { 3731 struct dmar_satc_unit *satcu; 3732 struct acpi_dmar_satc *tmp; 3733 3734 list_for_each_entry_rcu(satcu, &dmar_satc_units, list, 3735 dmar_rcu_check()) { 3736 tmp = (struct acpi_dmar_satc *)satcu->hdr; 3737 if (satc->segment != tmp->segment) 3738 continue; 3739 if (satc->header.length != tmp->header.length) 3740 continue; 3741 if (memcmp(satc, tmp, satc->header.length) == 0) 3742 return satcu; 3743 } 3744 3745 return NULL; 3746 } 3747 3748 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) 3749 { 3750 struct acpi_dmar_satc *satc; 3751 struct dmar_satc_unit *satcu; 3752 3753 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 3754 return 0; 3755 3756 satc = container_of(hdr, struct acpi_dmar_satc, header); 3757 satcu = dmar_find_satc(satc); 3758 if (satcu) 3759 return 0; 3760 3761 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); 3762 if (!satcu) 3763 return -ENOMEM; 3764 3765 satcu->hdr = (void *)(satcu + 1); 3766 memcpy(satcu->hdr, hdr, hdr->length); 3767 satcu->atc_required = satc->flags & 0x1; 3768 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), 3769 (void *)satc + satc->header.length, 3770 &satcu->devices_cnt); 3771 if (satcu->devices_cnt && !satcu->devices) { 3772 kfree(satcu); 3773 return -ENOMEM; 3774 } 3775 list_add_rcu(&satcu->list, &dmar_satc_units); 3776 3777 return 0; 3778 } 3779 3780 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 3781 { 3782 int sp, ret; 3783 struct intel_iommu *iommu = dmaru->iommu; 3784 3785 if (g_iommus[iommu->seq_id]) 3786 return 0; 3787 3788 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); 3789 if (ret) 3790 goto out; 3791 3792 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 3793 pr_warn("%s: Doesn't support hardware pass through.\n", 3794 iommu->name); 3795 return -ENXIO; 3796 } 3797 if (!ecap_sc_support(iommu->ecap) && 3798 
domain_update_iommu_snooping(iommu)) { 3799 pr_warn("%s: Doesn't support snooping.\n", 3800 iommu->name); 3801 return -ENXIO; 3802 } 3803 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 3804 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 3805 pr_warn("%s: Doesn't support large page.\n", 3806 iommu->name); 3807 return -ENXIO; 3808 } 3809 3810 /* 3811 * Disable translation if already enabled prior to OS handover. 3812 */ 3813 if (iommu->gcmd & DMA_GCMD_TE) 3814 iommu_disable_translation(iommu); 3815 3816 g_iommus[iommu->seq_id] = iommu; 3817 ret = iommu_init_domains(iommu); 3818 if (ret == 0) 3819 ret = iommu_alloc_root_entry(iommu); 3820 if (ret) 3821 goto out; 3822 3823 intel_svm_check(iommu); 3824 3825 if (dmaru->ignored) { 3826 /* 3827 * we always have to disable PMRs or DMA may fail on this device 3828 */ 3829 if (force_on) 3830 iommu_disable_protect_mem_regions(iommu); 3831 return 0; 3832 } 3833 3834 intel_iommu_init_qi(iommu); 3835 iommu_flush_write_buffer(iommu); 3836 3837 #ifdef CONFIG_INTEL_IOMMU_SVM 3838 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3839 ret = intel_svm_enable_prq(iommu); 3840 if (ret) 3841 goto disable_iommu; 3842 } 3843 #endif 3844 ret = dmar_set_interrupt(iommu); 3845 if (ret) 3846 goto disable_iommu; 3847 3848 iommu_set_root_entry(iommu); 3849 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3850 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3851 iommu_enable_translation(iommu); 3852 3853 iommu_disable_protect_mem_regions(iommu); 3854 return 0; 3855 3856 disable_iommu: 3857 disable_dmar_iommu(iommu); 3858 out: 3859 free_dmar_iommu(iommu); 3860 return ret; 3861 } 3862 3863 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 3864 { 3865 int ret = 0; 3866 struct intel_iommu *iommu = dmaru->iommu; 3867 3868 if (!intel_iommu_enabled) 3869 return 0; 3870 if (iommu == NULL) 3871 return -EINVAL; 3872 3873 if (insert) { 3874 ret = intel_iommu_add(dmaru); 3875 } else { 3876 disable_dmar_iommu(iommu); 3877 free_dmar_iommu(iommu); 3878 } 3879 3880 return ret; 3881 } 3882 3883 static void intel_iommu_free_dmars(void) 3884 { 3885 struct dmar_rmrr_unit *rmrru, *rmrr_n; 3886 struct dmar_atsr_unit *atsru, *atsr_n; 3887 struct dmar_satc_unit *satcu, *satc_n; 3888 3889 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 3890 list_del(&rmrru->list); 3891 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 3892 kfree(rmrru); 3893 } 3894 3895 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 3896 list_del(&atsru->list); 3897 intel_iommu_free_atsr(atsru); 3898 } 3899 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { 3900 list_del(&satcu->list); 3901 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); 3902 kfree(satcu); 3903 } 3904 } 3905 3906 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 3907 { 3908 int i, ret = 1; 3909 struct pci_bus *bus; 3910 struct pci_dev *bridge = NULL; 3911 struct device *tmp; 3912 struct acpi_dmar_atsr *atsr; 3913 struct dmar_atsr_unit *atsru; 3914 3915 dev = pci_physfn(dev); 3916 for (bus = dev->bus; bus; bus = bus->parent) { 3917 bridge = bus->self; 3918 /* If it's an integrated device, allow ATS */ 3919 if (!bridge) 3920 return 1; 3921 /* Connected via non-PCIe: no ATS */ 3922 if (!pci_is_pcie(bridge) || 3923 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 3924 return 0; 3925 /* If we found the root port, look it up in the ATSR */ 3926 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 3927 break; 3928 } 
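	/*
	 * The root port was found above; ATS is permitted if that root port
	 * is listed in the device scope of an ATSR for this segment, or if
	 * the ATSR covers all ports (include_all).
	 */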
3929 3930 rcu_read_lock(); 3931 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 3932 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3933 if (atsr->segment != pci_domain_nr(dev->bus)) 3934 continue; 3935 3936 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 3937 if (tmp == &bridge->dev) 3938 goto out; 3939 3940 if (atsru->include_all) 3941 goto out; 3942 } 3943 ret = 0; 3944 out: 3945 rcu_read_unlock(); 3946 3947 return ret; 3948 } 3949 3950 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 3951 { 3952 int ret; 3953 struct dmar_rmrr_unit *rmrru; 3954 struct dmar_atsr_unit *atsru; 3955 struct dmar_satc_unit *satcu; 3956 struct acpi_dmar_atsr *atsr; 3957 struct acpi_dmar_reserved_memory *rmrr; 3958 struct acpi_dmar_satc *satc; 3959 3960 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 3961 return 0; 3962 3963 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 3964 rmrr = container_of(rmrru->hdr, 3965 struct acpi_dmar_reserved_memory, header); 3966 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3967 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 3968 ((void *)rmrr) + rmrr->header.length, 3969 rmrr->segment, rmrru->devices, 3970 rmrru->devices_cnt); 3971 if (ret < 0) 3972 return ret; 3973 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3974 dmar_remove_dev_scope(info, rmrr->segment, 3975 rmrru->devices, rmrru->devices_cnt); 3976 } 3977 } 3978 3979 list_for_each_entry(atsru, &dmar_atsr_units, list) { 3980 if (atsru->include_all) 3981 continue; 3982 3983 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 3984 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 3985 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 3986 (void *)atsr + atsr->header.length, 3987 atsr->segment, atsru->devices, 3988 atsru->devices_cnt); 3989 if (ret > 0) 3990 break; 3991 else if (ret < 0) 3992 return ret; 3993 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 3994 if (dmar_remove_dev_scope(info, atsr->segment, 3995 atsru->devices, atsru->devices_cnt)) 3996 break; 3997 } 3998 } 3999 list_for_each_entry(satcu, &dmar_satc_units, list) { 4000 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); 4001 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4002 ret = dmar_insert_dev_scope(info, (void *)(satc + 1), 4003 (void *)satc + satc->header.length, 4004 satc->segment, satcu->devices, 4005 satcu->devices_cnt); 4006 if (ret > 0) 4007 break; 4008 else if (ret < 0) 4009 return ret; 4010 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4011 if (dmar_remove_dev_scope(info, satc->segment, 4012 satcu->devices, satcu->devices_cnt)) 4013 break; 4014 } 4015 } 4016 4017 return 0; 4018 } 4019 4020 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4021 unsigned long val, void *v) 4022 { 4023 struct memory_notify *mhp = v; 4024 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4025 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4026 mhp->nr_pages - 1); 4027 4028 switch (val) { 4029 case MEM_GOING_ONLINE: 4030 if (iommu_domain_identity_map(si_domain, 4031 start_vpfn, last_vpfn)) { 4032 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4033 start_vpfn, last_vpfn); 4034 return NOTIFY_BAD; 4035 } 4036 break; 4037 4038 case MEM_OFFLINE: 4039 case MEM_CANCEL_ONLINE: 4040 { 4041 struct dmar_drhd_unit *drhd; 4042 struct intel_iommu *iommu; 4043 struct page *freelist; 4044 4045 freelist = domain_unmap(si_domain, 4046 start_vpfn, last_vpfn, 4047 NULL); 4048 4049 rcu_read_lock(); 4050 
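		/*
		 * Page tables for the offlined range were just unlinked by
		 * domain_unmap(); issue a page-selective IOTLB flush on every
		 * active IOMMU before freeing the collected page-table pages.
		 */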
for_each_active_iommu(iommu, drhd) 4051 iommu_flush_iotlb_psi(iommu, si_domain, 4052 start_vpfn, mhp->nr_pages, 4053 !freelist, 0); 4054 rcu_read_unlock(); 4055 dma_free_pagelist(freelist); 4056 } 4057 break; 4058 } 4059 4060 return NOTIFY_OK; 4061 } 4062 4063 static struct notifier_block intel_iommu_memory_nb = { 4064 .notifier_call = intel_iommu_memory_notifier, 4065 .priority = 0 4066 }; 4067 4068 static void free_all_cpu_cached_iovas(unsigned int cpu) 4069 { 4070 int i; 4071 4072 for (i = 0; i < g_num_of_iommus; i++) { 4073 struct intel_iommu *iommu = g_iommus[i]; 4074 struct dmar_domain *domain; 4075 int did; 4076 4077 if (!iommu) 4078 continue; 4079 4080 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 4081 domain = get_iommu_domain(iommu, (u16)did); 4082 4083 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 4084 continue; 4085 4086 iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain); 4087 } 4088 } 4089 } 4090 4091 static int intel_iommu_cpu_dead(unsigned int cpu) 4092 { 4093 free_all_cpu_cached_iovas(cpu); 4094 return 0; 4095 } 4096 4097 static void intel_disable_iommus(void) 4098 { 4099 struct intel_iommu *iommu = NULL; 4100 struct dmar_drhd_unit *drhd; 4101 4102 for_each_iommu(iommu, drhd) 4103 iommu_disable_translation(iommu); 4104 } 4105 4106 void intel_iommu_shutdown(void) 4107 { 4108 struct dmar_drhd_unit *drhd; 4109 struct intel_iommu *iommu = NULL; 4110 4111 if (no_iommu || dmar_disabled) 4112 return; 4113 4114 down_write(&dmar_global_lock); 4115 4116 /* Disable PMRs explicitly here. */ 4117 for_each_iommu(iommu, drhd) 4118 iommu_disable_protect_mem_regions(iommu); 4119 4120 /* Make sure the IOMMUs are switched off */ 4121 intel_disable_iommus(); 4122 4123 up_write(&dmar_global_lock); 4124 } 4125 4126 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4127 { 4128 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4129 4130 return container_of(iommu_dev, struct intel_iommu, iommu); 4131 } 4132 4133 static ssize_t intel_iommu_show_version(struct device *dev, 4134 struct device_attribute *attr, 4135 char *buf) 4136 { 4137 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4138 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4139 return sprintf(buf, "%d:%d\n", 4140 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4141 } 4142 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4143 4144 static ssize_t intel_iommu_show_address(struct device *dev, 4145 struct device_attribute *attr, 4146 char *buf) 4147 { 4148 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4149 return sprintf(buf, "%llx\n", iommu->reg_phys); 4150 } 4151 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4152 4153 static ssize_t intel_iommu_show_cap(struct device *dev, 4154 struct device_attribute *attr, 4155 char *buf) 4156 { 4157 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4158 return sprintf(buf, "%llx\n", iommu->cap); 4159 } 4160 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4161 4162 static ssize_t intel_iommu_show_ecap(struct device *dev, 4163 struct device_attribute *attr, 4164 char *buf) 4165 { 4166 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4167 return sprintf(buf, "%llx\n", iommu->ecap); 4168 } 4169 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4170 4171 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4172 struct device_attribute *attr, 4173 char *buf) 4174 { 4175 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4176 return sprintf(buf, "%ld\n", 
cap_ndoms(iommu->cap)); 4177 } 4178 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4179 4180 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4181 struct device_attribute *attr, 4182 char *buf) 4183 { 4184 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4185 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4186 cap_ndoms(iommu->cap))); 4187 } 4188 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4189 4190 static struct attribute *intel_iommu_attrs[] = { 4191 &dev_attr_version.attr, 4192 &dev_attr_address.attr, 4193 &dev_attr_cap.attr, 4194 &dev_attr_ecap.attr, 4195 &dev_attr_domains_supported.attr, 4196 &dev_attr_domains_used.attr, 4197 NULL, 4198 }; 4199 4200 static struct attribute_group intel_iommu_group = { 4201 .name = "intel-iommu", 4202 .attrs = intel_iommu_attrs, 4203 }; 4204 4205 const struct attribute_group *intel_iommu_groups[] = { 4206 &intel_iommu_group, 4207 NULL, 4208 }; 4209 4210 static inline bool has_external_pci(void) 4211 { 4212 struct pci_dev *pdev = NULL; 4213 4214 for_each_pci_dev(pdev) 4215 if (pdev->external_facing) 4216 return true; 4217 4218 return false; 4219 } 4220 4221 static int __init platform_optin_force_iommu(void) 4222 { 4223 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4224 return 0; 4225 4226 if (no_iommu || dmar_disabled) 4227 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4228 4229 /* 4230 * If Intel-IOMMU is disabled by default, we will apply identity 4231 * map for all devices except those marked as being untrusted. 4232 */ 4233 if (dmar_disabled) 4234 iommu_set_default_passthrough(false); 4235 4236 dmar_disabled = 0; 4237 no_iommu = 0; 4238 4239 return 1; 4240 } 4241 4242 static int __init probe_acpi_namespace_devices(void) 4243 { 4244 struct dmar_drhd_unit *drhd; 4245 /* To avoid a -Wunused-but-set-variable warning. */ 4246 struct intel_iommu *iommu __maybe_unused; 4247 struct device *dev; 4248 int i, ret = 0; 4249 4250 for_each_active_iommu(iommu, drhd) { 4251 for_each_active_dev_scope(drhd->devices, 4252 drhd->devices_cnt, i, dev) { 4253 struct acpi_device_physical_node *pn; 4254 struct iommu_group *group; 4255 struct acpi_device *adev; 4256 4257 if (dev->bus != &acpi_bus_type) 4258 continue; 4259 4260 adev = to_acpi_device(dev); 4261 mutex_lock(&adev->physical_node_lock); 4262 list_for_each_entry(pn, 4263 &adev->physical_node_list, node) { 4264 group = iommu_group_get(pn->dev); 4265 if (group) { 4266 iommu_group_put(group); 4267 continue; 4268 } 4269 4270 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4271 ret = iommu_probe_device(pn->dev); 4272 if (ret) 4273 break; 4274 } 4275 mutex_unlock(&adev->physical_node_lock); 4276 4277 if (ret) 4278 return ret; 4279 } 4280 } 4281 4282 return 0; 4283 } 4284 4285 int __init intel_iommu_init(void) 4286 { 4287 int ret = -ENODEV; 4288 struct dmar_drhd_unit *drhd; 4289 struct intel_iommu *iommu; 4290 4291 /* 4292 * Intel IOMMU is required for a TXT/tboot launch or platform 4293 * opt in, so enforce that. 
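	 * When force_on is set, a failure in the early DMAR setup below is
	 * treated as fatal (panic) instead of quietly falling back to running
	 * without an IOMMU.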
4294 */ 4295 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || 4296 platform_optin_force_iommu(); 4297 4298 if (iommu_init_mempool()) { 4299 if (force_on) 4300 panic("tboot: Failed to initialize iommu memory\n"); 4301 return -ENOMEM; 4302 } 4303 4304 down_write(&dmar_global_lock); 4305 if (dmar_table_init()) { 4306 if (force_on) 4307 panic("tboot: Failed to initialize DMAR table\n"); 4308 goto out_free_dmar; 4309 } 4310 4311 if (dmar_dev_scope_init() < 0) { 4312 if (force_on) 4313 panic("tboot: Failed to initialize DMAR device scope\n"); 4314 goto out_free_dmar; 4315 } 4316 4317 up_write(&dmar_global_lock); 4318 4319 /* 4320 * The bus notifier takes the dmar_global_lock, so lockdep will 4321 * complain later when we register it under the lock. 4322 */ 4323 dmar_register_bus_notifier(); 4324 4325 down_write(&dmar_global_lock); 4326 4327 if (!no_iommu) 4328 intel_iommu_debugfs_init(); 4329 4330 if (no_iommu || dmar_disabled) { 4331 /* 4332 * We exit the function here to ensure IOMMU's remapping and 4333 * mempool aren't setup, which means that the IOMMU's PMRs 4334 * won't be disabled via the call to init_dmars(). So disable 4335 * it explicitly here. The PMRs were setup by tboot prior to 4336 * calling SENTER, but the kernel is expected to reset/tear 4337 * down the PMRs. 4338 */ 4339 if (intel_iommu_tboot_noforce) { 4340 for_each_iommu(iommu, drhd) 4341 iommu_disable_protect_mem_regions(iommu); 4342 } 4343 4344 /* 4345 * Make sure the IOMMUs are switched off, even when we 4346 * boot into a kexec kernel and the previous kernel left 4347 * them enabled 4348 */ 4349 intel_disable_iommus(); 4350 goto out_free_dmar; 4351 } 4352 4353 if (list_empty(&dmar_rmrr_units)) 4354 pr_info("No RMRR found\n"); 4355 4356 if (list_empty(&dmar_atsr_units)) 4357 pr_info("No ATSR found\n"); 4358 4359 if (list_empty(&dmar_satc_units)) 4360 pr_info("No SATC found\n"); 4361 4362 if (dmar_map_gfx) 4363 intel_iommu_gfx_mapped = 1; 4364 4365 init_no_remapping_devices(); 4366 4367 ret = init_dmars(); 4368 if (ret) { 4369 if (force_on) 4370 panic("tboot: Failed to initialize DMARs\n"); 4371 pr_err("Initialization failed\n"); 4372 goto out_free_dmar; 4373 } 4374 up_write(&dmar_global_lock); 4375 4376 init_iommu_pm_ops(); 4377 4378 down_read(&dmar_global_lock); 4379 for_each_active_iommu(iommu, drhd) { 4380 iommu_device_sysfs_add(&iommu->iommu, NULL, 4381 intel_iommu_groups, 4382 "%s", iommu->name); 4383 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4384 iommu_device_register(&iommu->iommu); 4385 } 4386 up_read(&dmar_global_lock); 4387 4388 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4389 if (si_domain && !hw_pass_through) 4390 register_memory_notifier(&intel_iommu_memory_nb); 4391 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4392 intel_iommu_cpu_dead); 4393 4394 down_read(&dmar_global_lock); 4395 if (probe_acpi_namespace_devices()) 4396 pr_warn("ACPI name space devices didn't probe correctly\n"); 4397 4398 /* Finally, we enable the DMA remapping hardware. 
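	   Translation is only switched on for units that are not ignored and
	   that did not already have translation enabled when this kernel took
	   over; the protected memory regions are disabled on every unit
	   either way.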
*/ 4399 for_each_iommu(iommu, drhd) { 4400 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4401 iommu_enable_translation(iommu); 4402 4403 iommu_disable_protect_mem_regions(iommu); 4404 } 4405 up_read(&dmar_global_lock); 4406 4407 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4408 4409 intel_iommu_enabled = 1; 4410 4411 return 0; 4412 4413 out_free_dmar: 4414 intel_iommu_free_dmars(); 4415 up_write(&dmar_global_lock); 4416 iommu_exit_mempool(); 4417 return ret; 4418 } 4419 4420 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4421 { 4422 struct intel_iommu *iommu = opaque; 4423 4424 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 4425 return 0; 4426 } 4427 4428 /* 4429 * NB - intel-iommu lacks any sort of reference counting for the users of 4430 * dependent devices. If multiple endpoints have intersecting dependent 4431 * devices, unbinding the driver from any one of them will possibly leave 4432 * the others unable to operate. 4433 */ 4434 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 4435 { 4436 if (!iommu || !dev || !dev_is_pci(dev)) 4437 return; 4438 4439 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 4440 } 4441 4442 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4443 { 4444 struct dmar_domain *domain; 4445 struct intel_iommu *iommu; 4446 unsigned long flags; 4447 4448 assert_spin_locked(&device_domain_lock); 4449 4450 if (WARN_ON(!info)) 4451 return; 4452 4453 iommu = info->iommu; 4454 domain = info->domain; 4455 4456 if (info->dev) { 4457 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4458 intel_pasid_tear_down_entry(iommu, info->dev, 4459 PASID_RID2PASID, false); 4460 4461 iommu_disable_dev_iotlb(info); 4462 if (!dev_is_real_dma_subdevice(info->dev)) 4463 domain_context_clear(iommu, info->dev); 4464 intel_pasid_free_table(info->dev); 4465 } 4466 4467 unlink_domain_info(info); 4468 4469 spin_lock_irqsave(&iommu->lock, flags); 4470 domain_detach_iommu(domain, iommu); 4471 spin_unlock_irqrestore(&iommu->lock, flags); 4472 4473 free_devinfo_mem(info); 4474 } 4475 4476 static void dmar_remove_one_dev_info(struct device *dev) 4477 { 4478 struct device_domain_info *info; 4479 unsigned long flags; 4480 4481 spin_lock_irqsave(&device_domain_lock, flags); 4482 info = get_domain_info(dev); 4483 if (info) 4484 __dmar_remove_one_dev_info(info); 4485 spin_unlock_irqrestore(&device_domain_lock, flags); 4486 } 4487 4488 static int md_domain_init(struct dmar_domain *domain, int guest_width) 4489 { 4490 int adjust_width; 4491 4492 /* calculate AGAW */ 4493 domain->gaw = guest_width; 4494 adjust_width = guestwidth_to_adjustwidth(guest_width); 4495 domain->agaw = width_to_agaw(adjust_width); 4496 4497 domain->iommu_coherency = 0; 4498 domain->iommu_snooping = 0; 4499 domain->iommu_superpage = 0; 4500 domain->max_addr = 0; 4501 4502 /* always allocate the top pgd */ 4503 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 4504 if (!domain->pgd) 4505 return -ENOMEM; 4506 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 4507 return 0; 4508 } 4509 4510 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 4511 { 4512 struct dmar_domain *dmar_domain; 4513 struct iommu_domain *domain; 4514 4515 switch (type) { 4516 case IOMMU_DOMAIN_DMA: 4517 case IOMMU_DOMAIN_UNMANAGED: 4518 dmar_domain = alloc_domain(0); 4519 if (!dmar_domain) { 4520 pr_err("Can't allocate dmar_domain\n"); 4521 return NULL; 4522 } 4523 
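		/*
		 * Both domain types start from the default address width;
		 * md_domain_init() only allocates the top level page table,
		 * lower levels are filled in lazily at map time.
		 */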
if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 4524 pr_err("Domain initialization failed\n"); 4525 domain_exit(dmar_domain); 4526 return NULL; 4527 } 4528 4529 if (type == IOMMU_DOMAIN_DMA && 4530 iommu_get_dma_cookie(&dmar_domain->domain)) 4531 return NULL; 4532 4533 domain = &dmar_domain->domain; 4534 domain->geometry.aperture_start = 0; 4535 domain->geometry.aperture_end = 4536 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 4537 domain->geometry.force_aperture = true; 4538 4539 return domain; 4540 case IOMMU_DOMAIN_IDENTITY: 4541 return &si_domain->domain; 4542 default: 4543 return NULL; 4544 } 4545 4546 return NULL; 4547 } 4548 4549 static void intel_iommu_domain_free(struct iommu_domain *domain) 4550 { 4551 if (domain != &si_domain->domain) 4552 domain_exit(to_dmar_domain(domain)); 4553 } 4554 4555 /* 4556 * Check whether a @domain could be attached to the @dev through the 4557 * aux-domain attach/detach APIs. 4558 */ 4559 static inline bool 4560 is_aux_domain(struct device *dev, struct iommu_domain *domain) 4561 { 4562 struct device_domain_info *info = get_domain_info(dev); 4563 4564 return info && info->auxd_enabled && 4565 domain->type == IOMMU_DOMAIN_UNMANAGED; 4566 } 4567 4568 static inline struct subdev_domain_info * 4569 lookup_subdev_info(struct dmar_domain *domain, struct device *dev) 4570 { 4571 struct subdev_domain_info *sinfo; 4572 4573 if (!list_empty(&domain->subdevices)) { 4574 list_for_each_entry(sinfo, &domain->subdevices, link_domain) { 4575 if (sinfo->pdev == dev) 4576 return sinfo; 4577 } 4578 } 4579 4580 return NULL; 4581 } 4582 4583 static int auxiliary_link_device(struct dmar_domain *domain, 4584 struct device *dev) 4585 { 4586 struct device_domain_info *info = get_domain_info(dev); 4587 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4588 4589 assert_spin_locked(&device_domain_lock); 4590 if (WARN_ON(!info)) 4591 return -EINVAL; 4592 4593 if (!sinfo) { 4594 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); 4595 sinfo->domain = domain; 4596 sinfo->pdev = dev; 4597 list_add(&sinfo->link_phys, &info->subdevices); 4598 list_add(&sinfo->link_domain, &domain->subdevices); 4599 } 4600 4601 return ++sinfo->users; 4602 } 4603 4604 static int auxiliary_unlink_device(struct dmar_domain *domain, 4605 struct device *dev) 4606 { 4607 struct device_domain_info *info = get_domain_info(dev); 4608 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); 4609 int ret; 4610 4611 assert_spin_locked(&device_domain_lock); 4612 if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) 4613 return -EINVAL; 4614 4615 ret = --sinfo->users; 4616 if (!ret) { 4617 list_del(&sinfo->link_phys); 4618 list_del(&sinfo->link_domain); 4619 kfree(sinfo); 4620 } 4621 4622 return ret; 4623 } 4624 4625 static int aux_domain_add_dev(struct dmar_domain *domain, 4626 struct device *dev) 4627 { 4628 int ret; 4629 unsigned long flags; 4630 struct intel_iommu *iommu; 4631 4632 iommu = device_to_iommu(dev, NULL, NULL); 4633 if (!iommu) 4634 return -ENODEV; 4635 4636 if (domain->default_pasid <= 0) { 4637 u32 pasid; 4638 4639 /* No private data needed for the default pasid */ 4640 pasid = ioasid_alloc(NULL, PASID_MIN, 4641 pci_max_pasids(to_pci_dev(dev)) - 1, 4642 NULL); 4643 if (pasid == INVALID_IOASID) { 4644 pr_err("Can't allocate default pasid\n"); 4645 return -ENODEV; 4646 } 4647 domain->default_pasid = pasid; 4648 } 4649 4650 spin_lock_irqsave(&device_domain_lock, flags); 4651 ret = auxiliary_link_device(domain, dev); 4652 if (ret <= 0) 4653 goto link_failed; 4654 4655 /* 4656 
* Subdevices from the same physical device can be attached to the 4657 * same domain. For such cases, only the first subdevice attachment 4658 * needs to go through the full steps in this function. So if ret > 4659 * 1, just goto out. 4660 */ 4661 if (ret > 1) 4662 goto out; 4663 4664 /* 4665 * iommu->lock must be held to attach domain to iommu and setup the 4666 * pasid entry for second level translation. 4667 */ 4668 spin_lock(&iommu->lock); 4669 ret = domain_attach_iommu(domain, iommu); 4670 if (ret) 4671 goto attach_failed; 4672 4673 /* Setup the PASID entry for mediated devices: */ 4674 if (domain_use_first_level(domain)) 4675 ret = domain_setup_first_level(iommu, domain, dev, 4676 domain->default_pasid); 4677 else 4678 ret = intel_pasid_setup_second_level(iommu, domain, dev, 4679 domain->default_pasid); 4680 if (ret) 4681 goto table_failed; 4682 4683 spin_unlock(&iommu->lock); 4684 out: 4685 spin_unlock_irqrestore(&device_domain_lock, flags); 4686 4687 return 0; 4688 4689 table_failed: 4690 domain_detach_iommu(domain, iommu); 4691 attach_failed: 4692 spin_unlock(&iommu->lock); 4693 auxiliary_unlink_device(domain, dev); 4694 link_failed: 4695 spin_unlock_irqrestore(&device_domain_lock, flags); 4696 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4697 ioasid_put(domain->default_pasid); 4698 4699 return ret; 4700 } 4701 4702 static void aux_domain_remove_dev(struct dmar_domain *domain, 4703 struct device *dev) 4704 { 4705 struct device_domain_info *info; 4706 struct intel_iommu *iommu; 4707 unsigned long flags; 4708 4709 if (!is_aux_domain(dev, &domain->domain)) 4710 return; 4711 4712 spin_lock_irqsave(&device_domain_lock, flags); 4713 info = get_domain_info(dev); 4714 iommu = info->iommu; 4715 4716 if (!auxiliary_unlink_device(domain, dev)) { 4717 spin_lock(&iommu->lock); 4718 intel_pasid_tear_down_entry(iommu, dev, 4719 domain->default_pasid, false); 4720 domain_detach_iommu(domain, iommu); 4721 spin_unlock(&iommu->lock); 4722 } 4723 4724 spin_unlock_irqrestore(&device_domain_lock, flags); 4725 4726 if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 4727 ioasid_put(domain->default_pasid); 4728 } 4729 4730 static int prepare_domain_attach_device(struct iommu_domain *domain, 4731 struct device *dev) 4732 { 4733 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4734 struct intel_iommu *iommu; 4735 int addr_width; 4736 4737 iommu = device_to_iommu(dev, NULL, NULL); 4738 if (!iommu) 4739 return -ENODEV; 4740 4741 /* check if this iommu agaw is sufficient for max mapped address */ 4742 addr_width = agaw_to_width(iommu->agaw); 4743 if (addr_width > cap_mgaw(iommu->cap)) 4744 addr_width = cap_mgaw(iommu->cap); 4745 4746 if (dmar_domain->max_addr > (1LL << addr_width)) { 4747 dev_err(dev, "%s: iommu width (%d) is not " 4748 "sufficient for the mapped address (%llx)\n", 4749 __func__, addr_width, dmar_domain->max_addr); 4750 return -EFAULT; 4751 } 4752 dmar_domain->gaw = addr_width; 4753 4754 /* 4755 * Knock out extra levels of page tables if necessary 4756 */ 4757 while (iommu->agaw < dmar_domain->agaw) { 4758 struct dma_pte *pte; 4759 4760 pte = dmar_domain->pgd; 4761 if (dma_pte_present(pte)) { 4762 dmar_domain->pgd = (struct dma_pte *) 4763 phys_to_virt(dma_pte_addr(pte)); 4764 free_pgtable_page(pte); 4765 } 4766 dmar_domain->agaw--; 4767 } 4768 4769 return 0; 4770 } 4771 4772 static int intel_iommu_attach_device(struct iommu_domain *domain, 4773 struct device *dev) 4774 { 4775 int ret; 4776 4777 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 4778 
device_is_rmrr_locked(dev)) { 4779 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n"); 4780 return -EPERM; 4781 } 4782 4783 if (is_aux_domain(dev, domain)) 4784 return -EPERM; 4785 4786 /* normally dev is not mapped */ 4787 if (unlikely(domain_context_mapped(dev))) { 4788 struct dmar_domain *old_domain; 4789 4790 old_domain = find_domain(dev); 4791 if (old_domain) 4792 dmar_remove_one_dev_info(dev); 4793 } 4794 4795 ret = prepare_domain_attach_device(domain, dev); 4796 if (ret) 4797 return ret; 4798 4799 return domain_add_dev_info(to_dmar_domain(domain), dev); 4800 } 4801 4802 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 4803 struct device *dev) 4804 { 4805 int ret; 4806 4807 if (!is_aux_domain(dev, domain)) 4808 return -EPERM; 4809 4810 ret = prepare_domain_attach_device(domain, dev); 4811 if (ret) 4812 return ret; 4813 4814 return aux_domain_add_dev(to_dmar_domain(domain), dev); 4815 } 4816 4817 static void intel_iommu_detach_device(struct iommu_domain *domain, 4818 struct device *dev) 4819 { 4820 dmar_remove_one_dev_info(dev); 4821 } 4822 4823 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 4824 struct device *dev) 4825 { 4826 aux_domain_remove_dev(to_dmar_domain(domain), dev); 4827 } 4828 4829 #ifdef CONFIG_INTEL_IOMMU_SVM 4830 /* 4831 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 4832 * VT-d granularity. Invalidation is typically included in the unmap operation 4833 * as a result of DMA or VFIO unmap. However, for assigned devices guest 4834 * owns the first level page tables. Invalidations of translation caches in the 4835 * guest are trapped and passed down to the host. 4836 * 4837 * vIOMMU in the guest will only expose first level page tables, therefore 4838 * we do not support IOTLB granularity for request without PASID (second level). 4839 * 4840 * For example, to find the VT-d granularity encoding for IOTLB 4841 * type and page selective granularity within PASID: 4842 * X: indexed by iommu cache type 4843 * Y: indexed by enum iommu_inv_granularity 4844 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 4845 */ 4846 4847 static const int 4848 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 4849 /* 4850 * PASID based IOTLB invalidation: PASID selective (per PASID), 4851 * page selective (address granularity) 4852 */ 4853 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 4854 /* PASID based dev TLBs */ 4855 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 4856 /* PASID cache */ 4857 {-EINVAL, -EINVAL, -EINVAL} 4858 }; 4859 4860 static inline int to_vtd_granularity(int type, int granu) 4861 { 4862 return inv_type_granu_table[type][granu]; 4863 } 4864 4865 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 4866 { 4867 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 4868 4869 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 4870 * IOMMU cache invalidate API passes granu_size in bytes, and number of 4871 * granu size in contiguous memory. 
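	 * For example, a (hypothetical) granu_size of 4KiB with 512 granules
	 * gives nr_pages = 512 and an encoding of order_base_2(512) = 9,
	 * i.e. a 2MiB range.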
4872 */ 4873 return order_base_2(nr_pages); 4874 } 4875 4876 static int 4877 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 4878 struct iommu_cache_invalidate_info *inv_info) 4879 { 4880 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 4881 struct device_domain_info *info; 4882 struct intel_iommu *iommu; 4883 unsigned long flags; 4884 int cache_type; 4885 u8 bus, devfn; 4886 u16 did, sid; 4887 int ret = 0; 4888 u64 size = 0; 4889 4890 if (!inv_info || !dmar_domain) 4891 return -EINVAL; 4892 4893 if (!dev || !dev_is_pci(dev)) 4894 return -ENODEV; 4895 4896 iommu = device_to_iommu(dev, &bus, &devfn); 4897 if (!iommu) 4898 return -ENODEV; 4899 4900 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 4901 return -EINVAL; 4902 4903 spin_lock_irqsave(&device_domain_lock, flags); 4904 spin_lock(&iommu->lock); 4905 info = get_domain_info(dev); 4906 if (!info) { 4907 ret = -EINVAL; 4908 goto out_unlock; 4909 } 4910 did = dmar_domain->iommu_did[iommu->seq_id]; 4911 sid = PCI_DEVID(bus, devfn); 4912 4913 /* Size is only valid in address selective invalidation */ 4914 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 4915 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 4916 inv_info->granu.addr_info.nb_granules); 4917 4918 for_each_set_bit(cache_type, 4919 (unsigned long *)&inv_info->cache, 4920 IOMMU_CACHE_INV_TYPE_NR) { 4921 int granu = 0; 4922 u64 pasid = 0; 4923 u64 addr = 0; 4924 4925 granu = to_vtd_granularity(cache_type, inv_info->granularity); 4926 if (granu == -EINVAL) { 4927 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 4928 cache_type, inv_info->granularity); 4929 break; 4930 } 4931 4932 /* 4933 * PASID is stored in different locations based on the 4934 * granularity. 4935 */ 4936 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 4937 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 4938 pasid = inv_info->granu.pasid_info.pasid; 4939 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4940 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 4941 pasid = inv_info->granu.addr_info.pasid; 4942 4943 switch (BIT(cache_type)) { 4944 case IOMMU_CACHE_INV_TYPE_IOTLB: 4945 /* HW will ignore LSB bits based on address mask */ 4946 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 4947 size && 4948 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 4949 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 4950 inv_info->granu.addr_info.addr, size); 4951 } 4952 4953 /* 4954 * If granu is PASID-selective, address is ignored. 4955 * We use npages = -1 to indicate that. 4956 */ 4957 qi_flush_piotlb(iommu, did, pasid, 4958 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 4959 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 4960 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 4961 4962 if (!info->ats_enabled) 4963 break; 4964 /* 4965 * Always flush device IOTLB if ATS is enabled. vIOMMU 4966 * in the guest may assume IOTLB flush is inclusive, 4967 * which is more efficient. 4968 */ 4969 fallthrough; 4970 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 4971 /* 4972 * PASID based device TLB invalidation does not support 4973 * IOMMU_INV_GRANU_PASID granularity but only supports 4974 * IOMMU_INV_GRANU_ADDR. 4975 * The equivalent of that is we set the size to be the 4976 * entire range of 64 bit. User only provides PASID info 4977 * without address info. So we set addr to 0. 
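			 * With addr == 0 and size == 64 - VTD_PAGE_SHIFT the
			 * device TLB invalidation below covers the whole
			 * address space for the given PASID.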
4978 */ 4979 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 4980 size = 64 - VTD_PAGE_SHIFT; 4981 addr = 0; 4982 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 4983 addr = inv_info->granu.addr_info.addr; 4984 } 4985 4986 if (info->ats_enabled) 4987 qi_flush_dev_iotlb_pasid(iommu, sid, 4988 info->pfsid, pasid, 4989 info->ats_qdep, addr, 4990 size); 4991 else 4992 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 4993 break; 4994 default: 4995 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 4996 cache_type); 4997 ret = -EINVAL; 4998 } 4999 } 5000 out_unlock: 5001 spin_unlock(&iommu->lock); 5002 spin_unlock_irqrestore(&device_domain_lock, flags); 5003 5004 return ret; 5005 } 5006 #endif 5007 5008 static int intel_iommu_map(struct iommu_domain *domain, 5009 unsigned long iova, phys_addr_t hpa, 5010 size_t size, int iommu_prot, gfp_t gfp) 5011 { 5012 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5013 u64 max_addr; 5014 int prot = 0; 5015 5016 if (iommu_prot & IOMMU_READ) 5017 prot |= DMA_PTE_READ; 5018 if (iommu_prot & IOMMU_WRITE) 5019 prot |= DMA_PTE_WRITE; 5020 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5021 prot |= DMA_PTE_SNP; 5022 5023 max_addr = iova + size; 5024 if (dmar_domain->max_addr < max_addr) { 5025 u64 end; 5026 5027 /* check if minimum agaw is sufficient for mapped address */ 5028 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5029 if (end < max_addr) { 5030 pr_err("%s: iommu width (%d) is not " 5031 "sufficient for the mapped address (%llx)\n", 5032 __func__, dmar_domain->gaw, max_addr); 5033 return -EFAULT; 5034 } 5035 dmar_domain->max_addr = max_addr; 5036 } 5037 /* Round up size to next multiple of PAGE_SIZE, if it and 5038 the low bits of hpa would take us onto the next page */ 5039 size = aligned_nrpages(hpa, size); 5040 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5041 hpa >> VTD_PAGE_SHIFT, size, prot); 5042 } 5043 5044 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5045 unsigned long iova, size_t size, 5046 struct iommu_iotlb_gather *gather) 5047 { 5048 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5049 unsigned long start_pfn, last_pfn; 5050 int level = 0; 5051 5052 /* Cope with horrid API which requires us to unmap more than the 5053 size argument if it happens to be a large-page mapping. 
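	   pfn_to_dma_pte() reports the level of the leaf entry covering
	   iova; if that entry is a superpage, size is rounded up below so
	   the whole large-page mapping is removed and reported back.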
*/ 5054 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5055 5056 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5057 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5058 5059 start_pfn = iova >> VTD_PAGE_SHIFT; 5060 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5061 5062 gather->freelist = domain_unmap(dmar_domain, start_pfn, 5063 last_pfn, gather->freelist); 5064 5065 if (dmar_domain->max_addr == iova + size) 5066 dmar_domain->max_addr = iova; 5067 5068 iommu_iotlb_gather_add_page(domain, gather, iova, size); 5069 5070 return size; 5071 } 5072 5073 static void intel_iommu_tlb_sync(struct iommu_domain *domain, 5074 struct iommu_iotlb_gather *gather) 5075 { 5076 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5077 unsigned long iova_pfn = IOVA_PFN(gather->start); 5078 size_t size = gather->end - gather->start; 5079 unsigned long start_pfn; 5080 unsigned long nrpages; 5081 int iommu_id; 5082 5083 nrpages = aligned_nrpages(gather->start, size); 5084 start_pfn = mm_to_dma_pfn(iova_pfn); 5085 5086 for_each_domain_iommu(iommu_id, dmar_domain) 5087 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5088 start_pfn, nrpages, !gather->freelist, 0); 5089 5090 dma_free_pagelist(gather->freelist); 5091 } 5092 5093 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5094 dma_addr_t iova) 5095 { 5096 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5097 struct dma_pte *pte; 5098 int level = 0; 5099 u64 phys = 0; 5100 5101 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5102 if (pte && dma_pte_present(pte)) 5103 phys = dma_pte_addr(pte) + 5104 (iova & (BIT_MASK(level_to_offset_bits(level) + 5105 VTD_PAGE_SHIFT) - 1)); 5106 5107 return phys; 5108 } 5109 5110 static bool intel_iommu_capable(enum iommu_cap cap) 5111 { 5112 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5113 return domain_update_iommu_snooping(NULL) == 1; 5114 if (cap == IOMMU_CAP_INTR_REMAP) 5115 return irq_remapping_enabled == 1; 5116 5117 return false; 5118 } 5119 5120 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5121 { 5122 struct intel_iommu *iommu; 5123 5124 iommu = device_to_iommu(dev, NULL, NULL); 5125 if (!iommu) 5126 return ERR_PTR(-ENODEV); 5127 5128 if (translation_pre_enabled(iommu)) 5129 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5130 5131 return &iommu->iommu; 5132 } 5133 5134 static void intel_iommu_release_device(struct device *dev) 5135 { 5136 struct intel_iommu *iommu; 5137 5138 iommu = device_to_iommu(dev, NULL, NULL); 5139 if (!iommu) 5140 return; 5141 5142 dmar_remove_one_dev_info(dev); 5143 5144 set_dma_ops(dev, NULL); 5145 } 5146 5147 static void intel_iommu_probe_finalize(struct device *dev) 5148 { 5149 dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT; 5150 struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 5151 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5152 5153 if (domain && domain->type == IOMMU_DOMAIN_DMA) 5154 iommu_setup_dma_ops(dev, base, 5155 __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base); 5156 else 5157 set_dma_ops(dev, NULL); 5158 } 5159 5160 static void intel_iommu_get_resv_regions(struct device *device, 5161 struct list_head *head) 5162 { 5163 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5164 struct iommu_resv_region *reg; 5165 struct dmar_rmrr_unit *rmrr; 5166 struct device *i_dev; 5167 int i; 5168 5169 down_read(&dmar_global_lock); 5170 for_each_rmrr_units(rmrr) { 5171 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5172 i, i_dev) { 5173 struct 
iommu_resv_region *resv; 5174 enum iommu_resv_type type; 5175 size_t length; 5176 5177 if (i_dev != device && 5178 !is_downstream_to_pci_bridge(device, i_dev)) 5179 continue; 5180 5181 length = rmrr->end_address - rmrr->base_address + 1; 5182 5183 type = device_rmrr_is_relaxable(device) ? 5184 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5185 5186 resv = iommu_alloc_resv_region(rmrr->base_address, 5187 length, prot, type); 5188 if (!resv) 5189 break; 5190 5191 list_add_tail(&resv->list, head); 5192 } 5193 } 5194 up_read(&dmar_global_lock); 5195 5196 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5197 if (dev_is_pci(device)) { 5198 struct pci_dev *pdev = to_pci_dev(device); 5199 5200 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5201 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5202 IOMMU_RESV_DIRECT_RELAXABLE); 5203 if (reg) 5204 list_add_tail(®->list, head); 5205 } 5206 } 5207 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5208 5209 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5210 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5211 0, IOMMU_RESV_MSI); 5212 if (!reg) 5213 return; 5214 list_add_tail(®->list, head); 5215 } 5216 5217 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5218 { 5219 struct device_domain_info *info; 5220 struct context_entry *context; 5221 struct dmar_domain *domain; 5222 unsigned long flags; 5223 u64 ctx_lo; 5224 int ret; 5225 5226 domain = find_domain(dev); 5227 if (!domain) 5228 return -EINVAL; 5229 5230 spin_lock_irqsave(&device_domain_lock, flags); 5231 spin_lock(&iommu->lock); 5232 5233 ret = -EINVAL; 5234 info = get_domain_info(dev); 5235 if (!info || !info->pasid_supported) 5236 goto out; 5237 5238 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5239 if (WARN_ON(!context)) 5240 goto out; 5241 5242 ctx_lo = context[0].lo; 5243 5244 if (!(ctx_lo & CONTEXT_PASIDE)) { 5245 ctx_lo |= CONTEXT_PASIDE; 5246 context[0].lo = ctx_lo; 5247 wmb(); 5248 iommu->flush.flush_context(iommu, 5249 domain->iommu_did[iommu->seq_id], 5250 PCI_DEVID(info->bus, info->devfn), 5251 DMA_CCMD_MASK_NOBIT, 5252 DMA_CCMD_DEVICE_INVL); 5253 } 5254 5255 /* Enable PASID support in the device, if it wasn't already */ 5256 if (!info->pasid_enabled) 5257 iommu_enable_dev_iotlb(info); 5258 5259 ret = 0; 5260 5261 out: 5262 spin_unlock(&iommu->lock); 5263 spin_unlock_irqrestore(&device_domain_lock, flags); 5264 5265 return ret; 5266 } 5267 5268 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5269 { 5270 if (dev_is_pci(dev)) 5271 return pci_device_group(dev); 5272 return generic_device_group(dev); 5273 } 5274 5275 static int intel_iommu_enable_auxd(struct device *dev) 5276 { 5277 struct device_domain_info *info; 5278 struct intel_iommu *iommu; 5279 unsigned long flags; 5280 int ret; 5281 5282 iommu = device_to_iommu(dev, NULL, NULL); 5283 if (!iommu || dmar_disabled) 5284 return -EINVAL; 5285 5286 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5287 return -EINVAL; 5288 5289 ret = intel_iommu_enable_pasid(iommu, dev); 5290 if (ret) 5291 return -ENODEV; 5292 5293 spin_lock_irqsave(&device_domain_lock, flags); 5294 info = get_domain_info(dev); 5295 info->auxd_enabled = 1; 5296 spin_unlock_irqrestore(&device_domain_lock, flags); 5297 5298 return 0; 5299 } 5300 5301 static int intel_iommu_disable_auxd(struct device *dev) 5302 { 5303 struct device_domain_info *info; 5304 unsigned long flags; 5305 5306 spin_lock_irqsave(&device_domain_lock, flags); 5307 info = get_domain_info(dev); 5308 if (!WARN_ON(!info)) 5309 info->auxd_enabled = 
0; 5310 spin_unlock_irqrestore(&device_domain_lock, flags); 5311 5312 return 0; 5313 } 5314 5315 /* 5316 * A PCI express designated vendor specific extended capability is defined 5317 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5318 * for system software and tools to detect endpoint devices supporting the 5319 * Intel scalable IO virtualization without host driver dependency. 5320 * 5321 * Returns the address of the matching extended capability structure within 5322 * the device's PCI configuration space or 0 if the device does not support 5323 * it. 5324 */ 5325 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5326 { 5327 int pos; 5328 u16 vendor, id; 5329 5330 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5331 while (pos) { 5332 pci_read_config_word(pdev, pos + 4, &vendor); 5333 pci_read_config_word(pdev, pos + 8, &id); 5334 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5335 return pos; 5336 5337 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5338 } 5339 5340 return 0; 5341 } 5342 5343 static bool 5344 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5345 { 5346 if (feat == IOMMU_DEV_FEAT_AUX) { 5347 int ret; 5348 5349 if (!dev_is_pci(dev) || dmar_disabled || 5350 !scalable_mode_support() || !pasid_mode_support()) 5351 return false; 5352 5353 ret = pci_pasid_features(to_pci_dev(dev)); 5354 if (ret < 0) 5355 return false; 5356 5357 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5358 } 5359 5360 if (feat == IOMMU_DEV_FEAT_SVA) { 5361 struct device_domain_info *info = get_domain_info(dev); 5362 5363 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5364 info->pasid_supported && info->pri_supported && 5365 info->ats_supported; 5366 } 5367 5368 return false; 5369 } 5370 5371 static int 5372 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5373 { 5374 if (feat == IOMMU_DEV_FEAT_AUX) 5375 return intel_iommu_enable_auxd(dev); 5376 5377 if (feat == IOMMU_DEV_FEAT_SVA) { 5378 struct device_domain_info *info = get_domain_info(dev); 5379 5380 if (!info) 5381 return -EINVAL; 5382 5383 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5384 return 0; 5385 } 5386 5387 return -ENODEV; 5388 } 5389 5390 static int 5391 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5392 { 5393 if (feat == IOMMU_DEV_FEAT_AUX) 5394 return intel_iommu_disable_auxd(dev); 5395 5396 return -ENODEV; 5397 } 5398 5399 static bool 5400 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5401 { 5402 struct device_domain_info *info = get_domain_info(dev); 5403 5404 if (feat == IOMMU_DEV_FEAT_AUX) 5405 return scalable_mode_support() && info && info->auxd_enabled; 5406 5407 return false; 5408 } 5409 5410 static int 5411 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5412 { 5413 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5414 5415 return dmar_domain->default_pasid > 0 ? 
5416 dmar_domain->default_pasid : -EINVAL; 5417 } 5418 5419 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5420 struct device *dev) 5421 { 5422 return attach_deferred(dev); 5423 } 5424 5425 static int 5426 intel_iommu_domain_set_attr(struct iommu_domain *domain, 5427 enum iommu_attr attr, void *data) 5428 { 5429 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5430 unsigned long flags; 5431 int ret = 0; 5432 5433 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 5434 return -EINVAL; 5435 5436 switch (attr) { 5437 case DOMAIN_ATTR_NESTING: 5438 spin_lock_irqsave(&device_domain_lock, flags); 5439 if (nested_mode_support() && 5440 list_empty(&dmar_domain->devices)) { 5441 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 5442 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 5443 } else { 5444 ret = -ENODEV; 5445 } 5446 spin_unlock_irqrestore(&device_domain_lock, flags); 5447 break; 5448 default: 5449 ret = -EINVAL; 5450 break; 5451 } 5452 5453 return ret; 5454 } 5455 5456 static bool domain_use_flush_queue(void) 5457 { 5458 struct dmar_drhd_unit *drhd; 5459 struct intel_iommu *iommu; 5460 bool r = true; 5461 5462 if (intel_iommu_strict) 5463 return false; 5464 5465 /* 5466 * The flush queue implementation does not perform page-selective 5467 * invalidations that are required for efficient TLB flushes in virtual 5468 * environments. The benefit of batching is likely to be much lower than 5469 * the overhead of synchronizing the virtual and physical IOMMU 5470 * page-tables. 5471 */ 5472 rcu_read_lock(); 5473 for_each_active_iommu(iommu, drhd) { 5474 if (!cap_caching_mode(iommu->cap)) 5475 continue; 5476 5477 pr_warn_once("IOMMU batching is disabled due to virtualization"); 5478 r = false; 5479 break; 5480 } 5481 rcu_read_unlock(); 5482 5483 return r; 5484 } 5485 5486 static int 5487 intel_iommu_domain_get_attr(struct iommu_domain *domain, 5488 enum iommu_attr attr, void *data) 5489 { 5490 switch (domain->type) { 5491 case IOMMU_DOMAIN_UNMANAGED: 5492 return -ENODEV; 5493 case IOMMU_DOMAIN_DMA: 5494 switch (attr) { 5495 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: 5496 *(int *)data = domain_use_flush_queue(); 5497 return 0; 5498 default: 5499 return -ENODEV; 5500 } 5501 break; 5502 default: 5503 return -EINVAL; 5504 } 5505 } 5506 5507 /* 5508 * Check that the device does not live on an external facing PCI port that is 5509 * marked as untrusted. Such devices should not be able to apply quirks and 5510 * thus not be able to bypass the IOMMU restrictions. 
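 * Returns true (and logs the affected device) when the caller should skip
 * applying its quirk.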
5511 */ 5512 static bool risky_device(struct pci_dev *pdev) 5513 { 5514 if (pdev->untrusted) { 5515 pci_info(pdev, 5516 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 5517 pdev->vendor, pdev->device); 5518 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 5519 return true; 5520 } 5521 return false; 5522 } 5523 5524 static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn, 5525 unsigned long clf_pages) 5526 { 5527 struct dma_pte *first_pte = NULL, *pte = NULL; 5528 unsigned long lvl_pages = 0; 5529 int level = 0; 5530 5531 while (clf_pages > 0) { 5532 if (!pte) { 5533 level = 0; 5534 pte = pfn_to_dma_pte(domain, clf_pfn, &level); 5535 if (WARN_ON(!pte)) 5536 return; 5537 first_pte = pte; 5538 lvl_pages = lvl_to_nr_pages(level); 5539 } 5540 5541 if (WARN_ON(!lvl_pages || clf_pages < lvl_pages)) 5542 return; 5543 5544 clf_pages -= lvl_pages; 5545 clf_pfn += lvl_pages; 5546 pte++; 5547 5548 if (!clf_pages || first_pte_in_page(pte) || 5549 (level > 1 && clf_pages < lvl_pages)) { 5550 domain_flush_cache(domain, first_pte, 5551 (void *)pte - (void *)first_pte); 5552 pte = NULL; 5553 } 5554 } 5555 } 5556 5557 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, 5558 unsigned long iova, size_t size) 5559 { 5560 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5561 unsigned long pages = aligned_nrpages(iova, size); 5562 unsigned long pfn = iova >> VTD_PAGE_SHIFT; 5563 struct intel_iommu *iommu; 5564 int iommu_id; 5565 5566 if (!dmar_domain->iommu_coherency) 5567 clflush_sync_map(dmar_domain, pfn, pages); 5568 5569 for_each_domain_iommu(iommu_id, dmar_domain) { 5570 iommu = g_iommus[iommu_id]; 5571 __mapping_notify_one(iommu, dmar_domain, pfn, pages); 5572 } 5573 } 5574 5575 const struct iommu_ops intel_iommu_ops = { 5576 .capable = intel_iommu_capable, 5577 .domain_alloc = intel_iommu_domain_alloc, 5578 .domain_free = intel_iommu_domain_free, 5579 .domain_get_attr = intel_iommu_domain_get_attr, 5580 .domain_set_attr = intel_iommu_domain_set_attr, 5581 .attach_dev = intel_iommu_attach_device, 5582 .detach_dev = intel_iommu_detach_device, 5583 .aux_attach_dev = intel_iommu_aux_attach_device, 5584 .aux_detach_dev = intel_iommu_aux_detach_device, 5585 .aux_get_pasid = intel_iommu_aux_get_pasid, 5586 .map = intel_iommu_map, 5587 .iotlb_sync_map = intel_iommu_iotlb_sync_map, 5588 .unmap = intel_iommu_unmap, 5589 .flush_iotlb_all = intel_flush_iotlb_all, 5590 .iotlb_sync = intel_iommu_tlb_sync, 5591 .iova_to_phys = intel_iommu_iova_to_phys, 5592 .probe_device = intel_iommu_probe_device, 5593 .probe_finalize = intel_iommu_probe_finalize, 5594 .release_device = intel_iommu_release_device, 5595 .get_resv_regions = intel_iommu_get_resv_regions, 5596 .put_resv_regions = generic_iommu_put_resv_regions, 5597 .device_group = intel_iommu_device_group, 5598 .dev_has_feat = intel_iommu_dev_has_feat, 5599 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 5600 .dev_enable_feat = intel_iommu_dev_enable_feat, 5601 .dev_disable_feat = intel_iommu_dev_disable_feat, 5602 .is_attach_deferred = intel_iommu_is_attach_deferred, 5603 .def_domain_type = device_def_domain_type, 5604 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 5605 #ifdef CONFIG_INTEL_IOMMU_SVM 5606 .cache_invalidate = intel_iommu_sva_invalidate, 5607 .sva_bind_gpasid = intel_svm_bind_gpasid, 5608 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 5609 .sva_bind = intel_svm_bind, 5610 .sva_unbind = intel_svm_unbind, 5611 .sva_get_pasid = intel_svm_get_pasid, 5612 .page_response = 
intel_svm_page_response, 5613 #endif 5614 }; 5615 5616 static void quirk_iommu_igfx(struct pci_dev *dev) 5617 { 5618 if (risky_device(dev)) 5619 return; 5620 5621 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 5622 dmar_map_gfx = 0; 5623 } 5624 5625 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 5626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 5627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 5628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 5629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 5630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 5631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 5632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 5633 5634 /* Broadwell igfx malfunctions with dmar */ 5635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 5636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 5637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 5638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 5656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 5659 5660 static void quirk_iommu_rwbf(struct pci_dev *dev) 5661 { 5662 if (risky_device(dev)) 5663 return; 5664 5665 /* 5666 * Mobile 4 Series Chipset neglects to set RWBF capability, 5667 * but needs it. Same seems to hold for the desktop versions. 
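	 * Setting rwbf_quirk makes the driver flush the write buffer even
	 * though the RWBF capability bit is not set.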
5668 */ 5669 pci_info(dev, "Forcing write-buffer flush capability\n"); 5670 rwbf_quirk = 1; 5671 } 5672 5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 5674 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 5675 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 5676 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 5677 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 5678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 5679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 5680 5681 #define GGC 0x52 5682 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 5683 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 5684 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 5685 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 5686 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 5687 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 5688 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 5689 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 5690 5691 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 5692 { 5693 unsigned short ggc; 5694 5695 if (risky_device(dev)) 5696 return; 5697 5698 if (pci_read_config_word(dev, GGC, &ggc)) 5699 return; 5700 5701 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 5702 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 5703 dmar_map_gfx = 0; 5704 } else if (dmar_map_gfx) { 5705 /* we have to ensure the gfx device is idle before we flush */ 5706 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 5707 intel_iommu_strict = 1; 5708 } 5709 } 5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 5714 5715 static void quirk_igfx_skip_te_disable(struct pci_dev *dev) 5716 { 5717 unsigned short ver; 5718 5719 if (!IS_GFX_DEVICE(dev)) 5720 return; 5721 5722 ver = (dev->device >> 8) & 0xff; 5723 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 5724 ver != 0x4e && ver != 0x8a && ver != 0x98 && 5725 ver != 0x9a) 5726 return; 5727 5728 if (risky_device(dev)) 5729 return; 5730 5731 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 5732 iommu_skip_te_disable = 1; 5733 } 5734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 5735 5736 /* On Tylersburg chipsets, some BIOSes have been known to enable the 5737 ISOCH DMAR unit for the Azalia sound device, but not give it any 5738 TLB entries, which causes it to deadlock. Check for that. We do 5739 this in a function called from init_dmars(), instead of in a PCI 5740 quirk, because we don't want to print the obnoxious "BIOS broken" 5741 message if VT-d is actually disabled. 5742 */ 5743 static void __init check_tylersburg_isoch(void) 5744 { 5745 struct pci_dev *pdev; 5746 uint32_t vtisochctrl; 5747 5748 /* If there's no Azalia in the system anyway, forget it. */ 5749 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 5750 if (!pdev) 5751 return; 5752 5753 if (risky_device(pdev)) { 5754 pci_dev_put(pdev); 5755 return; 5756 } 5757 5758 pci_dev_put(pdev); 5759 5760 /* System Management Registers. Might be hidden, in which case 5761 we can't do the sanity check. But that's OK, because the 5762 known-broken BIOSes _don't_ actually hide it, so far. 
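	   The VT ISOCH control value read from config offset 0x188 below
	   tells us how the Azalia traffic is routed and how many TLB
	   entries the isoch DMAR unit was given.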
*/ 5763 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 5764 if (!pdev) 5765 return; 5766 5767 if (risky_device(pdev)) { 5768 pci_dev_put(pdev); 5769 return; 5770 } 5771 5772 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 5773 pci_dev_put(pdev); 5774 return; 5775 } 5776 5777 pci_dev_put(pdev); 5778 5779 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ 5780 if (vtisochctrl & 1) 5781 return; 5782 5783 /* Drop all bits other than the number of TLB entries */ 5784 vtisochctrl &= 0x1c; 5785 5786 /* If we have the recommended number of TLB entries (16), fine. */ 5787 if (vtisochctrl == 0x10) 5788 return; 5789 5790 /* Zero TLB entries? You get to ride the short bus to school. */ 5791 if (!vtisochctrl) { 5792 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 5793 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 5794 dmi_get_system_info(DMI_BIOS_VENDOR), 5795 dmi_get_system_info(DMI_BIOS_VERSION), 5796 dmi_get_system_info(DMI_PRODUCT_VERSION)); 5797 iommu_identity_mapping |= IDENTMAP_AZALIA; 5798 return; 5799 } 5800 5801 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 5802 vtisochctrl); 5803 } 5804
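/*
 * Illustrative user-space sketch (not part of this driver, kept out of the
 * build): reading the per-IOMMU sysfs attributes exported above through
 * intel_iommu_groups. The unit name "dmar0" and the /sys/class/iommu path
 * are assumptions about the running system.
 */
#if 0
#include <stdio.h>

int main(void)
{
	static const char * const attrs[] = {
		"version", "address", "cap", "ecap",
		"domains_supported", "domains_used",
	};
	char path[128], buf[64];
	unsigned int i;

	for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/class/iommu/dmar0/intel-iommu/%s", attrs[i]);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", attrs[i], buf);
		fclose(f);
	}
	return 0;
}
#endif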