1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright © 2006-2014 Intel Corporation. 4 * 5 * Authors: David Woodhouse <dwmw2@infradead.org>, 6 * Ashok Raj <ashok.raj@intel.com>, 7 * Shaohua Li <shaohua.li@intel.com>, 8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 * Joerg Roedel <jroedel@suse.de> 11 */ 12 13 #define pr_fmt(fmt) "DMAR: " fmt 14 #define dev_fmt(fmt) pr_fmt(fmt) 15 16 #include <linux/init.h> 17 #include <linux/bitmap.h> 18 #include <linux/debugfs.h> 19 #include <linux/export.h> 20 #include <linux/slab.h> 21 #include <linux/irq.h> 22 #include <linux/interrupt.h> 23 #include <linux/spinlock.h> 24 #include <linux/pci.h> 25 #include <linux/dmar.h> 26 #include <linux/dma-mapping.h> 27 #include <linux/mempool.h> 28 #include <linux/memory.h> 29 #include <linux/cpu.h> 30 #include <linux/timer.h> 31 #include <linux/io.h> 32 #include <linux/iova.h> 33 #include <linux/iommu.h> 34 #include <linux/intel-iommu.h> 35 #include <linux/syscore_ops.h> 36 #include <linux/tboot.h> 37 #include <linux/dmi.h> 38 #include <linux/pci-ats.h> 39 #include <linux/memblock.h> 40 #include <linux/dma-contiguous.h> 41 #include <linux/dma-direct.h> 42 #include <linux/crash_dump.h> 43 #include <linux/numa.h> 44 #include <linux/swiotlb.h> 45 #include <asm/irq_remapping.h> 46 #include <asm/cacheflush.h> 47 #include <asm/iommu.h> 48 #include <trace/events/intel_iommu.h> 49 50 #include "../irq_remapping.h" 51 #include "pasid.h" 52 53 #define ROOT_SIZE VTD_PAGE_SIZE 54 #define CONTEXT_SIZE VTD_PAGE_SIZE 55 56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) 57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) 58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) 59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) 60 61 #define IOAPIC_RANGE_START (0xfee00000) 62 #define IOAPIC_RANGE_END (0xfeefffff) 63 #define IOVA_START_ADDR 
(0x1000) 64 65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 66 67 #define MAX_AGAW_WIDTH 64 68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) 69 70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1) 71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1) 72 73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 78 79 /* IO virtual address start page frame number */ 80 #define IOVA_START_PFN (1) 81 82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) 83 84 /* page table handling */ 85 #define LEVEL_STRIDE (9) 86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) 87 88 /* 89 * This bitmap is used to advertise the page sizes our hardware support 90 * to the IOMMU core, which will then use this information to split 91 * physically contiguous memory regions it is mapping into page sizes 92 * that we support. 93 * 94 * Traditionally the IOMMU core just handed us the mappings directly, 95 * after making sure the size is an order of a 4KiB page and that the 96 * mapping has natural alignment. 97 * 98 * To retain this behavior, we currently advertise that we support 99 * all page sizes that are an order of 4KiB. 100 * 101 * If at some point we'd like to utilize the IOMMU core's new behavior, 102 * we could change this to advertise the real page sizes we support. 
103 */ 104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL) 105 106 static inline int agaw_to_level(int agaw) 107 { 108 return agaw + 2; 109 } 110 111 static inline int agaw_to_width(int agaw) 112 { 113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 114 } 115 116 static inline int width_to_agaw(int width) 117 { 118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); 119 } 120 121 static inline unsigned int level_to_offset_bits(int level) 122 { 123 return (level - 1) * LEVEL_STRIDE; 124 } 125 126 static inline int pfn_level_offset(u64 pfn, int level) 127 { 128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 129 } 130 131 static inline u64 level_mask(int level) 132 { 133 return -1ULL << level_to_offset_bits(level); 134 } 135 136 static inline u64 level_size(int level) 137 { 138 return 1ULL << level_to_offset_bits(level); 139 } 140 141 static inline u64 align_to_level(u64 pfn, int level) 142 { 143 return (pfn + level_size(level) - 1) & level_mask(level); 144 } 145 146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 147 { 148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 149 } 150 151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things 152 are never going to work. 
*/ 153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) 154 { 155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT); 156 } 157 158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn) 159 { 160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); 161 } 162 static inline unsigned long page_to_dma_pfn(struct page *pg) 163 { 164 return mm_to_dma_pfn(page_to_pfn(pg)); 165 } 166 static inline unsigned long virt_to_dma_pfn(void *p) 167 { 168 return page_to_dma_pfn(virt_to_page(p)); 169 } 170 171 /* global iommu list, set NULL for ignored DMAR units */ 172 static struct intel_iommu **g_iommus; 173 174 static void __init check_tylersburg_isoch(void); 175 static int rwbf_quirk; 176 177 /* 178 * set to 1 to panic kernel if can't successfully enable VT-d 179 * (used when kernel is launched w/ TXT) 180 */ 181 static int force_on = 0; 182 int intel_iommu_tboot_noforce; 183 static int no_platform_optin; 184 185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) 186 187 /* 188 * Take a root_entry and return the Lower Context Table Pointer (LCTP) 189 * if marked present. 190 */ 191 static phys_addr_t root_entry_lctp(struct root_entry *re) 192 { 193 if (!(re->lo & 1)) 194 return 0; 195 196 return re->lo & VTD_PAGE_MASK; 197 } 198 199 /* 200 * Take a root_entry and return the Upper Context Table Pointer (UCTP) 201 * if marked present. 
202 */ 203 static phys_addr_t root_entry_uctp(struct root_entry *re) 204 { 205 if (!(re->hi & 1)) 206 return 0; 207 208 return re->hi & VTD_PAGE_MASK; 209 } 210 211 static inline void context_clear_pasid_enable(struct context_entry *context) 212 { 213 context->lo &= ~(1ULL << 11); 214 } 215 216 static inline bool context_pasid_enabled(struct context_entry *context) 217 { 218 return !!(context->lo & (1ULL << 11)); 219 } 220 221 static inline void context_set_copied(struct context_entry *context) 222 { 223 context->hi |= (1ull << 3); 224 } 225 226 static inline bool context_copied(struct context_entry *context) 227 { 228 return !!(context->hi & (1ULL << 3)); 229 } 230 231 static inline bool __context_present(struct context_entry *context) 232 { 233 return (context->lo & 1); 234 } 235 236 bool context_present(struct context_entry *context) 237 { 238 return context_pasid_enabled(context) ? 239 __context_present(context) : 240 __context_present(context) && !context_copied(context); 241 } 242 243 static inline void context_set_present(struct context_entry *context) 244 { 245 context->lo |= 1; 246 } 247 248 static inline void context_set_fault_enable(struct context_entry *context) 249 { 250 context->lo &= (((u64)-1) << 2) | 1; 251 } 252 253 static inline void context_set_translation_type(struct context_entry *context, 254 unsigned long value) 255 { 256 context->lo &= (((u64)-1) << 4) | 3; 257 context->lo |= (value & 3) << 2; 258 } 259 260 static inline void context_set_address_root(struct context_entry *context, 261 unsigned long value) 262 { 263 context->lo &= ~VTD_PAGE_MASK; 264 context->lo |= value & VTD_PAGE_MASK; 265 } 266 267 static inline void context_set_address_width(struct context_entry *context, 268 unsigned long value) 269 { 270 context->hi |= value & 7; 271 } 272 273 static inline void context_set_domain_id(struct context_entry *context, 274 unsigned long value) 275 { 276 context->hi |= (value & ((1 << 16) - 1)) << 8; 277 } 278 279 static inline int 
context_domain_id(struct context_entry *c) 280 { 281 return((c->hi >> 8) & 0xffff); 282 } 283 284 static inline void context_clear_entry(struct context_entry *context) 285 { 286 context->lo = 0; 287 context->hi = 0; 288 } 289 290 /* 291 * This domain is a statically identity mapping domain. 292 * 1. This domain creats a static 1:1 mapping to all usable memory. 293 * 2. It maps to each iommu if successful. 294 * 3. Each iommu mapps to this domain if successful. 295 */ 296 static struct dmar_domain *si_domain; 297 static int hw_pass_through = 1; 298 299 #define for_each_domain_iommu(idx, domain) \ 300 for (idx = 0; idx < g_num_of_iommus; idx++) \ 301 if (domain->iommu_refcnt[idx]) 302 303 struct dmar_rmrr_unit { 304 struct list_head list; /* list of rmrr units */ 305 struct acpi_dmar_header *hdr; /* ACPI header */ 306 u64 base_address; /* reserved base address*/ 307 u64 end_address; /* reserved end address */ 308 struct dmar_dev_scope *devices; /* target devices */ 309 int devices_cnt; /* target device count */ 310 }; 311 312 struct dmar_atsr_unit { 313 struct list_head list; /* list of ATSR units */ 314 struct acpi_dmar_header *hdr; /* ACPI header */ 315 struct dmar_dev_scope *devices; /* target devices */ 316 int devices_cnt; /* target device count */ 317 u8 include_all:1; /* include all ports */ 318 }; 319 320 static LIST_HEAD(dmar_atsr_units); 321 static LIST_HEAD(dmar_rmrr_units); 322 323 #define for_each_rmrr_units(rmrr) \ 324 list_for_each_entry(rmrr, &dmar_rmrr_units, list) 325 326 /* bitmap for indexing intel_iommus */ 327 static int g_num_of_iommus; 328 329 static void domain_exit(struct dmar_domain *domain); 330 static void domain_remove_dev_info(struct dmar_domain *domain); 331 static void dmar_remove_one_dev_info(struct device *dev); 332 static void __dmar_remove_one_dev_info(struct device_domain_info *info); 333 static int intel_iommu_attach_device(struct iommu_domain *domain, 334 struct device *dev); 335 static phys_addr_t 
intel_iommu_iova_to_phys(struct iommu_domain *domain, 336 dma_addr_t iova); 337 338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON 339 int dmar_disabled = 0; 340 #else 341 int dmar_disabled = 1; 342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ 343 344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON 345 int intel_iommu_sm = 1; 346 #else 347 int intel_iommu_sm; 348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ 349 350 int intel_iommu_enabled = 0; 351 EXPORT_SYMBOL_GPL(intel_iommu_enabled); 352 353 static int dmar_map_gfx = 1; 354 static int dmar_forcedac; 355 static int intel_iommu_strict; 356 static int intel_iommu_superpage = 1; 357 static int iommu_identity_mapping; 358 static int intel_no_bounce; 359 static int iommu_skip_te_disable; 360 361 #define IDENTMAP_GFX 2 362 #define IDENTMAP_AZALIA 4 363 364 int intel_iommu_gfx_mapped; 365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); 366 367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) 368 struct device_domain_info *get_domain_info(struct device *dev) 369 { 370 struct device_domain_info *info; 371 372 if (!dev) 373 return NULL; 374 375 info = dev_iommu_priv_get(dev); 376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO)) 377 return NULL; 378 379 return info; 380 } 381 382 DEFINE_SPINLOCK(device_domain_lock); 383 static LIST_HEAD(device_domain_list); 384 385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \ 386 to_pci_dev(d)->untrusted) 387 388 /* 389 * Iterate over elements in device_domain_list and call the specified 390 * callback @fn against each element. 
391 */ 392 int for_each_device_domain(int (*fn)(struct device_domain_info *info, 393 void *data), void *data) 394 { 395 int ret = 0; 396 unsigned long flags; 397 struct device_domain_info *info; 398 399 spin_lock_irqsave(&device_domain_lock, flags); 400 list_for_each_entry(info, &device_domain_list, global) { 401 ret = fn(info, data); 402 if (ret) { 403 spin_unlock_irqrestore(&device_domain_lock, flags); 404 return ret; 405 } 406 } 407 spin_unlock_irqrestore(&device_domain_lock, flags); 408 409 return 0; 410 } 411 412 const struct iommu_ops intel_iommu_ops; 413 414 static bool translation_pre_enabled(struct intel_iommu *iommu) 415 { 416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); 417 } 418 419 static void clear_translation_pre_enabled(struct intel_iommu *iommu) 420 { 421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; 422 } 423 424 static void init_translation_status(struct intel_iommu *iommu) 425 { 426 u32 gsts; 427 428 gsts = readl(iommu->reg + DMAR_GSTS_REG); 429 if (gsts & DMA_GSTS_TES) 430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; 431 } 432 433 static int __init intel_iommu_setup(char *str) 434 { 435 if (!str) 436 return -EINVAL; 437 while (*str) { 438 if (!strncmp(str, "on", 2)) { 439 dmar_disabled = 0; 440 pr_info("IOMMU enabled\n"); 441 } else if (!strncmp(str, "off", 3)) { 442 dmar_disabled = 1; 443 no_platform_optin = 1; 444 pr_info("IOMMU disabled\n"); 445 } else if (!strncmp(str, "igfx_off", 8)) { 446 dmar_map_gfx = 0; 447 pr_info("Disable GFX device mapping\n"); 448 } else if (!strncmp(str, "forcedac", 8)) { 449 pr_info("Forcing DAC for PCI devices\n"); 450 dmar_forcedac = 1; 451 } else if (!strncmp(str, "strict", 6)) { 452 pr_info("Disable batched IOTLB flush\n"); 453 intel_iommu_strict = 1; 454 } else if (!strncmp(str, "sp_off", 6)) { 455 pr_info("Disable supported super page\n"); 456 intel_iommu_superpage = 0; 457 } else if (!strncmp(str, "sm_on", 5)) { 458 pr_info("Intel-IOMMU: scalable mode supported\n"); 459 intel_iommu_sm = 1; 460 } else 
if (!strncmp(str, "tboot_noforce", 13)) { 461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); 462 intel_iommu_tboot_noforce = 1; 463 } else if (!strncmp(str, "nobounce", 8)) { 464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); 465 intel_no_bounce = 1; 466 } 467 468 str += strcspn(str, ","); 469 while (*str == ',') 470 str++; 471 } 472 return 0; 473 } 474 __setup("intel_iommu=", intel_iommu_setup); 475 476 static struct kmem_cache *iommu_domain_cache; 477 static struct kmem_cache *iommu_devinfo_cache; 478 479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did) 480 { 481 struct dmar_domain **domains; 482 int idx = did >> 8; 483 484 domains = iommu->domains[idx]; 485 if (!domains) 486 return NULL; 487 488 return domains[did & 0xff]; 489 } 490 491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did, 492 struct dmar_domain *domain) 493 { 494 struct dmar_domain **domains; 495 int idx = did >> 8; 496 497 if (!iommu->domains[idx]) { 498 size_t size = 256 * sizeof(struct dmar_domain *); 499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC); 500 } 501 502 domains = iommu->domains[idx]; 503 if (WARN_ON(!domains)) 504 return; 505 else 506 domains[did & 0xff] = domain; 507 } 508 509 void *alloc_pgtable_page(int node) 510 { 511 struct page *page; 512 void *vaddr = NULL; 513 514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); 515 if (page) 516 vaddr = page_address(page); 517 return vaddr; 518 } 519 520 void free_pgtable_page(void *vaddr) 521 { 522 free_page((unsigned long)vaddr); 523 } 524 525 static inline void *alloc_domain_mem(void) 526 { 527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC); 528 } 529 530 static void free_domain_mem(void *vaddr) 531 { 532 kmem_cache_free(iommu_domain_cache, vaddr); 533 } 534 535 static inline void * alloc_devinfo_mem(void) 536 { 537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC); 538 } 
539 540 static inline void free_devinfo_mem(void *vaddr) 541 { 542 kmem_cache_free(iommu_devinfo_cache, vaddr); 543 } 544 545 static inline int domain_type_is_si(struct dmar_domain *domain) 546 { 547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; 548 } 549 550 static inline bool domain_use_first_level(struct dmar_domain *domain) 551 { 552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; 553 } 554 555 static inline int domain_pfn_supported(struct dmar_domain *domain, 556 unsigned long pfn) 557 { 558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 559 560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 561 } 562 563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) 564 { 565 unsigned long sagaw; 566 int agaw = -1; 567 568 sagaw = cap_sagaw(iommu->cap); 569 for (agaw = width_to_agaw(max_gaw); 570 agaw >= 0; agaw--) { 571 if (test_bit(agaw, &sagaw)) 572 break; 573 } 574 575 return agaw; 576 } 577 578 /* 579 * Calculate max SAGAW for each iommu. 580 */ 581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu) 582 { 583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); 584 } 585 586 /* 587 * calculate agaw for each iommu. 588 * "SAGAW" may be different across iommus, use a default agaw, and 589 * get a supported less agaw for iommus that don't support the default agaw. 590 */ 591 int iommu_calculate_agaw(struct intel_iommu *iommu) 592 { 593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); 594 } 595 596 /* This functionin only returns single iommu in a domain */ 597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) 598 { 599 int iommu_id; 600 601 /* si_domain and vm domain should not get here. 
*/ 602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) 603 return NULL; 604 605 for_each_domain_iommu(iommu_id, domain) 606 break; 607 608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus) 609 return NULL; 610 611 return g_iommus[iommu_id]; 612 } 613 614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) 615 { 616 return sm_supported(iommu) ? 617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 618 } 619 620 static void domain_update_iommu_coherency(struct dmar_domain *domain) 621 { 622 struct dmar_drhd_unit *drhd; 623 struct intel_iommu *iommu; 624 bool found = false; 625 int i; 626 627 domain->iommu_coherency = 1; 628 629 for_each_domain_iommu(i, domain) { 630 found = true; 631 if (!iommu_paging_structure_coherency(g_iommus[i])) { 632 domain->iommu_coherency = 0; 633 break; 634 } 635 } 636 if (found) 637 return; 638 639 /* No hardware attached; use lowest common denominator */ 640 rcu_read_lock(); 641 for_each_active_iommu(iommu, drhd) { 642 if (!iommu_paging_structure_coherency(iommu)) { 643 domain->iommu_coherency = 0; 644 break; 645 } 646 } 647 rcu_read_unlock(); 648 } 649 650 static int domain_update_iommu_snooping(struct intel_iommu *skip) 651 { 652 struct dmar_drhd_unit *drhd; 653 struct intel_iommu *iommu; 654 int ret = 1; 655 656 rcu_read_lock(); 657 for_each_active_iommu(iommu, drhd) { 658 if (iommu != skip) { 659 if (!ecap_sc_support(iommu->ecap)) { 660 ret = 0; 661 break; 662 } 663 } 664 } 665 rcu_read_unlock(); 666 667 return ret; 668 } 669 670 static int domain_update_iommu_superpage(struct dmar_domain *domain, 671 struct intel_iommu *skip) 672 { 673 struct dmar_drhd_unit *drhd; 674 struct intel_iommu *iommu; 675 int mask = 0x3; 676 677 if (!intel_iommu_superpage) { 678 return 0; 679 } 680 681 /* set iommu_superpage to the smallest common denominator */ 682 rcu_read_lock(); 683 for_each_active_iommu(iommu, drhd) { 684 if (iommu != skip) { 685 if (domain && domain_use_first_level(domain)) { 686 if 
(!cap_fl1gp_support(iommu->cap)) 687 mask = 0x1; 688 } else { 689 mask &= cap_super_page_val(iommu->cap); 690 } 691 692 if (!mask) 693 break; 694 } 695 } 696 rcu_read_unlock(); 697 698 return fls(mask); 699 } 700 701 /* Some capabilities may be different across iommus */ 702 static void domain_update_iommu_cap(struct dmar_domain *domain) 703 { 704 domain_update_iommu_coherency(domain); 705 domain->iommu_snooping = domain_update_iommu_snooping(NULL); 706 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); 707 } 708 709 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, 710 u8 devfn, int alloc) 711 { 712 struct root_entry *root = &iommu->root_entry[bus]; 713 struct context_entry *context; 714 u64 *entry; 715 716 entry = &root->lo; 717 if (sm_supported(iommu)) { 718 if (devfn >= 0x80) { 719 devfn -= 0x80; 720 entry = &root->hi; 721 } 722 devfn *= 2; 723 } 724 if (*entry & 1) 725 context = phys_to_virt(*entry & VTD_PAGE_MASK); 726 else { 727 unsigned long phy_addr; 728 if (!alloc) 729 return NULL; 730 731 context = alloc_pgtable_page(iommu->node); 732 if (!context) 733 return NULL; 734 735 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); 736 phy_addr = virt_to_phys((void *)context); 737 *entry = phy_addr | 1; 738 __iommu_flush_cache(iommu, entry, sizeof(*entry)); 739 } 740 return &context[devfn]; 741 } 742 743 static bool attach_deferred(struct device *dev) 744 { 745 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO; 746 } 747 748 /** 749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI 750 * sub-hierarchy of a candidate PCI-PCI bridge 751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy 752 * @bridge: the candidate PCI-PCI bridge 753 * 754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. 
755 */ 756 static bool 757 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) 758 { 759 struct pci_dev *pdev, *pbridge; 760 761 if (!dev_is_pci(dev) || !dev_is_pci(bridge)) 762 return false; 763 764 pdev = to_pci_dev(dev); 765 pbridge = to_pci_dev(bridge); 766 767 if (pbridge->subordinate && 768 pbridge->subordinate->number <= pdev->bus->number && 769 pbridge->subordinate->busn_res.end >= pdev->bus->number) 770 return true; 771 772 return false; 773 } 774 775 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 776 { 777 struct dmar_drhd_unit *drhd; 778 u32 vtbar; 779 int rc; 780 781 /* We know that this device on this chipset has its own IOMMU. 782 * If we find it under a different IOMMU, then the BIOS is lying 783 * to us. Hope that the IOMMU for this device is actually 784 * disabled, and it needs no translation... 785 */ 786 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 787 if (rc) { 788 /* "can't" happen */ 789 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 790 return false; 791 } 792 vtbar &= 0xffff0000; 793 794 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 795 drhd = dmar_find_matched_drhd_unit(pdev); 796 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 797 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 798 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 799 return true; 800 } 801 802 return false; 803 } 804 805 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) 806 { 807 if (!iommu || iommu->drhd->ignored) 808 return true; 809 810 if (dev_is_pci(dev)) { 811 struct pci_dev *pdev = to_pci_dev(dev); 812 813 if (pdev->vendor == PCI_VENDOR_ID_INTEL && 814 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && 815 quirk_ioat_snb_local_iommu(pdev)) 816 return true; 817 } 818 819 return false; 820 } 821 822 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) 823 { 824 struct 
dmar_drhd_unit *drhd = NULL; 825 struct pci_dev *pdev = NULL; 826 struct intel_iommu *iommu; 827 struct device *tmp; 828 u16 segment = 0; 829 int i; 830 831 if (!dev) 832 return NULL; 833 834 if (dev_is_pci(dev)) { 835 struct pci_dev *pf_pdev; 836 837 pdev = pci_real_dma_dev(to_pci_dev(dev)); 838 839 /* VFs aren't listed in scope tables; we need to look up 840 * the PF instead to find the IOMMU. */ 841 pf_pdev = pci_physfn(pdev); 842 dev = &pf_pdev->dev; 843 segment = pci_domain_nr(pdev->bus); 844 } else if (has_acpi_companion(dev)) 845 dev = &ACPI_COMPANION(dev)->dev; 846 847 rcu_read_lock(); 848 for_each_iommu(iommu, drhd) { 849 if (pdev && segment != drhd->segment) 850 continue; 851 852 for_each_active_dev_scope(drhd->devices, 853 drhd->devices_cnt, i, tmp) { 854 if (tmp == dev) { 855 /* For a VF use its original BDF# not that of the PF 856 * which we used for the IOMMU lookup. Strictly speaking 857 * we could do this for all PCI devices; we only need to 858 * get the BDF# from the scope table for ACPI matches. 
*/ 859 if (pdev && pdev->is_virtfn) 860 goto got_pdev; 861 862 if (bus && devfn) { 863 *bus = drhd->devices[i].bus; 864 *devfn = drhd->devices[i].devfn; 865 } 866 goto out; 867 } 868 869 if (is_downstream_to_pci_bridge(dev, tmp)) 870 goto got_pdev; 871 } 872 873 if (pdev && drhd->include_all) { 874 got_pdev: 875 if (bus && devfn) { 876 *bus = pdev->bus->number; 877 *devfn = pdev->devfn; 878 } 879 goto out; 880 } 881 } 882 iommu = NULL; 883 out: 884 if (iommu_is_dummy(iommu, dev)) 885 iommu = NULL; 886 887 rcu_read_unlock(); 888 889 return iommu; 890 } 891 892 static void domain_flush_cache(struct dmar_domain *domain, 893 void *addr, int size) 894 { 895 if (!domain->iommu_coherency) 896 clflush_cache_range(addr, size); 897 } 898 899 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) 900 { 901 struct context_entry *context; 902 int ret = 0; 903 unsigned long flags; 904 905 spin_lock_irqsave(&iommu->lock, flags); 906 context = iommu_context_addr(iommu, bus, devfn, 0); 907 if (context) 908 ret = context_present(context); 909 spin_unlock_irqrestore(&iommu->lock, flags); 910 return ret; 911 } 912 913 static void free_context_table(struct intel_iommu *iommu) 914 { 915 int i; 916 unsigned long flags; 917 struct context_entry *context; 918 919 spin_lock_irqsave(&iommu->lock, flags); 920 if (!iommu->root_entry) { 921 goto out; 922 } 923 for (i = 0; i < ROOT_ENTRY_NR; i++) { 924 context = iommu_context_addr(iommu, i, 0, 0); 925 if (context) 926 free_pgtable_page(context); 927 928 if (!sm_supported(iommu)) 929 continue; 930 931 context = iommu_context_addr(iommu, i, 0x80, 0); 932 if (context) 933 free_pgtable_page(context); 934 935 } 936 free_pgtable_page(iommu->root_entry); 937 iommu->root_entry = NULL; 938 out: 939 spin_unlock_irqrestore(&iommu->lock, flags); 940 } 941 942 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, 943 unsigned long pfn, int *target_level) 944 { 945 struct dma_pte *parent, *pte; 946 int level = 
agaw_to_level(domain->agaw); 947 int offset; 948 949 BUG_ON(!domain->pgd); 950 951 if (!domain_pfn_supported(domain, pfn)) 952 /* Address beyond IOMMU's addressing capabilities. */ 953 return NULL; 954 955 parent = domain->pgd; 956 957 while (1) { 958 void *tmp_page; 959 960 offset = pfn_level_offset(pfn, level); 961 pte = &parent[offset]; 962 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) 963 break; 964 if (level == *target_level) 965 break; 966 967 if (!dma_pte_present(pte)) { 968 uint64_t pteval; 969 970 tmp_page = alloc_pgtable_page(domain->nid); 971 972 if (!tmp_page) 973 return NULL; 974 975 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); 976 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; 977 if (domain_use_first_level(domain)) 978 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; 979 if (cmpxchg64(&pte->val, 0ULL, pteval)) 980 /* Someone else set it while we were thinking; use theirs. */ 981 free_pgtable_page(tmp_page); 982 else 983 domain_flush_cache(domain, pte, sizeof(*pte)); 984 } 985 if (level == 1) 986 break; 987 988 parent = phys_to_virt(dma_pte_addr(pte)); 989 level--; 990 } 991 992 if (!*target_level) 993 *target_level = level; 994 995 return pte; 996 } 997 998 /* return address's pte at specific level */ 999 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1000 unsigned long pfn, 1001 int level, int *large_page) 1002 { 1003 struct dma_pte *parent, *pte; 1004 int total = agaw_to_level(domain->agaw); 1005 int offset; 1006 1007 parent = domain->pgd; 1008 while (level <= total) { 1009 offset = pfn_level_offset(pfn, total); 1010 pte = &parent[offset]; 1011 if (level == total) 1012 return pte; 1013 1014 if (!dma_pte_present(pte)) { 1015 *large_page = total; 1016 break; 1017 } 1018 1019 if (dma_pte_superpage(pte)) { 1020 *large_page = total; 1021 return pte; 1022 } 1023 1024 parent = phys_to_virt(dma_pte_addr(pte)); 1025 total--; 1026 } 1027 return NULL; 1028 } 
1029 1030 /* clear last level pte, a tlb flush should be followed */ 1031 static void dma_pte_clear_range(struct dmar_domain *domain, 1032 unsigned long start_pfn, 1033 unsigned long last_pfn) 1034 { 1035 unsigned int large_page; 1036 struct dma_pte *first_pte, *pte; 1037 1038 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1039 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1040 BUG_ON(start_pfn > last_pfn); 1041 1042 /* we don't need lock here; nobody else touches the iova range */ 1043 do { 1044 large_page = 1; 1045 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1046 if (!pte) { 1047 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1048 continue; 1049 } 1050 do { 1051 dma_clear_pte(pte); 1052 start_pfn += lvl_to_nr_pages(large_page); 1053 pte++; 1054 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1055 1056 domain_flush_cache(domain, first_pte, 1057 (void *)pte - (void *)first_pte); 1058 1059 } while (start_pfn && start_pfn <= last_pfn); 1060 } 1061 1062 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1063 int retain_level, struct dma_pte *pte, 1064 unsigned long pfn, unsigned long start_pfn, 1065 unsigned long last_pfn) 1066 { 1067 pfn = max(start_pfn, pfn); 1068 pte = &pte[pfn_level_offset(pfn, level)]; 1069 1070 do { 1071 unsigned long level_pfn; 1072 struct dma_pte *level_pte; 1073 1074 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1075 goto next; 1076 1077 level_pfn = pfn & level_mask(level); 1078 level_pte = phys_to_virt(dma_pte_addr(pte)); 1079 1080 if (level > 2) { 1081 dma_pte_free_level(domain, level - 1, retain_level, 1082 level_pte, level_pfn, start_pfn, 1083 last_pfn); 1084 } 1085 1086 /* 1087 * Free the page table if we're below the level we want to 1088 * retain and the range covers the entire table. 
1089 */ 1090 if (level < retain_level && !(start_pfn > level_pfn || 1091 last_pfn < level_pfn + level_size(level) - 1)) { 1092 dma_clear_pte(pte); 1093 domain_flush_cache(domain, pte, sizeof(*pte)); 1094 free_pgtable_page(level_pte); 1095 } 1096 next: 1097 pfn += level_size(level); 1098 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1099 } 1100 1101 /* 1102 * clear last level (leaf) ptes and free page table pages below the 1103 * level we wish to keep intact. 1104 */ 1105 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1106 unsigned long start_pfn, 1107 unsigned long last_pfn, 1108 int retain_level) 1109 { 1110 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1111 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1112 BUG_ON(start_pfn > last_pfn); 1113 1114 dma_pte_clear_range(domain, start_pfn, last_pfn); 1115 1116 /* We don't need lock here; nobody else touches the iova range */ 1117 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1118 domain->pgd, 0, start_pfn, last_pfn); 1119 1120 /* free pgd */ 1121 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1122 free_pgtable_page(domain->pgd); 1123 domain->pgd = NULL; 1124 } 1125 } 1126 1127 /* When a page at a given level is being unlinked from its parent, we don't 1128 need to *modify* it at all. All we need to do is make a list of all the 1129 pages which can be freed just as soon as we've flushed the IOTLB and we 1130 know the hardware page-walk will no longer touch them. 1131 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1132 be freed. 
*/ 1133 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1134 int level, struct dma_pte *pte, 1135 struct page *freelist) 1136 { 1137 struct page *pg; 1138 1139 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1140 pg->freelist = freelist; 1141 freelist = pg; 1142 1143 if (level == 1) 1144 return freelist; 1145 1146 pte = page_address(pg); 1147 do { 1148 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1149 freelist = dma_pte_list_pagetables(domain, level - 1, 1150 pte, freelist); 1151 pte++; 1152 } while (!first_pte_in_page(pte)); 1153 1154 return freelist; 1155 } 1156 1157 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1158 struct dma_pte *pte, unsigned long pfn, 1159 unsigned long start_pfn, 1160 unsigned long last_pfn, 1161 struct page *freelist) 1162 { 1163 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1164 1165 pfn = max(start_pfn, pfn); 1166 pte = &pte[pfn_level_offset(pfn, level)]; 1167 1168 do { 1169 unsigned long level_pfn; 1170 1171 if (!dma_pte_present(pte)) 1172 goto next; 1173 1174 level_pfn = pfn & level_mask(level); 1175 1176 /* If range covers entire pagetable, free it */ 1177 if (start_pfn <= level_pfn && 1178 last_pfn >= level_pfn + level_size(level) - 1) { 1179 /* These suborbinate page tables are going away entirely. Don't 1180 bother to clear them; we're just going to *free* them. 
*/ 1181 if (level > 1 && !dma_pte_superpage(pte)) 1182 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1183 1184 dma_clear_pte(pte); 1185 if (!first_pte) 1186 first_pte = pte; 1187 last_pte = pte; 1188 } else if (level > 1) { 1189 /* Recurse down into a level that isn't *entirely* obsolete */ 1190 freelist = dma_pte_clear_level(domain, level - 1, 1191 phys_to_virt(dma_pte_addr(pte)), 1192 level_pfn, start_pfn, last_pfn, 1193 freelist); 1194 } 1195 next: 1196 pfn += level_size(level); 1197 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1198 1199 if (first_pte) 1200 domain_flush_cache(domain, first_pte, 1201 (void *)++last_pte - (void *)first_pte); 1202 1203 return freelist; 1204 } 1205 1206 /* We can't just free the pages because the IOMMU may still be walking 1207 the page tables, and may have cached the intermediate levels. The 1208 pages can only be freed after the IOTLB flush has been done. */ 1209 static struct page *domain_unmap(struct dmar_domain *domain, 1210 unsigned long start_pfn, 1211 unsigned long last_pfn) 1212 { 1213 struct page *freelist; 1214 1215 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1216 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1217 BUG_ON(start_pfn > last_pfn); 1218 1219 /* we don't need lock here; nobody else touches the iova range */ 1220 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1221 domain->pgd, 0, start_pfn, last_pfn, NULL); 1222 1223 /* free pgd */ 1224 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1225 struct page *pgd_page = virt_to_page(domain->pgd); 1226 pgd_page->freelist = freelist; 1227 freelist = pgd_page; 1228 1229 domain->pgd = NULL; 1230 } 1231 1232 return freelist; 1233 } 1234 1235 static void dma_free_pagelist(struct page *freelist) 1236 { 1237 struct page *pg; 1238 1239 while ((pg = freelist)) { 1240 freelist = pg->freelist; 1241 free_pgtable_page(page_address(pg)); 1242 } 1243 } 1244 1245 static void iova_entry_free(unsigned long 
data) 1246 { 1247 struct page *freelist = (struct page *)data; 1248 1249 dma_free_pagelist(freelist); 1250 } 1251 1252 /* iommu handling */ 1253 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1254 { 1255 struct root_entry *root; 1256 unsigned long flags; 1257 1258 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1259 if (!root) { 1260 pr_err("Allocating root entry for %s failed\n", 1261 iommu->name); 1262 return -ENOMEM; 1263 } 1264 1265 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1266 1267 spin_lock_irqsave(&iommu->lock, flags); 1268 iommu->root_entry = root; 1269 spin_unlock_irqrestore(&iommu->lock, flags); 1270 1271 return 0; 1272 } 1273 1274 static void iommu_set_root_entry(struct intel_iommu *iommu) 1275 { 1276 u64 addr; 1277 u32 sts; 1278 unsigned long flag; 1279 1280 addr = virt_to_phys(iommu->root_entry); 1281 if (sm_supported(iommu)) 1282 addr |= DMA_RTADDR_SMT; 1283 1284 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1285 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1286 1287 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1288 1289 /* Make sure hardware complete it */ 1290 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1291 readl, (sts & DMA_GSTS_RTPS), sts); 1292 1293 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1294 } 1295 1296 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1297 { 1298 u32 val; 1299 unsigned long flag; 1300 1301 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1302 return; 1303 1304 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1305 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1306 1307 /* Make sure hardware complete it */ 1308 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1309 readl, (!(val & DMA_GSTS_WBFS)), val); 1310 1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1312 } 1313 1314 /* return value determine if we need a write buffer flush */ 1315 static void __iommu_flush_context(struct intel_iommu *iommu, 1316 u16 did, u16 source_id, u8 
function_mask, 1317 u64 type) 1318 { 1319 u64 val = 0; 1320 unsigned long flag; 1321 1322 switch (type) { 1323 case DMA_CCMD_GLOBAL_INVL: 1324 val = DMA_CCMD_GLOBAL_INVL; 1325 break; 1326 case DMA_CCMD_DOMAIN_INVL: 1327 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1328 break; 1329 case DMA_CCMD_DEVICE_INVL: 1330 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1331 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1332 break; 1333 default: 1334 BUG(); 1335 } 1336 val |= DMA_CCMD_ICC; 1337 1338 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1339 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1340 1341 /* Make sure hardware complete it */ 1342 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1343 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1344 1345 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1346 } 1347 1348 /* return value determine if we need a write buffer flush */ 1349 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1350 u64 addr, unsigned int size_order, u64 type) 1351 { 1352 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1353 u64 val = 0, val_iva = 0; 1354 unsigned long flag; 1355 1356 switch (type) { 1357 case DMA_TLB_GLOBAL_FLUSH: 1358 /* global flush doesn't need set IVA_REG */ 1359 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1360 break; 1361 case DMA_TLB_DSI_FLUSH: 1362 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1363 break; 1364 case DMA_TLB_PSI_FLUSH: 1365 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1366 /* IH bit is passed in as part of address */ 1367 val_iva = size_order | addr; 1368 break; 1369 default: 1370 BUG(); 1371 } 1372 /* Note: set drain read/write */ 1373 #if 0 1374 /* 1375 * This is probably to be super secure.. Looks like we can 1376 * ignore it without any impact. 
1377 */ 1378 if (cap_read_drain(iommu->cap)) 1379 val |= DMA_TLB_READ_DRAIN; 1380 #endif 1381 if (cap_write_drain(iommu->cap)) 1382 val |= DMA_TLB_WRITE_DRAIN; 1383 1384 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1385 /* Note: Only uses first TLB reg currently */ 1386 if (val_iva) 1387 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1388 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1389 1390 /* Make sure hardware complete it */ 1391 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1392 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1393 1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1395 1396 /* check IOTLB invalidation granularity */ 1397 if (DMA_TLB_IAIG(val) == 0) 1398 pr_err("Flush IOTLB failed\n"); 1399 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1400 pr_debug("TLB flush request %Lx, actual %Lx\n", 1401 (unsigned long long)DMA_TLB_IIRG(type), 1402 (unsigned long long)DMA_TLB_IAIG(val)); 1403 } 1404 1405 static struct device_domain_info * 1406 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1407 u8 bus, u8 devfn) 1408 { 1409 struct device_domain_info *info; 1410 1411 assert_spin_locked(&device_domain_lock); 1412 1413 if (!iommu->qi) 1414 return NULL; 1415 1416 list_for_each_entry(info, &domain->devices, link) 1417 if (info->iommu == iommu && info->bus == bus && 1418 info->devfn == devfn) { 1419 if (info->ats_supported && info->dev) 1420 return info; 1421 break; 1422 } 1423 1424 return NULL; 1425 } 1426 1427 static void domain_update_iotlb(struct dmar_domain *domain) 1428 { 1429 struct device_domain_info *info; 1430 bool has_iotlb_device = false; 1431 1432 assert_spin_locked(&device_domain_lock); 1433 1434 list_for_each_entry(info, &domain->devices, link) { 1435 struct pci_dev *pdev; 1436 1437 if (!info->dev || !dev_is_pci(info->dev)) 1438 continue; 1439 1440 pdev = to_pci_dev(info->dev); 1441 if (pdev->ats_enabled) { 1442 has_iotlb_device = true; 1443 break; 1444 } 1445 } 1446 1447 domain->has_iotlb_device = 
has_iotlb_device; 1448 } 1449 1450 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1451 { 1452 struct pci_dev *pdev; 1453 1454 assert_spin_locked(&device_domain_lock); 1455 1456 if (!info || !dev_is_pci(info->dev)) 1457 return; 1458 1459 pdev = to_pci_dev(info->dev); 1460 /* For IOMMU that supports device IOTLB throttling (DIT), we assign 1461 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1462 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1463 * reserved, which should be set to 0. 1464 */ 1465 if (!ecap_dit(info->iommu->ecap)) 1466 info->pfsid = 0; 1467 else { 1468 struct pci_dev *pf_pdev; 1469 1470 /* pdev will be returned if device is not a vf */ 1471 pf_pdev = pci_physfn(pdev); 1472 info->pfsid = pci_dev_id(pf_pdev); 1473 } 1474 1475 #ifdef CONFIG_INTEL_IOMMU_SVM 1476 /* The PCIe spec, in its wisdom, declares that the behaviour of 1477 the device if you enable PASID support after ATS support is 1478 undefined. So always enable PASID support on devices which 1479 have it, even if we can't yet know if we're ever going to 1480 use it. */ 1481 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1482 info->pasid_enabled = 1; 1483 1484 if (info->pri_supported && 1485 (info->pasid_enabled ? 
pci_prg_resp_pasid_required(pdev) : 1) && 1486 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) 1487 info->pri_enabled = 1; 1488 #endif 1489 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1490 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1491 info->ats_enabled = 1; 1492 domain_update_iotlb(info->domain); 1493 info->ats_qdep = pci_ats_queue_depth(pdev); 1494 } 1495 } 1496 1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1498 { 1499 struct pci_dev *pdev; 1500 1501 assert_spin_locked(&device_domain_lock); 1502 1503 if (!dev_is_pci(info->dev)) 1504 return; 1505 1506 pdev = to_pci_dev(info->dev); 1507 1508 if (info->ats_enabled) { 1509 pci_disable_ats(pdev); 1510 info->ats_enabled = 0; 1511 domain_update_iotlb(info->domain); 1512 } 1513 #ifdef CONFIG_INTEL_IOMMU_SVM 1514 if (info->pri_enabled) { 1515 pci_disable_pri(pdev); 1516 info->pri_enabled = 0; 1517 } 1518 if (info->pasid_enabled) { 1519 pci_disable_pasid(pdev); 1520 info->pasid_enabled = 0; 1521 } 1522 #endif 1523 } 1524 1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1526 u64 addr, unsigned mask) 1527 { 1528 u16 sid, qdep; 1529 unsigned long flags; 1530 struct device_domain_info *info; 1531 1532 if (!domain->has_iotlb_device) 1533 return; 1534 1535 spin_lock_irqsave(&device_domain_lock, flags); 1536 list_for_each_entry(info, &domain->devices, link) { 1537 if (!info->ats_enabled) 1538 continue; 1539 1540 sid = info->bus << 8 | info->devfn; 1541 qdep = info->ats_qdep; 1542 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1543 qdep, addr, mask); 1544 } 1545 spin_unlock_irqrestore(&device_domain_lock, flags); 1546 } 1547 1548 static void domain_flush_piotlb(struct intel_iommu *iommu, 1549 struct dmar_domain *domain, 1550 u64 addr, unsigned long npages, bool ih) 1551 { 1552 u16 did = domain->iommu_did[iommu->seq_id]; 1553 1554 if (domain->default_pasid) 1555 qi_flush_piotlb(iommu, did, domain->default_pasid, 1556 addr, npages, ih); 1557 1558 if 
(!list_empty(&domain->devices)) 1559 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1560 } 1561 1562 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1563 struct dmar_domain *domain, 1564 unsigned long pfn, unsigned int pages, 1565 int ih, int map) 1566 { 1567 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1568 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1569 u16 did = domain->iommu_did[iommu->seq_id]; 1570 1571 BUG_ON(pages == 0); 1572 1573 if (ih) 1574 ih = 1 << 6; 1575 1576 if (domain_use_first_level(domain)) { 1577 domain_flush_piotlb(iommu, domain, addr, pages, ih); 1578 } else { 1579 /* 1580 * Fallback to domain selective flush if no PSI support or 1581 * the size is too big. PSI requires page size to be 2 ^ x, 1582 * and the base address is naturally aligned to the size. 1583 */ 1584 if (!cap_pgsel_inv(iommu->cap) || 1585 mask > cap_max_amask_val(iommu->cap)) 1586 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1587 DMA_TLB_DSI_FLUSH); 1588 else 1589 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1590 DMA_TLB_PSI_FLUSH); 1591 } 1592 1593 /* 1594 * In caching mode, changes of pages from non-present to present require 1595 * flush. However, device IOTLB doesn't need to be flushed in this case. 1596 */ 1597 if (!cap_caching_mode(iommu->cap) || !map) 1598 iommu_flush_dev_iotlb(domain, addr, mask); 1599 } 1600 1601 /* Notification for newly created mappings */ 1602 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1603 struct dmar_domain *domain, 1604 unsigned long pfn, unsigned int pages) 1605 { 1606 /* 1607 * It's a non-present to present mapping. Only flush if caching mode 1608 * and second level. 
1609 */ 1610 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1611 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1612 else 1613 iommu_flush_write_buffer(iommu); 1614 } 1615 1616 static void iommu_flush_iova(struct iova_domain *iovad) 1617 { 1618 struct dmar_domain *domain; 1619 int idx; 1620 1621 domain = container_of(iovad, struct dmar_domain, iovad); 1622 1623 for_each_domain_iommu(idx, domain) { 1624 struct intel_iommu *iommu = g_iommus[idx]; 1625 u16 did = domain->iommu_did[iommu->seq_id]; 1626 1627 if (domain_use_first_level(domain)) 1628 domain_flush_piotlb(iommu, domain, 0, -1, 0); 1629 else 1630 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1631 DMA_TLB_DSI_FLUSH); 1632 1633 if (!cap_caching_mode(iommu->cap)) 1634 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1635 0, MAX_AGAW_PFN_WIDTH); 1636 } 1637 } 1638 1639 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1640 { 1641 u32 pmen; 1642 unsigned long flags; 1643 1644 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1645 return; 1646 1647 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1648 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1649 pmen &= ~DMA_PMEN_EPM; 1650 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1651 1652 /* wait for the protected region status bit to clear */ 1653 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1654 readl, !(pmen & DMA_PMEN_PRS), pmen); 1655 1656 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1657 } 1658 1659 static void iommu_enable_translation(struct intel_iommu *iommu) 1660 { 1661 u32 sts; 1662 unsigned long flags; 1663 1664 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1665 iommu->gcmd |= DMA_GCMD_TE; 1666 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1667 1668 /* Make sure hardware complete it */ 1669 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1670 readl, (sts & DMA_GSTS_TES), sts); 1671 1672 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1673 } 1674 1675 static void iommu_disable_translation(struct 
intel_iommu *iommu) 1676 { 1677 u32 sts; 1678 unsigned long flag; 1679 1680 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1681 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1682 return; 1683 1684 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1685 iommu->gcmd &= ~DMA_GCMD_TE; 1686 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1687 1688 /* Make sure hardware complete it */ 1689 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1690 readl, (!(sts & DMA_GSTS_TES)), sts); 1691 1692 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1693 } 1694 1695 static int iommu_init_domains(struct intel_iommu *iommu) 1696 { 1697 u32 ndomains, nlongs; 1698 size_t size; 1699 1700 ndomains = cap_ndoms(iommu->cap); 1701 pr_debug("%s: Number of Domains supported <%d>\n", 1702 iommu->name, ndomains); 1703 nlongs = BITS_TO_LONGS(ndomains); 1704 1705 spin_lock_init(&iommu->lock); 1706 1707 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1708 if (!iommu->domain_ids) { 1709 pr_err("%s: Allocating domain id array failed\n", 1710 iommu->name); 1711 return -ENOMEM; 1712 } 1713 1714 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1715 iommu->domains = kzalloc(size, GFP_KERNEL); 1716 1717 if (iommu->domains) { 1718 size = 256 * sizeof(struct dmar_domain *); 1719 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1720 } 1721 1722 if (!iommu->domains || !iommu->domains[0]) { 1723 pr_err("%s: Allocating domain array failed\n", 1724 iommu->name); 1725 kfree(iommu->domain_ids); 1726 kfree(iommu->domains); 1727 iommu->domain_ids = NULL; 1728 iommu->domains = NULL; 1729 return -ENOMEM; 1730 } 1731 1732 /* 1733 * If Caching mode is set, then invalid translations are tagged 1734 * with domain-id 0, hence we need to pre-allocate it. We also 1735 * use domain-id 0 as a marker for non-allocated domain-id, so 1736 * make sure it is not used for a real domain. 
1737 */ 1738 set_bit(0, iommu->domain_ids); 1739 1740 /* 1741 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1742 * entry for first-level or pass-through translation modes should 1743 * be programmed with a domain id different from those used for 1744 * second-level or nested translation. We reserve a domain id for 1745 * this purpose. 1746 */ 1747 if (sm_supported(iommu)) 1748 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1749 1750 return 0; 1751 } 1752 1753 static void disable_dmar_iommu(struct intel_iommu *iommu) 1754 { 1755 struct device_domain_info *info, *tmp; 1756 unsigned long flags; 1757 1758 if (!iommu->domains || !iommu->domain_ids) 1759 return; 1760 1761 spin_lock_irqsave(&device_domain_lock, flags); 1762 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1763 if (info->iommu != iommu) 1764 continue; 1765 1766 if (!info->dev || !info->domain) 1767 continue; 1768 1769 __dmar_remove_one_dev_info(info); 1770 } 1771 spin_unlock_irqrestore(&device_domain_lock, flags); 1772 1773 if (iommu->gcmd & DMA_GCMD_TE) 1774 iommu_disable_translation(iommu); 1775 } 1776 1777 static void free_dmar_iommu(struct intel_iommu *iommu) 1778 { 1779 if ((iommu->domains) && (iommu->domain_ids)) { 1780 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1781 int i; 1782 1783 for (i = 0; i < elems; i++) 1784 kfree(iommu->domains[i]); 1785 kfree(iommu->domains); 1786 kfree(iommu->domain_ids); 1787 iommu->domains = NULL; 1788 iommu->domain_ids = NULL; 1789 } 1790 1791 g_iommus[iommu->seq_id] = NULL; 1792 1793 /* free context mapping */ 1794 free_context_table(iommu); 1795 1796 #ifdef CONFIG_INTEL_IOMMU_SVM 1797 if (pasid_supported(iommu)) { 1798 if (ecap_prs(iommu->ecap)) 1799 intel_svm_finish_prq(iommu); 1800 } 1801 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) 1802 ioasid_unregister_allocator(&iommu->pasid_allocator); 1803 1804 #endif 1805 } 1806 1807 /* 1808 * Check and return whether first level is used by default for 1809 * DMA 
translation. 1810 */ 1811 static bool first_level_by_default(void) 1812 { 1813 struct dmar_drhd_unit *drhd; 1814 struct intel_iommu *iommu; 1815 static int first_level_support = -1; 1816 1817 if (likely(first_level_support != -1)) 1818 return first_level_support; 1819 1820 first_level_support = 1; 1821 1822 rcu_read_lock(); 1823 for_each_active_iommu(iommu, drhd) { 1824 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { 1825 first_level_support = 0; 1826 break; 1827 } 1828 } 1829 rcu_read_unlock(); 1830 1831 return first_level_support; 1832 } 1833 1834 static struct dmar_domain *alloc_domain(int flags) 1835 { 1836 struct dmar_domain *domain; 1837 1838 domain = alloc_domain_mem(); 1839 if (!domain) 1840 return NULL; 1841 1842 memset(domain, 0, sizeof(*domain)); 1843 domain->nid = NUMA_NO_NODE; 1844 domain->flags = flags; 1845 if (first_level_by_default()) 1846 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1847 domain->has_iotlb_device = false; 1848 INIT_LIST_HEAD(&domain->devices); 1849 1850 return domain; 1851 } 1852 1853 /* Must be called with iommu->lock */ 1854 static int domain_attach_iommu(struct dmar_domain *domain, 1855 struct intel_iommu *iommu) 1856 { 1857 unsigned long ndomains; 1858 int num; 1859 1860 assert_spin_locked(&device_domain_lock); 1861 assert_spin_locked(&iommu->lock); 1862 1863 domain->iommu_refcnt[iommu->seq_id] += 1; 1864 domain->iommu_count += 1; 1865 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1866 ndomains = cap_ndoms(iommu->cap); 1867 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1868 1869 if (num >= ndomains) { 1870 pr_err("%s: No free domain ids\n", iommu->name); 1871 domain->iommu_refcnt[iommu->seq_id] -= 1; 1872 domain->iommu_count -= 1; 1873 return -ENOSPC; 1874 } 1875 1876 set_bit(num, iommu->domain_ids); 1877 set_iommu_domain(iommu, num, domain); 1878 1879 domain->iommu_did[iommu->seq_id] = num; 1880 domain->nid = iommu->node; 1881 1882 domain_update_iommu_cap(domain); 1883 } 1884 1885 return 0; 1886 } 1887 1888 
static int domain_detach_iommu(struct dmar_domain *domain, 1889 struct intel_iommu *iommu) 1890 { 1891 int num, count; 1892 1893 assert_spin_locked(&device_domain_lock); 1894 assert_spin_locked(&iommu->lock); 1895 1896 domain->iommu_refcnt[iommu->seq_id] -= 1; 1897 count = --domain->iommu_count; 1898 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1899 num = domain->iommu_did[iommu->seq_id]; 1900 clear_bit(num, iommu->domain_ids); 1901 set_iommu_domain(iommu, num, NULL); 1902 1903 domain_update_iommu_cap(domain); 1904 domain->iommu_did[iommu->seq_id] = 0; 1905 } 1906 1907 return count; 1908 } 1909 1910 static struct iova_domain reserved_iova_list; 1911 static struct lock_class_key reserved_rbtree_key; 1912 1913 static int dmar_init_reserved_ranges(void) 1914 { 1915 struct pci_dev *pdev = NULL; 1916 struct iova *iova; 1917 int i; 1918 1919 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN); 1920 1921 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, 1922 &reserved_rbtree_key); 1923 1924 /* IOAPIC ranges shouldn't be accessed by DMA */ 1925 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), 1926 IOVA_PFN(IOAPIC_RANGE_END)); 1927 if (!iova) { 1928 pr_err("Reserve IOAPIC range failed\n"); 1929 return -ENODEV; 1930 } 1931 1932 /* Reserve all PCI MMIO to avoid peer-to-peer access */ 1933 for_each_pci_dev(pdev) { 1934 struct resource *r; 1935 1936 for (i = 0; i < PCI_NUM_RESOURCES; i++) { 1937 r = &pdev->resource[i]; 1938 if (!r->flags || !(r->flags & IORESOURCE_MEM)) 1939 continue; 1940 iova = reserve_iova(&reserved_iova_list, 1941 IOVA_PFN(r->start), 1942 IOVA_PFN(r->end)); 1943 if (!iova) { 1944 pci_err(pdev, "Reserve iova for %pR failed\n", r); 1945 return -ENODEV; 1946 } 1947 } 1948 } 1949 return 0; 1950 } 1951 1952 static inline int guestwidth_to_adjustwidth(int gaw) 1953 { 1954 int agaw; 1955 int r = (gaw - 12) % 9; 1956 1957 if (r == 0) 1958 agaw = gaw; 1959 else 1960 agaw = gaw + 9 - r; 1961 if (agaw > 64) 1962 agaw = 
64; 1963 return agaw; 1964 } 1965 1966 static void domain_exit(struct dmar_domain *domain) 1967 { 1968 1969 /* Remove associated devices and clear attached or cached domains */ 1970 domain_remove_dev_info(domain); 1971 1972 /* destroy iovas */ 1973 if (domain->domain.type == IOMMU_DOMAIN_DMA) 1974 put_iova_domain(&domain->iovad); 1975 1976 if (domain->pgd) { 1977 struct page *freelist; 1978 1979 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); 1980 dma_free_pagelist(freelist); 1981 } 1982 1983 free_domain_mem(domain); 1984 } 1985 1986 /* 1987 * Get the PASID directory size for scalable mode context entry. 1988 * Value of X in the PDTS field of a scalable mode context entry 1989 * indicates PASID directory with 2^(X + 7) entries. 1990 */ 1991 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 1992 { 1993 int pds, max_pde; 1994 1995 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 1996 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 1997 if (pds < 7) 1998 return 0; 1999 2000 return pds - 7; 2001 } 2002 2003 /* 2004 * Set the RID_PASID field of a scalable mode context entry. The 2005 * IOMMU hardware will use the PASID value set in this field for 2006 * DMA translations of DMA requests without PASID. 2007 */ 2008 static inline void 2009 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2010 { 2011 context->hi |= pasid & ((1 << 20) - 1); 2012 } 2013 2014 /* 2015 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2016 * entry. 2017 */ 2018 static inline void context_set_sm_dte(struct context_entry *context) 2019 { 2020 context->lo |= (1 << 2); 2021 } 2022 2023 /* 2024 * Set the PRE(Page Request Enable) field of a scalable mode context 2025 * entry. 2026 */ 2027 static inline void context_set_sm_pre(struct context_entry *context) 2028 { 2029 context->lo |= (1 << 4); 2030 } 2031 2032 /* Convert value to context PASID directory size field coding. 
*/ 2033 #define context_pdts(pds) (((pds) & 0x7) << 9) 2034 2035 static int domain_context_mapping_one(struct dmar_domain *domain, 2036 struct intel_iommu *iommu, 2037 struct pasid_table *table, 2038 u8 bus, u8 devfn) 2039 { 2040 u16 did = domain->iommu_did[iommu->seq_id]; 2041 int translation = CONTEXT_TT_MULTI_LEVEL; 2042 struct device_domain_info *info = NULL; 2043 struct context_entry *context; 2044 unsigned long flags; 2045 int ret; 2046 2047 WARN_ON(did == 0); 2048 2049 if (hw_pass_through && domain_type_is_si(domain)) 2050 translation = CONTEXT_TT_PASS_THROUGH; 2051 2052 pr_debug("Set context mapping for %02x:%02x.%d\n", 2053 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2054 2055 BUG_ON(!domain->pgd); 2056 2057 spin_lock_irqsave(&device_domain_lock, flags); 2058 spin_lock(&iommu->lock); 2059 2060 ret = -ENOMEM; 2061 context = iommu_context_addr(iommu, bus, devfn, 1); 2062 if (!context) 2063 goto out_unlock; 2064 2065 ret = 0; 2066 if (context_present(context)) 2067 goto out_unlock; 2068 2069 /* 2070 * For kdump cases, old valid entries may be cached due to the 2071 * in-flight DMA and copied pgtable, but there is no unmapping 2072 * behaviour for them, thus we need an explicit cache flush for 2073 * the newly-mapped device. For kdump, at this point, the device 2074 * is supposed to finish reset at its driver probe stage, so no 2075 * in-flight DMA will exist, and we don't need to worry anymore 2076 * hereafter. 
2077 */ 2078 if (context_copied(context)) { 2079 u16 did_old = context_domain_id(context); 2080 2081 if (did_old < cap_ndoms(iommu->cap)) { 2082 iommu->flush.flush_context(iommu, did_old, 2083 (((u16)bus) << 8) | devfn, 2084 DMA_CCMD_MASK_NOBIT, 2085 DMA_CCMD_DEVICE_INVL); 2086 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2087 DMA_TLB_DSI_FLUSH); 2088 } 2089 } 2090 2091 context_clear_entry(context); 2092 2093 if (sm_supported(iommu)) { 2094 unsigned long pds; 2095 2096 WARN_ON(!table); 2097 2098 /* Setup the PASID DIR pointer: */ 2099 pds = context_get_sm_pds(table); 2100 context->lo = (u64)virt_to_phys(table->table) | 2101 context_pdts(pds); 2102 2103 /* Setup the RID_PASID field: */ 2104 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2105 2106 /* 2107 * Setup the Device-TLB enable bit and Page request 2108 * Enable bit: 2109 */ 2110 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2111 if (info && info->ats_supported) 2112 context_set_sm_dte(context); 2113 if (info && info->pri_supported) 2114 context_set_sm_pre(context); 2115 } else { 2116 struct dma_pte *pgd = domain->pgd; 2117 int agaw; 2118 2119 context_set_domain_id(context, did); 2120 2121 if (translation != CONTEXT_TT_PASS_THROUGH) { 2122 /* 2123 * Skip top levels of page tables for iommu which has 2124 * less agaw than default. Unnecessary for PT mode. 
2125 */ 2126 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2127 ret = -ENOMEM; 2128 pgd = phys_to_virt(dma_pte_addr(pgd)); 2129 if (!dma_pte_present(pgd)) 2130 goto out_unlock; 2131 } 2132 2133 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2134 if (info && info->ats_supported) 2135 translation = CONTEXT_TT_DEV_IOTLB; 2136 else 2137 translation = CONTEXT_TT_MULTI_LEVEL; 2138 2139 context_set_address_root(context, virt_to_phys(pgd)); 2140 context_set_address_width(context, agaw); 2141 } else { 2142 /* 2143 * In pass through mode, AW must be programmed to 2144 * indicate the largest AGAW value supported by 2145 * hardware. And ASR is ignored by hardware. 2146 */ 2147 context_set_address_width(context, iommu->msagaw); 2148 } 2149 2150 context_set_translation_type(context, translation); 2151 } 2152 2153 context_set_fault_enable(context); 2154 context_set_present(context); 2155 if (!ecap_coherent(iommu->ecap)) 2156 clflush_cache_range(context, sizeof(*context)); 2157 2158 /* 2159 * It's a non-present to present mapping. If hardware doesn't cache 2160 * non-present entry we only need to flush the write-buffer. 
If the 2161 * _does_ cache non-present entries, then it does so in the special 2162 * domain #0, which we have to flush: 2163 */ 2164 if (cap_caching_mode(iommu->cap)) { 2165 iommu->flush.flush_context(iommu, 0, 2166 (((u16)bus) << 8) | devfn, 2167 DMA_CCMD_MASK_NOBIT, 2168 DMA_CCMD_DEVICE_INVL); 2169 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2170 } else { 2171 iommu_flush_write_buffer(iommu); 2172 } 2173 iommu_enable_dev_iotlb(info); 2174 2175 ret = 0; 2176 2177 out_unlock: 2178 spin_unlock(&iommu->lock); 2179 spin_unlock_irqrestore(&device_domain_lock, flags); 2180 2181 return ret; 2182 } 2183 2184 struct domain_context_mapping_data { 2185 struct dmar_domain *domain; 2186 struct intel_iommu *iommu; 2187 struct pasid_table *table; 2188 }; 2189 2190 static int domain_context_mapping_cb(struct pci_dev *pdev, 2191 u16 alias, void *opaque) 2192 { 2193 struct domain_context_mapping_data *data = opaque; 2194 2195 return domain_context_mapping_one(data->domain, data->iommu, 2196 data->table, PCI_BUS_NUM(alias), 2197 alias & 0xff); 2198 } 2199 2200 static int 2201 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2202 { 2203 struct domain_context_mapping_data data; 2204 struct pasid_table *table; 2205 struct intel_iommu *iommu; 2206 u8 bus, devfn; 2207 2208 iommu = device_to_iommu(dev, &bus, &devfn); 2209 if (!iommu) 2210 return -ENODEV; 2211 2212 table = intel_pasid_get_table(dev); 2213 2214 if (!dev_is_pci(dev)) 2215 return domain_context_mapping_one(domain, iommu, table, 2216 bus, devfn); 2217 2218 data.domain = domain; 2219 data.iommu = iommu; 2220 data.table = table; 2221 2222 return pci_for_each_dma_alias(to_pci_dev(dev), 2223 &domain_context_mapping_cb, &data); 2224 } 2225 2226 static int domain_context_mapped_cb(struct pci_dev *pdev, 2227 u16 alias, void *opaque) 2228 { 2229 struct intel_iommu *iommu = opaque; 2230 2231 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2232 } 2233 2234 static int 
domain_context_mapped(struct device *dev) 2235 { 2236 struct intel_iommu *iommu; 2237 u8 bus, devfn; 2238 2239 iommu = device_to_iommu(dev, &bus, &devfn); 2240 if (!iommu) 2241 return -ENODEV; 2242 2243 if (!dev_is_pci(dev)) 2244 return device_context_mapped(iommu, bus, devfn); 2245 2246 return !pci_for_each_dma_alias(to_pci_dev(dev), 2247 domain_context_mapped_cb, iommu); 2248 } 2249 2250 /* Returns a number of VTD pages, but aligned to MM page size */ 2251 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2252 size_t size) 2253 { 2254 host_addr &= ~PAGE_MASK; 2255 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2256 } 2257 2258 /* Return largest possible superpage level for a given mapping */ 2259 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2260 unsigned long iov_pfn, 2261 unsigned long phy_pfn, 2262 unsigned long pages) 2263 { 2264 int support, level = 1; 2265 unsigned long pfnmerge; 2266 2267 support = domain->iommu_superpage; 2268 2269 /* To use a large page, the virtual *and* physical addresses 2270 must be aligned to 2MiB/1GiB/etc. Lower bits set in either 2271 of them will mean we have to use smaller pages. So just 2272 merge them and check both at once. 
*/ 2273 pfnmerge = iov_pfn | phy_pfn; 2274 2275 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2276 pages >>= VTD_STRIDE_SHIFT; 2277 if (!pages) 2278 break; 2279 pfnmerge >>= VTD_STRIDE_SHIFT; 2280 level++; 2281 support--; 2282 } 2283 return level; 2284 } 2285 2286 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2287 struct scatterlist *sg, unsigned long phys_pfn, 2288 unsigned long nr_pages, int prot) 2289 { 2290 struct dma_pte *first_pte = NULL, *pte = NULL; 2291 phys_addr_t pteval; 2292 unsigned long sg_res = 0; 2293 unsigned int largepage_lvl = 0; 2294 unsigned long lvl_pages = 0; 2295 u64 attr; 2296 2297 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2298 2299 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2300 return -EINVAL; 2301 2302 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2303 if (domain_use_first_level(domain)) 2304 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2305 2306 if (!sg) { 2307 sg_res = nr_pages; 2308 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2309 } 2310 2311 while (nr_pages > 0) { 2312 uint64_t tmp; 2313 2314 if (!sg_res) { 2315 unsigned int pgoff = sg->offset & ~PAGE_MASK; 2316 2317 sg_res = aligned_nrpages(sg->offset, sg->length); 2318 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; 2319 sg->dma_length = sg->length; 2320 pteval = (sg_phys(sg) - pgoff) | attr; 2321 phys_pfn = pteval >> VTD_PAGE_SHIFT; 2322 } 2323 2324 if (!pte) { 2325 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); 2326 2327 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2328 if (!pte) 2329 return -ENOMEM; 2330 /* It is large page*/ 2331 if (largepage_lvl > 1) { 2332 unsigned long nr_superpages, end_pfn; 2333 2334 pteval |= DMA_PTE_LARGE_PAGE; 2335 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2336 2337 nr_superpages = sg_res / lvl_pages; 2338 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2339 2340 /* 2341 
* Ensure that old small page tables are 2342 * removed to make room for superpage(s). 2343 * We're adding new large pages, so make sure 2344 * we don't remove their parent tables. 2345 */ 2346 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2347 largepage_lvl + 1); 2348 } else { 2349 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2350 } 2351 2352 } 2353 /* We don't need lock here, nobody else 2354 * touches the iova range 2355 */ 2356 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2357 if (tmp) { 2358 static int dumps = 5; 2359 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2360 iov_pfn, tmp, (unsigned long long)pteval); 2361 if (dumps) { 2362 dumps--; 2363 debug_dma_dump_mappings(NULL); 2364 } 2365 WARN_ON(1); 2366 } 2367 2368 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2369 2370 BUG_ON(nr_pages < lvl_pages); 2371 BUG_ON(sg_res < lvl_pages); 2372 2373 nr_pages -= lvl_pages; 2374 iov_pfn += lvl_pages; 2375 phys_pfn += lvl_pages; 2376 pteval += lvl_pages * VTD_PAGE_SIZE; 2377 sg_res -= lvl_pages; 2378 2379 /* If the next PTE would be the first in a new page, then we 2380 need to flush the cache on the entries we've just written. 2381 And then we'll need to recalculate 'pte', so clear it and 2382 let it get set again in the if (!pte) block above. 2383 2384 If we're done (!nr_pages) we need to flush the cache too. 2385 2386 Also if we've been setting superpages, we may need to 2387 recalculate 'pte' and switch back to smaller pages for the 2388 end of the mapping, if the trailing size is not enough to 2389 use another superpage (i.e. sg_res < lvl_pages). 
*/ 2390 pte++; 2391 if (!nr_pages || first_pte_in_page(pte) || 2392 (largepage_lvl > 1 && sg_res < lvl_pages)) { 2393 domain_flush_cache(domain, first_pte, 2394 (void *)pte - (void *)first_pte); 2395 pte = NULL; 2396 } 2397 2398 if (!sg_res && nr_pages) 2399 sg = sg_next(sg); 2400 } 2401 return 0; 2402 } 2403 2404 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2405 struct scatterlist *sg, unsigned long phys_pfn, 2406 unsigned long nr_pages, int prot) 2407 { 2408 int iommu_id, ret; 2409 struct intel_iommu *iommu; 2410 2411 /* Do the real mapping first */ 2412 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); 2413 if (ret) 2414 return ret; 2415 2416 for_each_domain_iommu(iommu_id, domain) { 2417 iommu = g_iommus[iommu_id]; 2418 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2419 } 2420 2421 return 0; 2422 } 2423 2424 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2425 struct scatterlist *sg, unsigned long nr_pages, 2426 int prot) 2427 { 2428 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); 2429 } 2430 2431 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2432 unsigned long phys_pfn, unsigned long nr_pages, 2433 int prot) 2434 { 2435 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); 2436 } 2437 2438 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2439 { 2440 unsigned long flags; 2441 struct context_entry *context; 2442 u16 did_old; 2443 2444 if (!iommu) 2445 return; 2446 2447 spin_lock_irqsave(&iommu->lock, flags); 2448 context = iommu_context_addr(iommu, bus, devfn, 0); 2449 if (!context) { 2450 spin_unlock_irqrestore(&iommu->lock, flags); 2451 return; 2452 } 2453 did_old = context_domain_id(context); 2454 context_clear_entry(context); 2455 __iommu_flush_cache(iommu, context, sizeof(*context)); 2456 spin_unlock_irqrestore(&iommu->lock, flags); 2457 
iommu->flush.flush_context(iommu, 2458 did_old, 2459 (((u16)bus) << 8) | devfn, 2460 DMA_CCMD_MASK_NOBIT, 2461 DMA_CCMD_DEVICE_INVL); 2462 iommu->flush.flush_iotlb(iommu, 2463 did_old, 2464 0, 2465 0, 2466 DMA_TLB_DSI_FLUSH); 2467 } 2468 2469 static inline void unlink_domain_info(struct device_domain_info *info) 2470 { 2471 assert_spin_locked(&device_domain_lock); 2472 list_del(&info->link); 2473 list_del(&info->global); 2474 if (info->dev) 2475 dev_iommu_priv_set(info->dev, NULL); 2476 } 2477 2478 static void domain_remove_dev_info(struct dmar_domain *domain) 2479 { 2480 struct device_domain_info *info, *tmp; 2481 unsigned long flags; 2482 2483 spin_lock_irqsave(&device_domain_lock, flags); 2484 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2485 __dmar_remove_one_dev_info(info); 2486 spin_unlock_irqrestore(&device_domain_lock, flags); 2487 } 2488 2489 struct dmar_domain *find_domain(struct device *dev) 2490 { 2491 struct device_domain_info *info; 2492 2493 if (unlikely(attach_deferred(dev))) 2494 return NULL; 2495 2496 /* No lock here, assumes no domain exit in normal case */ 2497 info = get_domain_info(dev); 2498 if (likely(info)) 2499 return info->domain; 2500 2501 return NULL; 2502 } 2503 2504 static void do_deferred_attach(struct device *dev) 2505 { 2506 struct iommu_domain *domain; 2507 2508 dev_iommu_priv_set(dev, NULL); 2509 domain = iommu_get_domain_for_dev(dev); 2510 if (domain) 2511 intel_iommu_attach_device(domain, dev); 2512 } 2513 2514 static inline struct device_domain_info * 2515 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2516 { 2517 struct device_domain_info *info; 2518 2519 list_for_each_entry(info, &device_domain_list, global) 2520 if (info->segment == segment && info->bus == bus && 2521 info->devfn == devfn) 2522 return info; 2523 2524 return NULL; 2525 } 2526 2527 static int domain_setup_first_level(struct intel_iommu *iommu, 2528 struct dmar_domain *domain, 2529 struct device *dev, 2530 int pasid) 2531 { 2532 
int flags = PASID_FLAG_SUPERVISOR_MODE; 2533 struct dma_pte *pgd = domain->pgd; 2534 int agaw, level; 2535 2536 /* 2537 * Skip top levels of page tables for iommu which has 2538 * less agaw than default. Unnecessary for PT mode. 2539 */ 2540 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2541 pgd = phys_to_virt(dma_pte_addr(pgd)); 2542 if (!dma_pte_present(pgd)) 2543 return -ENOMEM; 2544 } 2545 2546 level = agaw_to_level(agaw); 2547 if (level != 4 && level != 5) 2548 return -EINVAL; 2549 2550 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; 2551 2552 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2553 domain->iommu_did[iommu->seq_id], 2554 flags); 2555 } 2556 2557 static bool dev_is_real_dma_subdevice(struct device *dev) 2558 { 2559 return dev && dev_is_pci(dev) && 2560 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2561 } 2562 2563 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2564 int bus, int devfn, 2565 struct device *dev, 2566 struct dmar_domain *domain) 2567 { 2568 struct dmar_domain *found = NULL; 2569 struct device_domain_info *info; 2570 unsigned long flags; 2571 int ret; 2572 2573 info = alloc_devinfo_mem(); 2574 if (!info) 2575 return NULL; 2576 2577 if (!dev_is_real_dma_subdevice(dev)) { 2578 info->bus = bus; 2579 info->devfn = devfn; 2580 info->segment = iommu->segment; 2581 } else { 2582 struct pci_dev *pdev = to_pci_dev(dev); 2583 2584 info->bus = pdev->bus->number; 2585 info->devfn = pdev->devfn; 2586 info->segment = pci_domain_nr(pdev->bus); 2587 } 2588 2589 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2590 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2591 info->ats_qdep = 0; 2592 info->dev = dev; 2593 info->domain = domain; 2594 info->iommu = iommu; 2595 info->pasid_table = NULL; 2596 info->auxd_enabled = 0; 2597 INIT_LIST_HEAD(&info->auxiliary_domains); 2598 2599 if (dev && dev_is_pci(dev)) { 2600 struct pci_dev *pdev = 
to_pci_dev(info->dev); 2601 2602 if (ecap_dev_iotlb_support(iommu->ecap) && 2603 pci_ats_supported(pdev) && 2604 dmar_find_matched_atsr_unit(pdev)) 2605 info->ats_supported = 1; 2606 2607 if (sm_supported(iommu)) { 2608 if (pasid_supported(iommu)) { 2609 int features = pci_pasid_features(pdev); 2610 if (features >= 0) 2611 info->pasid_supported = features | 1; 2612 } 2613 2614 if (info->ats_supported && ecap_prs(iommu->ecap) && 2615 pci_pri_supported(pdev)) 2616 info->pri_supported = 1; 2617 } 2618 } 2619 2620 spin_lock_irqsave(&device_domain_lock, flags); 2621 if (dev) 2622 found = find_domain(dev); 2623 2624 if (!found) { 2625 struct device_domain_info *info2; 2626 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2627 info->devfn); 2628 if (info2) { 2629 found = info2->domain; 2630 info2->dev = dev; 2631 } 2632 } 2633 2634 if (found) { 2635 spin_unlock_irqrestore(&device_domain_lock, flags); 2636 free_devinfo_mem(info); 2637 /* Caller must free the original domain */ 2638 return found; 2639 } 2640 2641 spin_lock(&iommu->lock); 2642 ret = domain_attach_iommu(domain, iommu); 2643 spin_unlock(&iommu->lock); 2644 2645 if (ret) { 2646 spin_unlock_irqrestore(&device_domain_lock, flags); 2647 free_devinfo_mem(info); 2648 return NULL; 2649 } 2650 2651 list_add(&info->link, &domain->devices); 2652 list_add(&info->global, &device_domain_list); 2653 if (dev) 2654 dev_iommu_priv_set(dev, info); 2655 spin_unlock_irqrestore(&device_domain_lock, flags); 2656 2657 /* PASID table is mandatory for a PCI device in scalable mode. 
*/ 2658 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2659 ret = intel_pasid_alloc_table(dev); 2660 if (ret) { 2661 dev_err(dev, "PASID table allocation failed\n"); 2662 dmar_remove_one_dev_info(dev); 2663 return NULL; 2664 } 2665 2666 /* Setup the PASID entry for requests without PASID: */ 2667 spin_lock(&iommu->lock); 2668 if (hw_pass_through && domain_type_is_si(domain)) 2669 ret = intel_pasid_setup_pass_through(iommu, domain, 2670 dev, PASID_RID2PASID); 2671 else if (domain_use_first_level(domain)) 2672 ret = domain_setup_first_level(iommu, domain, dev, 2673 PASID_RID2PASID); 2674 else 2675 ret = intel_pasid_setup_second_level(iommu, domain, 2676 dev, PASID_RID2PASID); 2677 spin_unlock(&iommu->lock); 2678 if (ret) { 2679 dev_err(dev, "Setup RID2PASID failed\n"); 2680 dmar_remove_one_dev_info(dev); 2681 return NULL; 2682 } 2683 } 2684 2685 if (dev && domain_context_mapping(domain, dev)) { 2686 dev_err(dev, "Domain context map failed\n"); 2687 dmar_remove_one_dev_info(dev); 2688 return NULL; 2689 } 2690 2691 return domain; 2692 } 2693 2694 static int iommu_domain_identity_map(struct dmar_domain *domain, 2695 unsigned long first_vpfn, 2696 unsigned long last_vpfn) 2697 { 2698 /* 2699 * RMRR range might have overlap with physical memory range, 2700 * clear it first 2701 */ 2702 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2703 2704 return __domain_mapping(domain, first_vpfn, NULL, 2705 first_vpfn, last_vpfn - first_vpfn + 1, 2706 DMA_PTE_READ|DMA_PTE_WRITE); 2707 } 2708 2709 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2710 2711 static int __init si_domain_init(int hw) 2712 { 2713 struct dmar_rmrr_unit *rmrr; 2714 struct device *dev; 2715 int i, nid, ret; 2716 2717 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2718 if (!si_domain) 2719 return -EFAULT; 2720 2721 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2722 domain_exit(si_domain); 2723 return -EFAULT; 2724 } 2725 2726 if (hw) 2727 return 0; 2728 
2729 for_each_online_node(nid) { 2730 unsigned long start_pfn, end_pfn; 2731 int i; 2732 2733 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2734 ret = iommu_domain_identity_map(si_domain, 2735 mm_to_dma_pfn(start_pfn), 2736 mm_to_dma_pfn(end_pfn)); 2737 if (ret) 2738 return ret; 2739 } 2740 } 2741 2742 /* 2743 * Identity map the RMRRs so that devices with RMRRs could also use 2744 * the si_domain. 2745 */ 2746 for_each_rmrr_units(rmrr) { 2747 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2748 i, dev) { 2749 unsigned long long start = rmrr->base_address; 2750 unsigned long long end = rmrr->end_address; 2751 2752 if (WARN_ON(end < start || 2753 end >> agaw_to_width(si_domain->agaw))) 2754 continue; 2755 2756 ret = iommu_domain_identity_map(si_domain, 2757 mm_to_dma_pfn(start >> PAGE_SHIFT), 2758 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2759 if (ret) 2760 return ret; 2761 } 2762 } 2763 2764 return 0; 2765 } 2766 2767 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2768 { 2769 struct dmar_domain *ndomain; 2770 struct intel_iommu *iommu; 2771 u8 bus, devfn; 2772 2773 iommu = device_to_iommu(dev, &bus, &devfn); 2774 if (!iommu) 2775 return -ENODEV; 2776 2777 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2778 if (ndomain != domain) 2779 return -EBUSY; 2780 2781 return 0; 2782 } 2783 2784 static bool device_has_rmrr(struct device *dev) 2785 { 2786 struct dmar_rmrr_unit *rmrr; 2787 struct device *tmp; 2788 int i; 2789 2790 rcu_read_lock(); 2791 for_each_rmrr_units(rmrr) { 2792 /* 2793 * Return TRUE if this RMRR contains the device that 2794 * is passed in. 
2795 */ 2796 for_each_active_dev_scope(rmrr->devices, 2797 rmrr->devices_cnt, i, tmp) 2798 if (tmp == dev || 2799 is_downstream_to_pci_bridge(dev, tmp)) { 2800 rcu_read_unlock(); 2801 return true; 2802 } 2803 } 2804 rcu_read_unlock(); 2805 return false; 2806 } 2807 2808 /** 2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2810 * is relaxable (ie. is allowed to be not enforced under some conditions) 2811 * @dev: device handle 2812 * 2813 * We assume that PCI USB devices with RMRRs have them largely 2814 * for historical reasons and that the RMRR space is not actively used post 2815 * boot. This exclusion may change if vendors begin to abuse it. 2816 * 2817 * The same exception is made for graphics devices, with the requirement that 2818 * any use of the RMRR regions will be torn down before assigning the device 2819 * to a guest. 2820 * 2821 * Return: true if the RMRR is relaxable, false otherwise 2822 */ 2823 static bool device_rmrr_is_relaxable(struct device *dev) 2824 { 2825 struct pci_dev *pdev; 2826 2827 if (!dev_is_pci(dev)) 2828 return false; 2829 2830 pdev = to_pci_dev(dev); 2831 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2832 return true; 2833 else 2834 return false; 2835 } 2836 2837 /* 2838 * There are a couple cases where we need to restrict the functionality of 2839 * devices associated with RMRRs. The first is when evaluating a device for 2840 * identity mapping because problems exist when devices are moved in and out 2841 * of domains and their respective RMRR information is lost. This means that 2842 * a device with associated RMRRs will never be in a "passthrough" domain. 2843 * The second is use of the device through the IOMMU API. This interface 2844 * expects to have full control of the IOVA space for the device. We cannot 2845 * satisfy both the requirement that RMRR access is maintained and have an 2846 * unencumbered IOVA space. 
We also have no ability to quiesce the device's 2847 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2848 * We therefore prevent devices associated with an RMRR from participating in 2849 * the IOMMU API, which eliminates them from device assignment. 2850 * 2851 * In both cases, devices which have relaxable RMRRs are not concerned by this 2852 * restriction. See device_rmrr_is_relaxable comment. 2853 */ 2854 static bool device_is_rmrr_locked(struct device *dev) 2855 { 2856 if (!device_has_rmrr(dev)) 2857 return false; 2858 2859 if (device_rmrr_is_relaxable(dev)) 2860 return false; 2861 2862 return true; 2863 } 2864 2865 /* 2866 * Return the required default domain type for a specific device. 2867 * 2868 * @dev: the device in query 2869 * @startup: true if this is during early boot 2870 * 2871 * Returns: 2872 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2873 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2874 * - 0: both identity and dynamic domains work for this device 2875 */ 2876 static int device_def_domain_type(struct device *dev) 2877 { 2878 if (dev_is_pci(dev)) { 2879 struct pci_dev *pdev = to_pci_dev(dev); 2880 2881 /* 2882 * Prevent any device marked as untrusted from getting 2883 * placed into the statically identity mapping domain. 2884 */ 2885 if (pdev->untrusted) 2886 return IOMMU_DOMAIN_DMA; 2887 2888 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2889 return IOMMU_DOMAIN_IDENTITY; 2890 2891 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2892 return IOMMU_DOMAIN_IDENTITY; 2893 } 2894 2895 return 0; 2896 } 2897 2898 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2899 { 2900 /* 2901 * Start from the sane iommu hardware state. 2902 * If the queued invalidation is already initialized by us 2903 * (for example, while enabling interrupt-remapping) then 2904 * we got the things already rolling from a sane state. 
2905 */ 2906 if (!iommu->qi) { 2907 /* 2908 * Clear any previous faults. 2909 */ 2910 dmar_fault(-1, iommu); 2911 /* 2912 * Disable queued invalidation if supported and already enabled 2913 * before OS handover. 2914 */ 2915 dmar_disable_qi(iommu); 2916 } 2917 2918 if (dmar_enable_qi(iommu)) { 2919 /* 2920 * Queued Invalidate not enabled, use Register Based Invalidate 2921 */ 2922 iommu->flush.flush_context = __iommu_flush_context; 2923 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2924 pr_info("%s: Using Register based invalidation\n", 2925 iommu->name); 2926 } else { 2927 iommu->flush.flush_context = qi_flush_context; 2928 iommu->flush.flush_iotlb = qi_flush_iotlb; 2929 pr_info("%s: Using Queued invalidation\n", iommu->name); 2930 } 2931 } 2932 2933 static int copy_context_table(struct intel_iommu *iommu, 2934 struct root_entry *old_re, 2935 struct context_entry **tbl, 2936 int bus, bool ext) 2937 { 2938 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2939 struct context_entry *new_ce = NULL, ce; 2940 struct context_entry *old_ce = NULL; 2941 struct root_entry re; 2942 phys_addr_t old_ce_phys; 2943 2944 tbl_idx = ext ? bus * 2 : bus; 2945 memcpy(&re, old_re, sizeof(re)); 2946 2947 for (devfn = 0; devfn < 256; devfn++) { 2948 /* First calculate the correct index */ 2949 idx = (ext ? 
devfn * 2 : devfn) % 256; 2950 2951 if (idx == 0) { 2952 /* First save what we may have and clean up */ 2953 if (new_ce) { 2954 tbl[tbl_idx] = new_ce; 2955 __iommu_flush_cache(iommu, new_ce, 2956 VTD_PAGE_SIZE); 2957 pos = 1; 2958 } 2959 2960 if (old_ce) 2961 memunmap(old_ce); 2962 2963 ret = 0; 2964 if (devfn < 0x80) 2965 old_ce_phys = root_entry_lctp(&re); 2966 else 2967 old_ce_phys = root_entry_uctp(&re); 2968 2969 if (!old_ce_phys) { 2970 if (ext && devfn == 0) { 2971 /* No LCTP, try UCTP */ 2972 devfn = 0x7f; 2973 continue; 2974 } else { 2975 goto out; 2976 } 2977 } 2978 2979 ret = -ENOMEM; 2980 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2981 MEMREMAP_WB); 2982 if (!old_ce) 2983 goto out; 2984 2985 new_ce = alloc_pgtable_page(iommu->node); 2986 if (!new_ce) 2987 goto out_unmap; 2988 2989 ret = 0; 2990 } 2991 2992 /* Now copy the context entry */ 2993 memcpy(&ce, old_ce + idx, sizeof(ce)); 2994 2995 if (!__context_present(&ce)) 2996 continue; 2997 2998 did = context_domain_id(&ce); 2999 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3000 set_bit(did, iommu->domain_ids); 3001 3002 /* 3003 * We need a marker for copied context entries. This 3004 * marker needs to work for the old format as well as 3005 * for extended context entries. 3006 * 3007 * Bit 67 of the context entry is used. In the old 3008 * format this bit is available to software, in the 3009 * extended format it is the PGE bit, but PGE is ignored 3010 * by HW if PASIDs are disabled (and thus still 3011 * available). 3012 * 3013 * So disable PASIDs first and then mark the entry 3014 * copied. This means that we don't copy PASID 3015 * translations from the old kernel, but this is fine as 3016 * faults there are not fatal. 
3017 */ 3018 context_clear_pasid_enable(&ce); 3019 context_set_copied(&ce); 3020 3021 new_ce[idx] = ce; 3022 } 3023 3024 tbl[tbl_idx + pos] = new_ce; 3025 3026 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3027 3028 out_unmap: 3029 memunmap(old_ce); 3030 3031 out: 3032 return ret; 3033 } 3034 3035 static int copy_translation_tables(struct intel_iommu *iommu) 3036 { 3037 struct context_entry **ctxt_tbls; 3038 struct root_entry *old_rt; 3039 phys_addr_t old_rt_phys; 3040 int ctxt_table_entries; 3041 unsigned long flags; 3042 u64 rtaddr_reg; 3043 int bus, ret; 3044 bool new_ext, ext; 3045 3046 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3047 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3048 new_ext = !!ecap_ecs(iommu->ecap); 3049 3050 /* 3051 * The RTT bit can only be changed when translation is disabled, 3052 * but disabling translation means to open a window for data 3053 * corruption. So bail out and don't copy anything if we would 3054 * have to change the bit. 3055 */ 3056 if (new_ext != ext) 3057 return -EINVAL; 3058 3059 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3060 if (!old_rt_phys) 3061 return -EINVAL; 3062 3063 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3064 if (!old_rt) 3065 return -ENOMEM; 3066 3067 /* This is too big for the stack - allocate it from slab */ 3068 ctxt_table_entries = ext ? 512 : 256; 3069 ret = -ENOMEM; 3070 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3071 if (!ctxt_tbls) 3072 goto out_unmap; 3073 3074 for (bus = 0; bus < 256; bus++) { 3075 ret = copy_context_table(iommu, &old_rt[bus], 3076 ctxt_tbls, bus, ext); 3077 if (ret) { 3078 pr_err("%s: Failed to copy context table for bus %d\n", 3079 iommu->name, bus); 3080 continue; 3081 } 3082 } 3083 3084 spin_lock_irqsave(&iommu->lock, flags); 3085 3086 /* Context tables are copied, now write them to the root_entry table */ 3087 for (bus = 0; bus < 256; bus++) { 3088 int idx = ext ? 
bus * 2 : bus; 3089 u64 val; 3090 3091 if (ctxt_tbls[idx]) { 3092 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3093 iommu->root_entry[bus].lo = val; 3094 } 3095 3096 if (!ext || !ctxt_tbls[idx + 1]) 3097 continue; 3098 3099 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3100 iommu->root_entry[bus].hi = val; 3101 } 3102 3103 spin_unlock_irqrestore(&iommu->lock, flags); 3104 3105 kfree(ctxt_tbls); 3106 3107 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3108 3109 ret = 0; 3110 3111 out_unmap: 3112 memunmap(old_rt); 3113 3114 return ret; 3115 } 3116 3117 #ifdef CONFIG_INTEL_IOMMU_SVM 3118 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3119 { 3120 struct intel_iommu *iommu = data; 3121 ioasid_t ioasid; 3122 3123 if (!iommu) 3124 return INVALID_IOASID; 3125 /* 3126 * VT-d virtual command interface always uses the full 20 bit 3127 * PASID range. Host can partition guest PASID range based on 3128 * policies but it is out of guest's control. 3129 */ 3130 if (min < PASID_MIN || max > intel_pasid_max_id) 3131 return INVALID_IOASID; 3132 3133 if (vcmd_alloc_pasid(iommu, &ioasid)) 3134 return INVALID_IOASID; 3135 3136 return ioasid; 3137 } 3138 3139 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3140 { 3141 struct intel_iommu *iommu = data; 3142 3143 if (!iommu) 3144 return; 3145 /* 3146 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3147 * We can only free the PASID when all the devices are unbound. 3148 */ 3149 if (ioasid_find(NULL, ioasid, NULL)) { 3150 pr_alert("Cannot free active IOASID %d\n", ioasid); 3151 return; 3152 } 3153 vcmd_free_pasid(iommu, ioasid); 3154 } 3155 3156 static void register_pasid_allocator(struct intel_iommu *iommu) 3157 { 3158 /* 3159 * If we are running in the host, no need for custom allocator 3160 * in that PASIDs are allocated from the host system-wide. 
3161 */ 3162 if (!cap_caching_mode(iommu->cap)) 3163 return; 3164 3165 if (!sm_supported(iommu)) { 3166 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3167 return; 3168 } 3169 3170 /* 3171 * Register a custom PASID allocator if we are running in a guest, 3172 * guest PASID must be obtained via virtual command interface. 3173 * There can be multiple vIOMMUs in each guest but only one allocator 3174 * is active. All vIOMMU allocators will eventually be calling the same 3175 * host allocator. 3176 */ 3177 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) 3178 return; 3179 3180 pr_info("Register custom PASID allocator\n"); 3181 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3182 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3183 iommu->pasid_allocator.pdata = (void *)iommu; 3184 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3185 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3186 /* 3187 * Disable scalable mode on this IOMMU if there 3188 * is no custom allocator. Mixing SM capable vIOMMU 3189 * and non-SM vIOMMU are not supported. 
3190 */ 3191 intel_iommu_sm = 0; 3192 } 3193 } 3194 #endif 3195 3196 static int __init init_dmars(void) 3197 { 3198 struct dmar_drhd_unit *drhd; 3199 struct intel_iommu *iommu; 3200 int ret; 3201 3202 /* 3203 * for each drhd 3204 * allocate root 3205 * initialize and program root entry to not present 3206 * endfor 3207 */ 3208 for_each_drhd_unit(drhd) { 3209 /* 3210 * lock not needed as this is only incremented in the single 3211 * threaded kernel __init code path all other access are read 3212 * only 3213 */ 3214 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3215 g_num_of_iommus++; 3216 continue; 3217 } 3218 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3219 } 3220 3221 /* Preallocate enough resources for IOMMU hot-addition */ 3222 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3223 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3224 3225 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3226 GFP_KERNEL); 3227 if (!g_iommus) { 3228 pr_err("Allocating global iommu array failed\n"); 3229 ret = -ENOMEM; 3230 goto error; 3231 } 3232 3233 for_each_iommu(iommu, drhd) { 3234 if (drhd->ignored) { 3235 iommu_disable_translation(iommu); 3236 continue; 3237 } 3238 3239 /* 3240 * Find the max pasid size of all IOMMU's in the system. 3241 * We need to ensure the system pasid table is no bigger 3242 * than the smallest supported. 
3243 */ 3244 if (pasid_supported(iommu)) { 3245 u32 temp = 2 << ecap_pss(iommu->ecap); 3246 3247 intel_pasid_max_id = min_t(u32, temp, 3248 intel_pasid_max_id); 3249 } 3250 3251 g_iommus[iommu->seq_id] = iommu; 3252 3253 intel_iommu_init_qi(iommu); 3254 3255 ret = iommu_init_domains(iommu); 3256 if (ret) 3257 goto free_iommu; 3258 3259 init_translation_status(iommu); 3260 3261 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3262 iommu_disable_translation(iommu); 3263 clear_translation_pre_enabled(iommu); 3264 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3265 iommu->name); 3266 } 3267 3268 /* 3269 * TBD: 3270 * we could share the same root & context tables 3271 * among all IOMMU's. Need to Split it later. 3272 */ 3273 ret = iommu_alloc_root_entry(iommu); 3274 if (ret) 3275 goto free_iommu; 3276 3277 if (translation_pre_enabled(iommu)) { 3278 pr_info("Translation already enabled - trying to copy translation structures\n"); 3279 3280 ret = copy_translation_tables(iommu); 3281 if (ret) { 3282 /* 3283 * We found the IOMMU with translation 3284 * enabled - but failed to copy over the 3285 * old root-entry table. Try to proceed 3286 * by disabling translation now and 3287 * allocating a clean root-entry table. 3288 * This might cause DMAR faults, but 3289 * probably the dump will still succeed. 3290 */ 3291 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3292 iommu->name); 3293 iommu_disable_translation(iommu); 3294 clear_translation_pre_enabled(iommu); 3295 } else { 3296 pr_info("Copied translation tables from previous kernel for %s\n", 3297 iommu->name); 3298 } 3299 } 3300 3301 if (!ecap_pass_through(iommu->ecap)) 3302 hw_pass_through = 0; 3303 intel_svm_check(iommu); 3304 } 3305 3306 /* 3307 * Now that qi is enabled on all iommus, set the root entry and flush 3308 * caches. This is required on some Intel X58 chipsets, otherwise the 3309 * flush_context function will loop forever and the boot hangs. 
3310 */ 3311 for_each_active_iommu(iommu, drhd) { 3312 iommu_flush_write_buffer(iommu); 3313 #ifdef CONFIG_INTEL_IOMMU_SVM 3314 register_pasid_allocator(iommu); 3315 #endif 3316 iommu_set_root_entry(iommu); 3317 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3318 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3319 } 3320 3321 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3322 dmar_map_gfx = 0; 3323 #endif 3324 3325 if (!dmar_map_gfx) 3326 iommu_identity_mapping |= IDENTMAP_GFX; 3327 3328 check_tylersburg_isoch(); 3329 3330 ret = si_domain_init(hw_pass_through); 3331 if (ret) 3332 goto free_iommu; 3333 3334 /* 3335 * for each drhd 3336 * enable fault log 3337 * global invalidate context cache 3338 * global invalidate iotlb 3339 * enable translation 3340 */ 3341 for_each_iommu(iommu, drhd) { 3342 if (drhd->ignored) { 3343 /* 3344 * we always have to disable PMRs or DMA may fail on 3345 * this device 3346 */ 3347 if (force_on) 3348 iommu_disable_protect_mem_regions(iommu); 3349 continue; 3350 } 3351 3352 iommu_flush_write_buffer(iommu); 3353 3354 #ifdef CONFIG_INTEL_IOMMU_SVM 3355 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3356 /* 3357 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3358 * could cause possible lock race condition. 
3359 */ 3360 up_write(&dmar_global_lock); 3361 ret = intel_svm_enable_prq(iommu); 3362 down_write(&dmar_global_lock); 3363 if (ret) 3364 goto free_iommu; 3365 } 3366 #endif 3367 ret = dmar_set_interrupt(iommu); 3368 if (ret) 3369 goto free_iommu; 3370 } 3371 3372 return 0; 3373 3374 free_iommu: 3375 for_each_active_iommu(iommu, drhd) { 3376 disable_dmar_iommu(iommu); 3377 free_dmar_iommu(iommu); 3378 } 3379 3380 kfree(g_iommus); 3381 3382 error: 3383 return ret; 3384 } 3385 3386 /* This takes a number of _MM_ pages, not VTD pages */ 3387 static unsigned long intel_alloc_iova(struct device *dev, 3388 struct dmar_domain *domain, 3389 unsigned long nrpages, uint64_t dma_mask) 3390 { 3391 unsigned long iova_pfn; 3392 3393 /* 3394 * Restrict dma_mask to the width that the iommu can handle. 3395 * First-level translation restricts the input-address to a 3396 * canonical address (i.e., address bits 63:N have the same 3397 * value as address bit [N-1], where N is 48-bits with 4-level 3398 * paging and 57-bits with 5-level paging). Hence, skip bit 3399 * [N-1]. 
3400 */ 3401 if (domain_use_first_level(domain)) 3402 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), 3403 dma_mask); 3404 else 3405 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), 3406 dma_mask); 3407 3408 /* Ensure we reserve the whole size-aligned region */ 3409 nrpages = __roundup_pow_of_two(nrpages); 3410 3411 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { 3412 /* 3413 * First try to allocate an io virtual address in 3414 * DMA_BIT_MASK(32) and if that fails then try allocating 3415 * from higher range 3416 */ 3417 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3418 IOVA_PFN(DMA_BIT_MASK(32)), false); 3419 if (iova_pfn) 3420 return iova_pfn; 3421 } 3422 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3423 IOVA_PFN(dma_mask), true); 3424 if (unlikely(!iova_pfn)) { 3425 dev_err_once(dev, "Allocating %ld-page iova failed\n", 3426 nrpages); 3427 return 0; 3428 } 3429 3430 return iova_pfn; 3431 } 3432 3433 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, 3434 size_t size, int dir, u64 dma_mask) 3435 { 3436 struct dmar_domain *domain; 3437 phys_addr_t start_paddr; 3438 unsigned long iova_pfn; 3439 int prot = 0; 3440 int ret; 3441 struct intel_iommu *iommu; 3442 unsigned long paddr_pfn = paddr >> PAGE_SHIFT; 3443 3444 BUG_ON(dir == DMA_NONE); 3445 3446 if (unlikely(attach_deferred(dev))) 3447 do_deferred_attach(dev); 3448 3449 domain = find_domain(dev); 3450 if (!domain) 3451 return DMA_MAPPING_ERROR; 3452 3453 iommu = domain_get_iommu(domain); 3454 size = aligned_nrpages(paddr, size); 3455 3456 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); 3457 if (!iova_pfn) 3458 goto error; 3459 3460 /* 3461 * Check if DMAR supports zero-length reads on write only 3462 * mappings.. 
3463 */ 3464 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3465 !cap_zlr(iommu->cap)) 3466 prot |= DMA_PTE_READ; 3467 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3468 prot |= DMA_PTE_WRITE; 3469 /* 3470 * paddr - (paddr + size) might be partial page, we should map the whole 3471 * page. Note: if two part of one page are separately mapped, we 3472 * might have two guest_addr mapping to the same host paddr, but this 3473 * is not a big problem 3474 */ 3475 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3476 mm_to_dma_pfn(paddr_pfn), size, prot); 3477 if (ret) 3478 goto error; 3479 3480 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; 3481 start_paddr += paddr & ~PAGE_MASK; 3482 3483 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); 3484 3485 return start_paddr; 3486 3487 error: 3488 if (iova_pfn) 3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3490 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", 3491 size, (unsigned long long)paddr, dir); 3492 return DMA_MAPPING_ERROR; 3493 } 3494 3495 static dma_addr_t intel_map_page(struct device *dev, struct page *page, 3496 unsigned long offset, size_t size, 3497 enum dma_data_direction dir, 3498 unsigned long attrs) 3499 { 3500 return __intel_map_single(dev, page_to_phys(page) + offset, 3501 size, dir, *dev->dma_mask); 3502 } 3503 3504 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, 3505 size_t size, enum dma_data_direction dir, 3506 unsigned long attrs) 3507 { 3508 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); 3509 } 3510 3511 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) 3512 { 3513 struct dmar_domain *domain; 3514 unsigned long start_pfn, last_pfn; 3515 unsigned long nrpages; 3516 unsigned long iova_pfn; 3517 struct intel_iommu *iommu; 3518 struct page *freelist; 3519 struct pci_dev *pdev = NULL; 3520 3521 domain = find_domain(dev); 3522 BUG_ON(!domain); 
3523 3524 iommu = domain_get_iommu(domain); 3525 3526 iova_pfn = IOVA_PFN(dev_addr); 3527 3528 nrpages = aligned_nrpages(dev_addr, size); 3529 start_pfn = mm_to_dma_pfn(iova_pfn); 3530 last_pfn = start_pfn + nrpages - 1; 3531 3532 if (dev_is_pci(dev)) 3533 pdev = to_pci_dev(dev); 3534 3535 freelist = domain_unmap(domain, start_pfn, last_pfn); 3536 if (intel_iommu_strict || (pdev && pdev->untrusted) || 3537 !has_iova_flush_queue(&domain->iovad)) { 3538 iommu_flush_iotlb_psi(iommu, domain, start_pfn, 3539 nrpages, !freelist, 0); 3540 /* free iova */ 3541 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3542 dma_free_pagelist(freelist); 3543 } else { 3544 queue_iova(&domain->iovad, iova_pfn, nrpages, 3545 (unsigned long)freelist); 3546 /* 3547 * queue up the release of the unmap to save the 1/6th of the 3548 * cpu used up by the iotlb flush operation... 3549 */ 3550 } 3551 3552 trace_unmap_single(dev, dev_addr, size); 3553 } 3554 3555 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, 3556 size_t size, enum dma_data_direction dir, 3557 unsigned long attrs) 3558 { 3559 intel_unmap(dev, dev_addr, size); 3560 } 3561 3562 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, 3563 size_t size, enum dma_data_direction dir, unsigned long attrs) 3564 { 3565 intel_unmap(dev, dev_addr, size); 3566 } 3567 3568 static void *intel_alloc_coherent(struct device *dev, size_t size, 3569 dma_addr_t *dma_handle, gfp_t flags, 3570 unsigned long attrs) 3571 { 3572 struct page *page = NULL; 3573 int order; 3574 3575 if (unlikely(attach_deferred(dev))) 3576 do_deferred_attach(dev); 3577 3578 size = PAGE_ALIGN(size); 3579 order = get_order(size); 3580 3581 if (gfpflags_allow_blocking(flags)) { 3582 unsigned int count = size >> PAGE_SHIFT; 3583 3584 page = dma_alloc_from_contiguous(dev, count, order, 3585 flags & __GFP_NOWARN); 3586 } 3587 3588 if (!page) 3589 page = alloc_pages(flags, order); 3590 if (!page) 3591 return NULL; 3592 
memset(page_address(page), 0, size);

	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
					 DMA_BIDIRECTIONAL,
					 dev->coherent_dma_mask);
	if (*dma_handle != DMA_MAPPING_ERROR)
		return page_address(page);
	/* Mapping failed: hand the pages back to whichever allocator owns them. */
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);

	return NULL;
}

/*
 * dma_map_ops.free: unmap @dma_handle and release the pages behind
 * @vaddr. @size is page-aligned the same way intel_alloc_coherent()
 * aligned it. @attrs is not used.
 */
static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
				dma_addr_t dma_handle, unsigned long attrs)
{
	int order;
	struct page *page = virt_to_page(vaddr);

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap(dev, dma_handle, size);
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);
}

/*
 * dma_map_ops.unmap_sg: unmap a scatterlist as one range that starts at
 * the page containing the first entry's DMA address and spans the sum
 * of every entry's page count. @dir and @attrs are not used.
 */
static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   unsigned long attrs)
{
	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
	unsigned long nrpages = 0;
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
	}

	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);

	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
}

/*
 * dma_map_ops.map_sg: map all @nelems entries of @sglist into a single
 * IOVA allocation.
 *
 * Returns @nelems on success and 0 on failure (no domain, IOVA
 * exhaustion, or a page-table mapping error). @attrs is not used.
 */
static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, unsigned long attrs)
{
	int i;
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	unsigned long iova_pfn;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);

	if (unlikely(attach_deferred(dev)))
		do_deferred_attach(dev);

	domain = find_domain(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	/* 'size' accumulates a page count, not bytes. */
	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				*dev->dma_mask);
	if (!iova_pfn) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova_pfn);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* Undo any partially built page tables, then drop the IOVA. */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1,
				       agaw_to_level(domain->agaw) + 1);
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
		return 0;
	}

	for_each_sg(sglist, sg, nelems, i)
		trace_map_sg(dev, i + 1, nelems, sg);

	return nelems;
}

/* dma_map_ops.get_required_mask: always reports a 32-bit mask. */
static u64 intel_get_required_mask(struct device *dev)
{
	return DMA_BIT_MASK(32);
}

/* DMA operations that translate through the IOMMU without bouncing. */
static const struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.map_resource = intel_map_resource,
	.unmap_resource = intel_unmap_resource,
	.dma_supported = dma_direct_supported,
	.mmap = dma_common_mmap,
	.get_sgtable = dma_common_get_sgtable,
	.get_required_mask = intel_get_required_mask,
};

/*
 * Sync helper shared by the bounce_sync_* callbacks: translate the bus
 * address back to a physical address and, if that address lies in the
 * swiotlb pool, let swiotlb perform the sync for @target.
 */
static void
bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
		   enum dma_data_direction dir, enum dma_sync_target target)
{
	struct dmar_domain *domain;
	phys_addr_t tlb_addr;

	domain = find_domain(dev);
	if (WARN_ON(!domain))
		return;

	tlb_addr =
intel_iommu_iova_to_phys(&domain->domain, addr);
	if (is_swiotlb_buffer(tlb_addr))
		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
}

/*
 * Map [@paddr, @paddr + @size) for DMA, going through a swiotlb bounce
 * buffer whenever the start address or the size is not VT-d page
 * aligned.
 *
 * Returns the bus address (start of the allocated IOVA page) or
 * DMA_MAPPING_ERROR. Both a missing domain and dir == DMA_NONE trigger
 * a WARN and fail the mapping.
 */
static dma_addr_t
bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
		  enum dma_data_direction dir, unsigned long attrs,
		  u64 dma_mask)
{
	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	unsigned long iova_pfn;
	unsigned long nrpages;
	phys_addr_t tlb_addr;
	int prot = 0;
	int ret;

	if (unlikely(attach_deferred(dev)))
		do_deferred_attach(dev);

	domain = find_domain(dev);

	if (WARN_ON(dir == DMA_NONE || !domain))
		return DMA_MAPPING_ERROR;

	iommu = domain_get_iommu(domain);
	if (WARN_ON(!iommu))
		return DMA_MAPPING_ERROR;

	/* Offset 0: the page count is computed from the size alone. */
	nrpages = aligned_nrpages(0, size);
	iova_pfn = intel_alloc_iova(dev, domain,
				    dma_to_mm_pfn(nrpages), dma_mask);
	if (!iova_pfn)
		return DMA_MAPPING_ERROR;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	/*
	 * If both the physical buffer start address and size are
	 * page aligned, we don't need to use a bounce page.
	 */
	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
		tlb_addr = swiotlb_tbl_map_single(dev,
				__phys_to_dma(dev, io_tlb_start),
				paddr, size, aligned_size, dir, attrs);
		if (tlb_addr == DMA_MAPPING_ERROR) {
			goto swiotlb_error;
		} else {
			/* Cleanup the padding area. */
			void *padding_start = phys_to_virt(tlb_addr);
			size_t padding_size = aligned_size;

			/*
			 * For device-readable mappings with CPU sync enabled,
			 * only the tail past 'size' is zeroed (presumably
			 * because swiotlb already copied the payload into the
			 * first 'size' bytes); otherwise the whole aligned
			 * slot is zeroed.
			 */
			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
			    (dir == DMA_TO_DEVICE ||
			     dir == DMA_BIDIRECTIONAL)) {
				padding_start += size;
				padding_size -= size;
			}

			memset(padding_start, 0, padding_size);
		}
	} else {
		tlb_addr = paddr;
	}

	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
	if (ret)
		goto mapping_error;

	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);

	return (phys_addr_t)iova_pfn << PAGE_SHIFT;

mapping_error:
	/* Only release a bounce slot if one was actually taken. */
	if (is_swiotlb_buffer(tlb_addr))
		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
					 aligned_size, dir, attrs);
swiotlb_error:
	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
		size, (unsigned long long)paddr, dir);

	return DMA_MAPPING_ERROR;
}

/*
 * Undo bounce_map_single(): remove the IOMMU mapping and, if the
 * mapping was backed by a swiotlb slot, release that slot.
 */
static void
bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
		    enum dma_data_direction dir, unsigned long attrs)
{
	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
	struct dmar_domain *domain;
	phys_addr_t tlb_addr;

	domain = find_domain(dev);
	if (WARN_ON(!domain))
		return;

	/* Resolve the backing address before the translation is torn down. */
	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
	if (WARN_ON(!tlb_addr))
		return;

	intel_unmap(dev, dev_addr, size);
	if (is_swiotlb_buffer(tlb_addr))
		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
					 aligned_size, dir, attrs);

	trace_bounce_unmap_single(dev, dev_addr, size);
}

/* dma_map_ops.map_page for the bounce ops; uses the streaming DMA mask. */
static dma_addr_t
bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	return bounce_map_single(dev, page_to_phys(page) +
offset,
				 size, dir, attrs, *dev->dma_mask);
}

/* dma_map_ops.map_resource for the bounce ops; uses the streaming DMA mask. */
static dma_addr_t
bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
		    enum dma_data_direction dir, unsigned long attrs)
{
	return bounce_map_single(dev, phys_addr, size,
				 dir, attrs, *dev->dma_mask);
}

/* dma_map_ops.unmap_page for the bounce ops. */
static void
bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
		  enum dma_data_direction dir, unsigned long attrs)
{
	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
}

/* dma_map_ops.unmap_resource for the bounce ops. */
static void
bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
		      enum dma_data_direction dir, unsigned long attrs)
{
	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
}

/* dma_map_ops.unmap_sg for the bounce ops: unmap each entry on its own. */
static void
bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
		enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i)
		bounce_unmap_page(dev, sg->dma_address,
				  sg_dma_len(sg), dir, attrs);
}

/*
 * dma_map_ops.map_sg for the bounce ops: map every entry individually.
 *
 * Returns @nelems on success. If any entry fails, the entries mapped so
 * far are unmapped again and 0 is returned.
 */
static int
bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
	      enum dma_data_direction dir, unsigned long attrs)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		sg->dma_address = bounce_map_page(dev, sg_page(sg),
						  sg->offset, sg->length,
						  dir, attrs);
		if (sg->dma_address == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(sg) = sg->length;
	}

	for_each_sg(sglist, sg, nelems, i)
		trace_bounce_map_sg(dev, i + 1, nelems, sg);

	return nelems;

out_unmap:
	/* 'i' is the index of the failed entry, i.e. the count already mapped. */
	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	return 0;
}

/* dma_map_ops.sync_single_for_cpu for the bounce ops. */
static void
bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
			   size_t size, enum dma_data_direction dir)
{
	bounce_sync_single(dev, addr,
size, dir, SYNC_FOR_CPU);
}

/* dma_map_ops.sync_single_for_device for the bounce ops. */
static void
bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
			      size_t size, enum dma_data_direction dir)
{
	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
}

/* dma_map_ops.sync_sg_for_cpu for the bounce ops: sync entry by entry. */
static void
bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
		       int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i)
		bounce_sync_single(dev, sg_dma_address(sg),
				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
}

/* dma_map_ops.sync_sg_for_device for the bounce ops: sync entry by entry. */
static void
bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
			  int nelems, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i)
		bounce_sync_single(dev, sg_dma_address(sg),
				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
}

/*
 * DMA operations that bounce unaligned buffers through swiotlb.
 * Coherent allocations share the non-bouncing implementation.
 */
static const struct dma_map_ops bounce_dma_ops = {
	.alloc			= intel_alloc_coherent,
	.free			= intel_free_coherent,
	.map_sg			= bounce_map_sg,
	.unmap_sg		= bounce_unmap_sg,
	.map_page		= bounce_map_page,
	.unmap_page		= bounce_unmap_page,
	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
	.sync_single_for_device	= bounce_sync_single_for_device,
	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
	.sync_sg_for_device	= bounce_sync_sg_for_device,
	.map_resource		= bounce_map_resource,
	.unmap_resource		= bounce_unmap_resource,
	.dma_supported		= dma_direct_supported,
};

/*
 * Create the slab cache backing struct dmar_domain allocations.
 * Returns 0 on success or -ENOMEM.
 */
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					 sizeof(struct dmar_domain),
					 0,
					 SLAB_HWCACHE_ALIGN,

					 NULL);
	if (!iommu_domain_cache) {
		pr_err("Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

/*
 * Create the slab cache backing struct device_domain_info allocations.
 * Returns 0 on success or -ENOMEM.
 */
static inline int iommu_devinfo_cache_init(void)
{
	int
ret = 0; 3992 3993 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3994 sizeof(struct device_domain_info), 3995 0, 3996 SLAB_HWCACHE_ALIGN, 3997 NULL); 3998 if (!iommu_devinfo_cache) { 3999 pr_err("Couldn't create devinfo cache\n"); 4000 ret = -ENOMEM; 4001 } 4002 4003 return ret; 4004 } 4005 4006 static int __init iommu_init_mempool(void) 4007 { 4008 int ret; 4009 ret = iova_cache_get(); 4010 if (ret) 4011 return ret; 4012 4013 ret = iommu_domain_cache_init(); 4014 if (ret) 4015 goto domain_error; 4016 4017 ret = iommu_devinfo_cache_init(); 4018 if (!ret) 4019 return ret; 4020 4021 kmem_cache_destroy(iommu_domain_cache); 4022 domain_error: 4023 iova_cache_put(); 4024 4025 return -ENOMEM; 4026 } 4027 4028 static void __init iommu_exit_mempool(void) 4029 { 4030 kmem_cache_destroy(iommu_devinfo_cache); 4031 kmem_cache_destroy(iommu_domain_cache); 4032 iova_cache_put(); 4033 } 4034 4035 static void __init init_no_remapping_devices(void) 4036 { 4037 struct dmar_drhd_unit *drhd; 4038 struct device *dev; 4039 int i; 4040 4041 for_each_drhd_unit(drhd) { 4042 if (!drhd->include_all) { 4043 for_each_active_dev_scope(drhd->devices, 4044 drhd->devices_cnt, i, dev) 4045 break; 4046 /* ignore DMAR unit if no devices exist */ 4047 if (i == drhd->devices_cnt) 4048 drhd->ignored = 1; 4049 } 4050 } 4051 4052 for_each_active_drhd_unit(drhd) { 4053 if (drhd->include_all) 4054 continue; 4055 4056 for_each_active_dev_scope(drhd->devices, 4057 drhd->devices_cnt, i, dev) 4058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 4059 break; 4060 if (i < drhd->devices_cnt) 4061 continue; 4062 4063 /* This IOMMU has *only* gfx devices. 
Either bypass it or 4064 set the gfx_mapped flag, as appropriate */ 4065 drhd->gfx_dedicated = 1; 4066 if (!dmar_map_gfx) 4067 drhd->ignored = 1; 4068 } 4069 } 4070 4071 #ifdef CONFIG_SUSPEND 4072 static int init_iommu_hw(void) 4073 { 4074 struct dmar_drhd_unit *drhd; 4075 struct intel_iommu *iommu = NULL; 4076 4077 for_each_active_iommu(iommu, drhd) 4078 if (iommu->qi) 4079 dmar_reenable_qi(iommu); 4080 4081 for_each_iommu(iommu, drhd) { 4082 if (drhd->ignored) { 4083 /* 4084 * we always have to disable PMRs or DMA may fail on 4085 * this device 4086 */ 4087 if (force_on) 4088 iommu_disable_protect_mem_regions(iommu); 4089 continue; 4090 } 4091 4092 iommu_flush_write_buffer(iommu); 4093 4094 iommu_set_root_entry(iommu); 4095 4096 iommu->flush.flush_context(iommu, 0, 0, 0, 4097 DMA_CCMD_GLOBAL_INVL); 4098 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 4099 iommu_enable_translation(iommu); 4100 iommu_disable_protect_mem_regions(iommu); 4101 } 4102 4103 return 0; 4104 } 4105 4106 static void iommu_flush_all(void) 4107 { 4108 struct dmar_drhd_unit *drhd; 4109 struct intel_iommu *iommu; 4110 4111 for_each_active_iommu(iommu, drhd) { 4112 iommu->flush.flush_context(iommu, 0, 0, 0, 4113 DMA_CCMD_GLOBAL_INVL); 4114 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 4115 DMA_TLB_GLOBAL_FLUSH); 4116 } 4117 } 4118 4119 static int iommu_suspend(void) 4120 { 4121 struct dmar_drhd_unit *drhd; 4122 struct intel_iommu *iommu = NULL; 4123 unsigned long flag; 4124 4125 for_each_active_iommu(iommu, drhd) { 4126 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 4127 GFP_ATOMIC); 4128 if (!iommu->iommu_state) 4129 goto nomem; 4130 } 4131 4132 iommu_flush_all(); 4133 4134 for_each_active_iommu(iommu, drhd) { 4135 iommu_disable_translation(iommu); 4136 4137 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4138 4139 iommu->iommu_state[SR_DMAR_FECTL_REG] = 4140 readl(iommu->reg + DMAR_FECTL_REG); 4141 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 4142 readl(iommu->reg + 
DMAR_FEDATA_REG); 4143 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 4144 readl(iommu->reg + DMAR_FEADDR_REG); 4145 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 4146 readl(iommu->reg + DMAR_FEUADDR_REG); 4147 4148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4149 } 4150 return 0; 4151 4152 nomem: 4153 for_each_active_iommu(iommu, drhd) 4154 kfree(iommu->iommu_state); 4155 4156 return -ENOMEM; 4157 } 4158 4159 static void iommu_resume(void) 4160 { 4161 struct dmar_drhd_unit *drhd; 4162 struct intel_iommu *iommu = NULL; 4163 unsigned long flag; 4164 4165 if (init_iommu_hw()) { 4166 if (force_on) 4167 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 4168 else 4169 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 4170 return; 4171 } 4172 4173 for_each_active_iommu(iommu, drhd) { 4174 4175 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4176 4177 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 4178 iommu->reg + DMAR_FECTL_REG); 4179 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 4180 iommu->reg + DMAR_FEDATA_REG); 4181 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 4182 iommu->reg + DMAR_FEADDR_REG); 4183 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 4184 iommu->reg + DMAR_FEUADDR_REG); 4185 4186 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4187 } 4188 4189 for_each_active_iommu(iommu, drhd) 4190 kfree(iommu->iommu_state); 4191 } 4192 4193 static struct syscore_ops iommu_syscore_ops = { 4194 .resume = iommu_resume, 4195 .suspend = iommu_suspend, 4196 }; 4197 4198 static void __init init_iommu_pm_ops(void) 4199 { 4200 register_syscore_ops(&iommu_syscore_ops); 4201 } 4202 4203 #else 4204 static inline void init_iommu_pm_ops(void) {} 4205 #endif /* CONFIG_PM */ 4206 4207 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 4208 { 4209 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 4210 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 4211 rmrr->end_address <= rmrr->base_address || 4212 arch_rmrr_sanity_check(rmrr)) 
return -EINVAL;

	return 0;
}

/*
 * Parse one RMRR entry of the DMAR table and add it to dmar_rmrr_units.
 *
 * An entry that fails rmrr_sanity_check() is reported as a firmware bug
 * and taints the kernel, but is still recorded. Returns 0 on success
 * or -ENOMEM. @arg is not used.
 */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrr = (struct acpi_dmar_reserved_memory *)header;
	if (rmrr_sanity_check(rmrr)) {
		pr_warn(FW_BUG
			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			   rmrr->base_address, rmrr->end_address,
			   dmi_get_system_info(DMI_BIOS_VENDOR),
			   dmi_get_system_info(DMI_BIOS_VERSION),
			   dmi_get_system_info(DMI_PRODUCT_VERSION));
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
	}

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;

	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	/* The device scope entries follow the fixed-size RMRR structure. */
	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	/* A non-zero count with no array means the allocation failed. */
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_rmrru;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}

/*
 * Look up the registered ATSR unit whose stored header is byte-for-byte
 * identical to @atsr. Returns the unit or NULL.
 */
static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
{
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *tmp;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
				dmar_rcu_check()) {
		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
		/* Cheap rejections first; memcmp only on equal lengths. */
		if (atsr->segment != tmp->segment)
			continue;
		if (atsr->header.length != tmp->header.length)
			continue;
		if (memcmp(atsr, tmp, atsr->header.length) == 0)
			return atsru;
	}

	return NULL;
}

/*
 * Parse one ATSR entry and add it to dmar_atsr_units unless an
 * identical entry is already registered.
 *
 * Returns 0 on success (including "already present", and "IOMMU not
 * enabled" once the system is running) or -ENOMEM. @arg is not used.
 */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	/* One allocation holds the unit followed by a copy of the header. */
	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}

/* Free an ATSR unit together with its device scope array. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}

/*
 * Remove and free the ATSR unit matching @hdr, if one is registered.
 * Always returns 0. @arg is not used.
 */
int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		/* Wait for RCU readers before the unit is freed. */
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}

/*
 * Check whether the ATSR unit matching @hdr still has an active device
 * in its scope. Returns -EBUSY if so, 0 otherwise (including when no
 * unit matches). @arg is not used.
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
return -EBUSY;
	}

	return 0;
}

/*
 * Bring a hot-added DMAR unit into service.
 *
 * The unit is rejected with -ENXIO if it lacks a capability the running
 * configuration relies on (hardware pass-through, snooping, the
 * superpage size in use). Otherwise its domain bookkeeping and root
 * entry are set up and, unless the unit is marked ignored, queued
 * invalidation, the optional page-request queue, the fault interrupt
 * and translation are enabled.
 *
 * Returns 0 on success (also when the unit is already registered) or a
 * negative errno.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	/* Already registered: nothing to do. */
	if (g_iommus[iommu->seq_id])
		return 0;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/* Load the root table and flush both caches before enabling. */
	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}

/*
 * Hot-plug entry point for a DMAR unit: add it when @insert is true,
 * otherwise disable and free it.
 *
 * Returns 0 when the IOMMU driver is not enabled, -EINVAL when the unit
 * has no IOMMU attached, otherwise the result of the add (removal
 * always yields 0).
 */
int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}

/* Free every registered RMRR and ATSR unit and empty both lists. */
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}

/*
 * Decide whether ATS may be used for @dev.
 *
 * Returns 1 if the device is integrated (no upstream bridge) or if the
 * bridge the upward walk stops at is covered by an ATSR unit of the
 * same PCI segment (listed explicitly or via include_all); returns 0
 * otherwise, including when the path crosses a non-PCIe bridge.
 */
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment !=
pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	/* No unit matched: ATS is not allowed. */
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}

/*
 * Keep the RMRR and ATSR device scopes in sync with PCI hot-plug: on
 * BUS_NOTIFY_ADD_DEVICE the device is inserted into every scope that
 * names it, on BUS_NOTIFY_REMOVED_DEVICE it is removed again.
 *
 * Returns 0, or the negative error from dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* include_all units carry no explicit device scope. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			/* Unlike the RMRR loop, stop at the first ATSR match. */
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}

/*
 * Memory hot-plug notifier that keeps si_domain's identity map in step
 * with memory going online or offline.
 *
 * Returns NOTIFY_BAD if the identity map for memory going online cannot
 * be built, NOTIFY_OK otherwise.
 */
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
			mhp->nr_pages - 1);

	switch (val) {
	case MEM_GOING_ONLINE:
		if (iommu_domain_identity_map(si_domain,
					      start_vpfn, last_vpfn)) {
			pr_warn("Failed to build identity map for [%lx-%lx]\n",
				start_vpfn, last_vpfn);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		{
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			struct page *freelist;

			freelist = domain_unmap(si_domain,
						start_vpfn, last_vpfn);

			/* Flush on every active unit before freeing the pages. */
			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					start_vpfn, mhp->nr_pages,
					!freelist, 0);
			rcu_read_unlock();
			dma_free_pagelist(freelist);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};

/*
 * Release the IOVA ranges cached for @cpu in every DMA-type domain of
 * every registered IOMMU.
 */
static void free_all_cpu_cached_iovas(unsigned int cpu)
{
	int i;

	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		struct dmar_domain *domain;
		int did;

		if (!iommu)
			continue;

		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
			domain = get_iommu_domain(iommu, (u16)did);

			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
				continue;

			free_cpu_cached_iovas(cpu, &domain->iovad);
		}
	}
}

/* Per-CPU teardown hook: drop the dead CPU's cached IOVAs. Returns 0. */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	return 0;
}

/* Turn translation off on every IOMMU, active or not. */
static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}

/*
 * Shutdown hook: disable the protected memory regions and translation
 * on all IOMMUs. Does nothing when the IOMMU is disabled.
 */
void intel_iommu_shutdown(void)
{
4659 struct dmar_drhd_unit *drhd; 4660 struct intel_iommu *iommu = NULL; 4661 4662 if (no_iommu || dmar_disabled) 4663 return; 4664 4665 down_write(&dmar_global_lock); 4666 4667 /* Disable PMRs explicitly here. */ 4668 for_each_iommu(iommu, drhd) 4669 iommu_disable_protect_mem_regions(iommu); 4670 4671 /* Make sure the IOMMUs are switched off */ 4672 intel_disable_iommus(); 4673 4674 up_write(&dmar_global_lock); 4675 } 4676 4677 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4678 { 4679 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4680 4681 return container_of(iommu_dev, struct intel_iommu, iommu); 4682 } 4683 4684 static ssize_t intel_iommu_show_version(struct device *dev, 4685 struct device_attribute *attr, 4686 char *buf) 4687 { 4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4689 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4690 return sprintf(buf, "%d:%d\n", 4691 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4692 } 4693 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4694 4695 static ssize_t intel_iommu_show_address(struct device *dev, 4696 struct device_attribute *attr, 4697 char *buf) 4698 { 4699 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4700 return sprintf(buf, "%llx\n", iommu->reg_phys); 4701 } 4702 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4703 4704 static ssize_t intel_iommu_show_cap(struct device *dev, 4705 struct device_attribute *attr, 4706 char *buf) 4707 { 4708 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4709 return sprintf(buf, "%llx\n", iommu->cap); 4710 } 4711 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4712 4713 static ssize_t intel_iommu_show_ecap(struct device *dev, 4714 struct device_attribute *attr, 4715 char *buf) 4716 { 4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4718 return sprintf(buf, "%llx\n", iommu->ecap); 4719 } 4720 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4721 
4722 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4723 struct device_attribute *attr, 4724 char *buf) 4725 { 4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4727 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4728 } 4729 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4730 4731 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4732 struct device_attribute *attr, 4733 char *buf) 4734 { 4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4736 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4737 cap_ndoms(iommu->cap))); 4738 } 4739 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4740 4741 static struct attribute *intel_iommu_attrs[] = { 4742 &dev_attr_version.attr, 4743 &dev_attr_address.attr, 4744 &dev_attr_cap.attr, 4745 &dev_attr_ecap.attr, 4746 &dev_attr_domains_supported.attr, 4747 &dev_attr_domains_used.attr, 4748 NULL, 4749 }; 4750 4751 static struct attribute_group intel_iommu_group = { 4752 .name = "intel-iommu", 4753 .attrs = intel_iommu_attrs, 4754 }; 4755 4756 const struct attribute_group *intel_iommu_groups[] = { 4757 &intel_iommu_group, 4758 NULL, 4759 }; 4760 4761 static inline bool has_external_pci(void) 4762 { 4763 struct pci_dev *pdev = NULL; 4764 4765 for_each_pci_dev(pdev) 4766 if (pdev->external_facing) 4767 return true; 4768 4769 return false; 4770 } 4771 4772 static int __init platform_optin_force_iommu(void) 4773 { 4774 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4775 return 0; 4776 4777 if (no_iommu || dmar_disabled) 4778 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4779 4780 /* 4781 * If Intel-IOMMU is disabled by default, we will apply identity 4782 * map for all devices except those marked as being untrusted. 
4783 */ 4784 if (dmar_disabled) 4785 iommu_set_default_passthrough(false); 4786 4787 dmar_disabled = 0; 4788 no_iommu = 0; 4789 4790 return 1; 4791 } 4792 4793 static int __init probe_acpi_namespace_devices(void) 4794 { 4795 struct dmar_drhd_unit *drhd; 4796 /* To avoid a -Wunused-but-set-variable warning. */ 4797 struct intel_iommu *iommu __maybe_unused; 4798 struct device *dev; 4799 int i, ret = 0; 4800 4801 for_each_active_iommu(iommu, drhd) { 4802 for_each_active_dev_scope(drhd->devices, 4803 drhd->devices_cnt, i, dev) { 4804 struct acpi_device_physical_node *pn; 4805 struct iommu_group *group; 4806 struct acpi_device *adev; 4807 4808 if (dev->bus != &acpi_bus_type) 4809 continue; 4810 4811 adev = to_acpi_device(dev); 4812 mutex_lock(&adev->physical_node_lock); 4813 list_for_each_entry(pn, 4814 &adev->physical_node_list, node) { 4815 group = iommu_group_get(pn->dev); 4816 if (group) { 4817 iommu_group_put(group); 4818 continue; 4819 } 4820 4821 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4822 ret = iommu_probe_device(pn->dev); 4823 if (ret) 4824 break; 4825 } 4826 mutex_unlock(&adev->physical_node_lock); 4827 4828 if (ret) 4829 return ret; 4830 } 4831 } 4832 4833 return 0; 4834 } 4835 4836 int __init intel_iommu_init(void) 4837 { 4838 int ret = -ENODEV; 4839 struct dmar_drhd_unit *drhd; 4840 struct intel_iommu *iommu; 4841 4842 /* 4843 * Intel IOMMU is required for a TXT/tboot launch or platform 4844 * opt in, so enforce that. 
4845 */ 4846 force_on = tboot_force_iommu() || platform_optin_force_iommu(); 4847 4848 if (iommu_init_mempool()) { 4849 if (force_on) 4850 panic("tboot: Failed to initialize iommu memory\n"); 4851 return -ENOMEM; 4852 } 4853 4854 down_write(&dmar_global_lock); 4855 if (dmar_table_init()) { 4856 if (force_on) 4857 panic("tboot: Failed to initialize DMAR table\n"); 4858 goto out_free_dmar; 4859 } 4860 4861 if (dmar_dev_scope_init() < 0) { 4862 if (force_on) 4863 panic("tboot: Failed to initialize DMAR device scope\n"); 4864 goto out_free_dmar; 4865 } 4866 4867 up_write(&dmar_global_lock); 4868 4869 /* 4870 * The bus notifier takes the dmar_global_lock, so lockdep will 4871 * complain later when we register it under the lock. 4872 */ 4873 dmar_register_bus_notifier(); 4874 4875 down_write(&dmar_global_lock); 4876 4877 if (!no_iommu) 4878 intel_iommu_debugfs_init(); 4879 4880 if (no_iommu || dmar_disabled) { 4881 /* 4882 * We exit the function here to ensure IOMMU's remapping and 4883 * mempool aren't setup, which means that the IOMMU's PMRs 4884 * won't be disabled via the call to init_dmars(). So disable 4885 * it explicitly here. The PMRs were setup by tboot prior to 4886 * calling SENTER, but the kernel is expected to reset/tear 4887 * down the PMRs. 
4888 */ 4889 if (intel_iommu_tboot_noforce) { 4890 for_each_iommu(iommu, drhd) 4891 iommu_disable_protect_mem_regions(iommu); 4892 } 4893 4894 /* 4895 * Make sure the IOMMUs are switched off, even when we 4896 * boot into a kexec kernel and the previous kernel left 4897 * them enabled 4898 */ 4899 intel_disable_iommus(); 4900 goto out_free_dmar; 4901 } 4902 4903 if (list_empty(&dmar_rmrr_units)) 4904 pr_info("No RMRR found\n"); 4905 4906 if (list_empty(&dmar_atsr_units)) 4907 pr_info("No ATSR found\n"); 4908 4909 if (dmar_init_reserved_ranges()) { 4910 if (force_on) 4911 panic("tboot: Failed to reserve iommu ranges\n"); 4912 goto out_free_reserved_range; 4913 } 4914 4915 if (dmar_map_gfx) 4916 intel_iommu_gfx_mapped = 1; 4917 4918 init_no_remapping_devices(); 4919 4920 ret = init_dmars(); 4921 if (ret) { 4922 if (force_on) 4923 panic("tboot: Failed to initialize DMARs\n"); 4924 pr_err("Initialization failed\n"); 4925 goto out_free_reserved_range; 4926 } 4927 up_write(&dmar_global_lock); 4928 4929 init_iommu_pm_ops(); 4930 4931 down_read(&dmar_global_lock); 4932 for_each_active_iommu(iommu, drhd) { 4933 iommu_device_sysfs_add(&iommu->iommu, NULL, 4934 intel_iommu_groups, 4935 "%s", iommu->name); 4936 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4937 iommu_device_register(&iommu->iommu); 4938 } 4939 up_read(&dmar_global_lock); 4940 4941 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4942 if (si_domain && !hw_pass_through) 4943 register_memory_notifier(&intel_iommu_memory_nb); 4944 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4945 intel_iommu_cpu_dead); 4946 4947 down_read(&dmar_global_lock); 4948 if (probe_acpi_namespace_devices()) 4949 pr_warn("ACPI name space devices didn't probe correctly\n"); 4950 4951 /* Finally, we enable the DMA remapping hardware. 
*/ 4952 for_each_iommu(iommu, drhd) { 4953 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4954 iommu_enable_translation(iommu); 4955 4956 iommu_disable_protect_mem_regions(iommu); 4957 } 4958 up_read(&dmar_global_lock); 4959 4960 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4961 4962 intel_iommu_enabled = 1; 4963 4964 return 0; 4965 4966 out_free_reserved_range: 4967 put_iova_domain(&reserved_iova_list); 4968 out_free_dmar: 4969 intel_iommu_free_dmars(); 4970 up_write(&dmar_global_lock); 4971 iommu_exit_mempool(); 4972 return ret; 4973 } 4974 4975 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4976 { 4977 struct intel_iommu *iommu = opaque; 4978 4979 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 4980 return 0; 4981 } 4982 4983 /* 4984 * NB - intel-iommu lacks any sort of reference counting for the users of 4985 * dependent devices. If multiple endpoints have intersecting dependent 4986 * devices, unbinding the driver from any one of them will possibly leave 4987 * the others unable to operate. 
4988 */ 4989 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 4990 { 4991 if (!iommu || !dev || !dev_is_pci(dev)) 4992 return; 4993 4994 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 4995 } 4996 4997 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4998 { 4999 struct dmar_domain *domain; 5000 struct intel_iommu *iommu; 5001 unsigned long flags; 5002 5003 assert_spin_locked(&device_domain_lock); 5004 5005 if (WARN_ON(!info)) 5006 return; 5007 5008 iommu = info->iommu; 5009 domain = info->domain; 5010 5011 if (info->dev) { 5012 if (dev_is_pci(info->dev) && sm_supported(iommu)) 5013 intel_pasid_tear_down_entry(iommu, info->dev, 5014 PASID_RID2PASID, false); 5015 5016 iommu_disable_dev_iotlb(info); 5017 if (!dev_is_real_dma_subdevice(info->dev)) 5018 domain_context_clear(iommu, info->dev); 5019 intel_pasid_free_table(info->dev); 5020 } 5021 5022 unlink_domain_info(info); 5023 5024 spin_lock_irqsave(&iommu->lock, flags); 5025 domain_detach_iommu(domain, iommu); 5026 spin_unlock_irqrestore(&iommu->lock, flags); 5027 5028 free_devinfo_mem(info); 5029 } 5030 5031 static void dmar_remove_one_dev_info(struct device *dev) 5032 { 5033 struct device_domain_info *info; 5034 unsigned long flags; 5035 5036 spin_lock_irqsave(&device_domain_lock, flags); 5037 info = get_domain_info(dev); 5038 if (info) 5039 __dmar_remove_one_dev_info(info); 5040 spin_unlock_irqrestore(&device_domain_lock, flags); 5041 } 5042 5043 static int md_domain_init(struct dmar_domain *domain, int guest_width) 5044 { 5045 int adjust_width; 5046 5047 /* calculate AGAW */ 5048 domain->gaw = guest_width; 5049 adjust_width = guestwidth_to_adjustwidth(guest_width); 5050 domain->agaw = width_to_agaw(adjust_width); 5051 5052 domain->iommu_coherency = 0; 5053 domain->iommu_snooping = 0; 5054 domain->iommu_superpage = 0; 5055 domain->max_addr = 0; 5056 5057 /* always allocate the top pgd */ 5058 domain->pgd = (struct dma_pte 
*)alloc_pgtable_page(domain->nid); 5059 if (!domain->pgd) 5060 return -ENOMEM; 5061 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 5062 return 0; 5063 } 5064 5065 static void intel_init_iova_domain(struct dmar_domain *dmar_domain) 5066 { 5067 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); 5068 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); 5069 5070 if (!intel_iommu_strict && 5071 init_iova_flush_queue(&dmar_domain->iovad, 5072 iommu_flush_iova, iova_entry_free)) 5073 pr_info("iova flush queue initialization failed\n"); 5074 } 5075 5076 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 5077 { 5078 struct dmar_domain *dmar_domain; 5079 struct iommu_domain *domain; 5080 5081 switch (type) { 5082 case IOMMU_DOMAIN_DMA: 5083 case IOMMU_DOMAIN_UNMANAGED: 5084 dmar_domain = alloc_domain(0); 5085 if (!dmar_domain) { 5086 pr_err("Can't allocate dmar_domain\n"); 5087 return NULL; 5088 } 5089 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 5090 pr_err("Domain initialization failed\n"); 5091 domain_exit(dmar_domain); 5092 return NULL; 5093 } 5094 5095 if (type == IOMMU_DOMAIN_DMA) 5096 intel_init_iova_domain(dmar_domain); 5097 5098 domain_update_iommu_cap(dmar_domain); 5099 5100 domain = &dmar_domain->domain; 5101 domain->geometry.aperture_start = 0; 5102 domain->geometry.aperture_end = 5103 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 5104 domain->geometry.force_aperture = true; 5105 5106 return domain; 5107 case IOMMU_DOMAIN_IDENTITY: 5108 return &si_domain->domain; 5109 default: 5110 return NULL; 5111 } 5112 5113 return NULL; 5114 } 5115 5116 static void intel_iommu_domain_free(struct iommu_domain *domain) 5117 { 5118 if (domain != &si_domain->domain) 5119 domain_exit(to_dmar_domain(domain)); 5120 } 5121 5122 /* 5123 * Check whether a @domain could be attached to the @dev through the 5124 * aux-domain attach/detach APIs. 
5125 */ 5126 static inline bool 5127 is_aux_domain(struct device *dev, struct iommu_domain *domain) 5128 { 5129 struct device_domain_info *info = get_domain_info(dev); 5130 5131 return info && info->auxd_enabled && 5132 domain->type == IOMMU_DOMAIN_UNMANAGED; 5133 } 5134 5135 static void auxiliary_link_device(struct dmar_domain *domain, 5136 struct device *dev) 5137 { 5138 struct device_domain_info *info = get_domain_info(dev); 5139 5140 assert_spin_locked(&device_domain_lock); 5141 if (WARN_ON(!info)) 5142 return; 5143 5144 domain->auxd_refcnt++; 5145 list_add(&domain->auxd, &info->auxiliary_domains); 5146 } 5147 5148 static void auxiliary_unlink_device(struct dmar_domain *domain, 5149 struct device *dev) 5150 { 5151 struct device_domain_info *info = get_domain_info(dev); 5152 5153 assert_spin_locked(&device_domain_lock); 5154 if (WARN_ON(!info)) 5155 return; 5156 5157 list_del(&domain->auxd); 5158 domain->auxd_refcnt--; 5159 5160 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5161 ioasid_free(domain->default_pasid); 5162 } 5163 5164 static int aux_domain_add_dev(struct dmar_domain *domain, 5165 struct device *dev) 5166 { 5167 int ret; 5168 unsigned long flags; 5169 struct intel_iommu *iommu; 5170 5171 iommu = device_to_iommu(dev, NULL, NULL); 5172 if (!iommu) 5173 return -ENODEV; 5174 5175 if (domain->default_pasid <= 0) { 5176 int pasid; 5177 5178 /* No private data needed for the default pasid */ 5179 pasid = ioasid_alloc(NULL, PASID_MIN, 5180 pci_max_pasids(to_pci_dev(dev)) - 1, 5181 NULL); 5182 if (pasid == INVALID_IOASID) { 5183 pr_err("Can't allocate default pasid\n"); 5184 return -ENODEV; 5185 } 5186 domain->default_pasid = pasid; 5187 } 5188 5189 spin_lock_irqsave(&device_domain_lock, flags); 5190 /* 5191 * iommu->lock must be held to attach domain to iommu and setup the 5192 * pasid entry for second level translation. 
5193 */ 5194 spin_lock(&iommu->lock); 5195 ret = domain_attach_iommu(domain, iommu); 5196 if (ret) 5197 goto attach_failed; 5198 5199 /* Setup the PASID entry for mediated devices: */ 5200 if (domain_use_first_level(domain)) 5201 ret = domain_setup_first_level(iommu, domain, dev, 5202 domain->default_pasid); 5203 else 5204 ret = intel_pasid_setup_second_level(iommu, domain, dev, 5205 domain->default_pasid); 5206 if (ret) 5207 goto table_failed; 5208 spin_unlock(&iommu->lock); 5209 5210 auxiliary_link_device(domain, dev); 5211 5212 spin_unlock_irqrestore(&device_domain_lock, flags); 5213 5214 return 0; 5215 5216 table_failed: 5217 domain_detach_iommu(domain, iommu); 5218 attach_failed: 5219 spin_unlock(&iommu->lock); 5220 spin_unlock_irqrestore(&device_domain_lock, flags); 5221 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5222 ioasid_free(domain->default_pasid); 5223 5224 return ret; 5225 } 5226 5227 static void aux_domain_remove_dev(struct dmar_domain *domain, 5228 struct device *dev) 5229 { 5230 struct device_domain_info *info; 5231 struct intel_iommu *iommu; 5232 unsigned long flags; 5233 5234 if (!is_aux_domain(dev, &domain->domain)) 5235 return; 5236 5237 spin_lock_irqsave(&device_domain_lock, flags); 5238 info = get_domain_info(dev); 5239 iommu = info->iommu; 5240 5241 auxiliary_unlink_device(domain, dev); 5242 5243 spin_lock(&iommu->lock); 5244 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); 5245 domain_detach_iommu(domain, iommu); 5246 spin_unlock(&iommu->lock); 5247 5248 spin_unlock_irqrestore(&device_domain_lock, flags); 5249 } 5250 5251 static int prepare_domain_attach_device(struct iommu_domain *domain, 5252 struct device *dev) 5253 { 5254 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5255 struct intel_iommu *iommu; 5256 int addr_width; 5257 5258 iommu = device_to_iommu(dev, NULL, NULL); 5259 if (!iommu) 5260 return -ENODEV; 5261 5262 /* check if this iommu agaw is sufficient for max mapped address */ 5263 
addr_width = agaw_to_width(iommu->agaw); 5264 if (addr_width > cap_mgaw(iommu->cap)) 5265 addr_width = cap_mgaw(iommu->cap); 5266 5267 if (dmar_domain->max_addr > (1LL << addr_width)) { 5268 dev_err(dev, "%s: iommu width (%d) is not " 5269 "sufficient for the mapped address (%llx)\n", 5270 __func__, addr_width, dmar_domain->max_addr); 5271 return -EFAULT; 5272 } 5273 dmar_domain->gaw = addr_width; 5274 5275 /* 5276 * Knock out extra levels of page tables if necessary 5277 */ 5278 while (iommu->agaw < dmar_domain->agaw) { 5279 struct dma_pte *pte; 5280 5281 pte = dmar_domain->pgd; 5282 if (dma_pte_present(pte)) { 5283 dmar_domain->pgd = (struct dma_pte *) 5284 phys_to_virt(dma_pte_addr(pte)); 5285 free_pgtable_page(pte); 5286 } 5287 dmar_domain->agaw--; 5288 } 5289 5290 return 0; 5291 } 5292 5293 static int intel_iommu_attach_device(struct iommu_domain *domain, 5294 struct device *dev) 5295 { 5296 int ret; 5297 5298 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 5299 device_is_rmrr_locked(dev)) { 5300 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 5301 return -EPERM; 5302 } 5303 5304 if (is_aux_domain(dev, domain)) 5305 return -EPERM; 5306 5307 /* normally dev is not mapped */ 5308 if (unlikely(domain_context_mapped(dev))) { 5309 struct dmar_domain *old_domain; 5310 5311 old_domain = find_domain(dev); 5312 if (old_domain) 5313 dmar_remove_one_dev_info(dev); 5314 } 5315 5316 ret = prepare_domain_attach_device(domain, dev); 5317 if (ret) 5318 return ret; 5319 5320 return domain_add_dev_info(to_dmar_domain(domain), dev); 5321 } 5322 5323 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 5324 struct device *dev) 5325 { 5326 int ret; 5327 5328 if (!is_aux_domain(dev, domain)) 5329 return -EPERM; 5330 5331 ret = prepare_domain_attach_device(domain, dev); 5332 if (ret) 5333 return ret; 5334 5335 return aux_domain_add_dev(to_dmar_domain(domain), dev); 5336 } 5337 5338 static void intel_iommu_detach_device(struct iommu_domain *domain, 5339 struct device *dev) 5340 { 5341 dmar_remove_one_dev_info(dev); 5342 } 5343 5344 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 5345 struct device *dev) 5346 { 5347 aux_domain_remove_dev(to_dmar_domain(domain), dev); 5348 } 5349 5350 /* 5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 5352 * VT-d granularity. Invalidation is typically included in the unmap operation 5353 * as a result of DMA or VFIO unmap. However, for assigned devices guest 5354 * owns the first level page tables. Invalidations of translation caches in the 5355 * guest are trapped and passed down to the host. 5356 * 5357 * vIOMMU in the guest will only expose first level page tables, therefore 5358 * we do not support IOTLB granularity for request without PASID (second level). 
5359 * 5360 * For example, to find the VT-d granularity encoding for IOTLB 5361 * type and page selective granularity within PASID: 5362 * X: indexed by iommu cache type 5363 * Y: indexed by enum iommu_inv_granularity 5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 5365 */ 5366 5367 static const int 5368 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 5369 /* 5370 * PASID based IOTLB invalidation: PASID selective (per PASID), 5371 * page selective (address granularity) 5372 */ 5373 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 5374 /* PASID based dev TLBs */ 5375 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 5376 /* PASID cache */ 5377 {-EINVAL, -EINVAL, -EINVAL} 5378 }; 5379 5380 static inline int to_vtd_granularity(int type, int granu) 5381 { 5382 return inv_type_granu_table[type][granu]; 5383 } 5384 5385 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 5386 { 5387 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 5388 5389 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 5390 * IOMMU cache invalidate API passes granu_size in bytes, and number of 5391 * granu size in contiguous memory. 
5392 */ 5393 return order_base_2(nr_pages); 5394 } 5395 5396 #ifdef CONFIG_INTEL_IOMMU_SVM 5397 static int 5398 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5399 struct iommu_cache_invalidate_info *inv_info) 5400 { 5401 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5402 struct device_domain_info *info; 5403 struct intel_iommu *iommu; 5404 unsigned long flags; 5405 int cache_type; 5406 u8 bus, devfn; 5407 u16 did, sid; 5408 int ret = 0; 5409 u64 size = 0; 5410 5411 if (!inv_info || !dmar_domain || 5412 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) 5413 return -EINVAL; 5414 5415 if (!dev || !dev_is_pci(dev)) 5416 return -ENODEV; 5417 5418 iommu = device_to_iommu(dev, &bus, &devfn); 5419 if (!iommu) 5420 return -ENODEV; 5421 5422 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5423 return -EINVAL; 5424 5425 spin_lock_irqsave(&device_domain_lock, flags); 5426 spin_lock(&iommu->lock); 5427 info = get_domain_info(dev); 5428 if (!info) { 5429 ret = -EINVAL; 5430 goto out_unlock; 5431 } 5432 did = dmar_domain->iommu_did[iommu->seq_id]; 5433 sid = PCI_DEVID(bus, devfn); 5434 5435 /* Size is only valid in address selective invalidation */ 5436 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5437 size = to_vtd_size(inv_info->addr_info.granule_size, 5438 inv_info->addr_info.nb_granules); 5439 5440 for_each_set_bit(cache_type, 5441 (unsigned long *)&inv_info->cache, 5442 IOMMU_CACHE_INV_TYPE_NR) { 5443 int granu = 0; 5444 u64 pasid = 0; 5445 u64 addr = 0; 5446 5447 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5448 if (granu == -EINVAL) { 5449 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5450 cache_type, inv_info->granularity); 5451 break; 5452 } 5453 5454 /* 5455 * PASID is stored in different locations based on the 5456 * granularity. 
5457 */ 5458 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5459 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5460 pasid = inv_info->pasid_info.pasid; 5461 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5462 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5463 pasid = inv_info->addr_info.pasid; 5464 5465 switch (BIT(cache_type)) { 5466 case IOMMU_CACHE_INV_TYPE_IOTLB: 5467 /* HW will ignore LSB bits based on address mask */ 5468 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5469 size && 5470 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5471 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5472 inv_info->addr_info.addr, size); 5473 } 5474 5475 /* 5476 * If granu is PASID-selective, address is ignored. 5477 * We use npages = -1 to indicate that. 5478 */ 5479 qi_flush_piotlb(iommu, did, pasid, 5480 mm_to_dma_pfn(inv_info->addr_info.addr), 5481 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5482 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5483 5484 if (!info->ats_enabled) 5485 break; 5486 /* 5487 * Always flush device IOTLB if ATS is enabled. vIOMMU 5488 * in the guest may assume IOTLB flush is inclusive, 5489 * which is more efficient. 5490 */ 5491 fallthrough; 5492 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5493 /* 5494 * PASID based device TLB invalidation does not support 5495 * IOMMU_INV_GRANU_PASID granularity but only supports 5496 * IOMMU_INV_GRANU_ADDR. 5497 * The equivalent of that is we set the size to be the 5498 * entire range of 64 bit. User only provides PASID info 5499 * without address info. So we set addr to 0. 
5500 */ 5501 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5502 size = 64 - VTD_PAGE_SHIFT; 5503 addr = 0; 5504 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5505 addr = inv_info->addr_info.addr; 5506 } 5507 5508 if (info->ats_enabled) 5509 qi_flush_dev_iotlb_pasid(iommu, sid, 5510 info->pfsid, pasid, 5511 info->ats_qdep, addr, 5512 size); 5513 else 5514 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5515 break; 5516 default: 5517 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5518 cache_type); 5519 ret = -EINVAL; 5520 } 5521 } 5522 out_unlock: 5523 spin_unlock(&iommu->lock); 5524 spin_unlock_irqrestore(&device_domain_lock, flags); 5525 5526 return ret; 5527 } 5528 #endif 5529 5530 static int intel_iommu_map(struct iommu_domain *domain, 5531 unsigned long iova, phys_addr_t hpa, 5532 size_t size, int iommu_prot, gfp_t gfp) 5533 { 5534 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5535 u64 max_addr; 5536 int prot = 0; 5537 int ret; 5538 5539 if (iommu_prot & IOMMU_READ) 5540 prot |= DMA_PTE_READ; 5541 if (iommu_prot & IOMMU_WRITE) 5542 prot |= DMA_PTE_WRITE; 5543 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5544 prot |= DMA_PTE_SNP; 5545 5546 max_addr = iova + size; 5547 if (dmar_domain->max_addr < max_addr) { 5548 u64 end; 5549 5550 /* check if minimum agaw is sufficient for mapped address */ 5551 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5552 if (end < max_addr) { 5553 pr_err("%s: iommu width (%d) is not " 5554 "sufficient for the mapped address (%llx)\n", 5555 __func__, dmar_domain->gaw, max_addr); 5556 return -EFAULT; 5557 } 5558 dmar_domain->max_addr = max_addr; 5559 } 5560 /* Round up size to next multiple of PAGE_SIZE, if it and 5561 the low bits of hpa would take us onto the next page */ 5562 size = aligned_nrpages(hpa, size); 5563 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5564 hpa >> VTD_PAGE_SHIFT, size, prot); 5565 return ret; 5566 } 5567 5568 
static size_t intel_iommu_unmap(struct iommu_domain *domain, 5569 unsigned long iova, size_t size, 5570 struct iommu_iotlb_gather *gather) 5571 { 5572 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5573 struct page *freelist = NULL; 5574 unsigned long start_pfn, last_pfn; 5575 unsigned int npages; 5576 int iommu_id, level = 0; 5577 5578 /* Cope with horrid API which requires us to unmap more than the 5579 size argument if it happens to be a large-page mapping. */ 5580 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5581 5582 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5583 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5584 5585 start_pfn = iova >> VTD_PAGE_SHIFT; 5586 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5587 5588 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); 5589 5590 npages = last_pfn - start_pfn + 1; 5591 5592 for_each_domain_iommu(iommu_id, dmar_domain) 5593 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5594 start_pfn, npages, !freelist, 0); 5595 5596 dma_free_pagelist(freelist); 5597 5598 if (dmar_domain->max_addr == iova + size) 5599 dmar_domain->max_addr = iova; 5600 5601 return size; 5602 } 5603 5604 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5605 dma_addr_t iova) 5606 { 5607 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5608 struct dma_pte *pte; 5609 int level = 0; 5610 u64 phys = 0; 5611 5612 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5613 if (pte && dma_pte_present(pte)) 5614 phys = dma_pte_addr(pte) + 5615 (iova & (BIT_MASK(level_to_offset_bits(level) + 5616 VTD_PAGE_SHIFT) - 1)); 5617 5618 return phys; 5619 } 5620 5621 static inline bool scalable_mode_support(void) 5622 { 5623 struct dmar_drhd_unit *drhd; 5624 struct intel_iommu *iommu; 5625 bool ret = true; 5626 5627 rcu_read_lock(); 5628 for_each_active_iommu(iommu, drhd) { 5629 if (!sm_supported(iommu)) { 5630 ret = false; 5631 break; 5632 } 5633 } 
5634 rcu_read_unlock(); 5635 5636 return ret; 5637 } 5638 5639 static inline bool iommu_pasid_support(void) 5640 { 5641 struct dmar_drhd_unit *drhd; 5642 struct intel_iommu *iommu; 5643 bool ret = true; 5644 5645 rcu_read_lock(); 5646 for_each_active_iommu(iommu, drhd) { 5647 if (!pasid_supported(iommu)) { 5648 ret = false; 5649 break; 5650 } 5651 } 5652 rcu_read_unlock(); 5653 5654 return ret; 5655 } 5656 5657 static inline bool nested_mode_support(void) 5658 { 5659 struct dmar_drhd_unit *drhd; 5660 struct intel_iommu *iommu; 5661 bool ret = true; 5662 5663 rcu_read_lock(); 5664 for_each_active_iommu(iommu, drhd) { 5665 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5666 ret = false; 5667 break; 5668 } 5669 } 5670 rcu_read_unlock(); 5671 5672 return ret; 5673 } 5674 5675 static bool intel_iommu_capable(enum iommu_cap cap) 5676 { 5677 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5678 return domain_update_iommu_snooping(NULL) == 1; 5679 if (cap == IOMMU_CAP_INTR_REMAP) 5680 return irq_remapping_enabled == 1; 5681 5682 return false; 5683 } 5684 5685 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5686 { 5687 struct intel_iommu *iommu; 5688 5689 iommu = device_to_iommu(dev, NULL, NULL); 5690 if (!iommu) 5691 return ERR_PTR(-ENODEV); 5692 5693 if (translation_pre_enabled(iommu)) 5694 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5695 5696 return &iommu->iommu; 5697 } 5698 5699 static void intel_iommu_release_device(struct device *dev) 5700 { 5701 struct intel_iommu *iommu; 5702 5703 iommu = device_to_iommu(dev, NULL, NULL); 5704 if (!iommu) 5705 return; 5706 5707 dmar_remove_one_dev_info(dev); 5708 5709 set_dma_ops(dev, NULL); 5710 } 5711 5712 static void intel_iommu_probe_finalize(struct device *dev) 5713 { 5714 struct iommu_domain *domain; 5715 5716 domain = iommu_get_domain_for_dev(dev); 5717 if (device_needs_bounce(dev)) 5718 set_dma_ops(dev, &bounce_dma_ops); 5719 else if (domain && domain->type == IOMMU_DOMAIN_DMA) 5720 
set_dma_ops(dev, &intel_dma_ops); 5721 else 5722 set_dma_ops(dev, NULL); 5723 } 5724 5725 static void intel_iommu_get_resv_regions(struct device *device, 5726 struct list_head *head) 5727 { 5728 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5729 struct iommu_resv_region *reg; 5730 struct dmar_rmrr_unit *rmrr; 5731 struct device *i_dev; 5732 int i; 5733 5734 down_read(&dmar_global_lock); 5735 for_each_rmrr_units(rmrr) { 5736 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5737 i, i_dev) { 5738 struct iommu_resv_region *resv; 5739 enum iommu_resv_type type; 5740 size_t length; 5741 5742 if (i_dev != device && 5743 !is_downstream_to_pci_bridge(device, i_dev)) 5744 continue; 5745 5746 length = rmrr->end_address - rmrr->base_address + 1; 5747 5748 type = device_rmrr_is_relaxable(device) ? 5749 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5750 5751 resv = iommu_alloc_resv_region(rmrr->base_address, 5752 length, prot, type); 5753 if (!resv) 5754 break; 5755 5756 list_add_tail(&resv->list, head); 5757 } 5758 } 5759 up_read(&dmar_global_lock); 5760 5761 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5762 if (dev_is_pci(device)) { 5763 struct pci_dev *pdev = to_pci_dev(device); 5764 5765 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5766 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5767 IOMMU_RESV_DIRECT_RELAXABLE); 5768 if (reg) 5769 list_add_tail(®->list, head); 5770 } 5771 } 5772 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5773 5774 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5775 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5776 0, IOMMU_RESV_MSI); 5777 if (!reg) 5778 return; 5779 list_add_tail(®->list, head); 5780 } 5781 5782 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5783 { 5784 struct device_domain_info *info; 5785 struct context_entry *context; 5786 struct dmar_domain *domain; 5787 unsigned long flags; 5788 u64 ctx_lo; 5789 int ret; 5790 5791 domain = find_domain(dev); 5792 if (!domain) 5793 return -EINVAL; 5794 5795 
spin_lock_irqsave(&device_domain_lock, flags); 5796 spin_lock(&iommu->lock); 5797 5798 ret = -EINVAL; 5799 info = get_domain_info(dev); 5800 if (!info || !info->pasid_supported) 5801 goto out; 5802 5803 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5804 if (WARN_ON(!context)) 5805 goto out; 5806 5807 ctx_lo = context[0].lo; 5808 5809 if (!(ctx_lo & CONTEXT_PASIDE)) { 5810 ctx_lo |= CONTEXT_PASIDE; 5811 context[0].lo = ctx_lo; 5812 wmb(); 5813 iommu->flush.flush_context(iommu, 5814 domain->iommu_did[iommu->seq_id], 5815 PCI_DEVID(info->bus, info->devfn), 5816 DMA_CCMD_MASK_NOBIT, 5817 DMA_CCMD_DEVICE_INVL); 5818 } 5819 5820 /* Enable PASID support in the device, if it wasn't already */ 5821 if (!info->pasid_enabled) 5822 iommu_enable_dev_iotlb(info); 5823 5824 ret = 0; 5825 5826 out: 5827 spin_unlock(&iommu->lock); 5828 spin_unlock_irqrestore(&device_domain_lock, flags); 5829 5830 return ret; 5831 } 5832 5833 static void intel_iommu_apply_resv_region(struct device *dev, 5834 struct iommu_domain *domain, 5835 struct iommu_resv_region *region) 5836 { 5837 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5838 unsigned long start, end; 5839 5840 start = IOVA_PFN(region->start); 5841 end = IOVA_PFN(region->start + region->length - 1); 5842 5843 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); 5844 } 5845 5846 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5847 { 5848 if (dev_is_pci(dev)) 5849 return pci_device_group(dev); 5850 return generic_device_group(dev); 5851 } 5852 5853 static int intel_iommu_enable_auxd(struct device *dev) 5854 { 5855 struct device_domain_info *info; 5856 struct intel_iommu *iommu; 5857 unsigned long flags; 5858 int ret; 5859 5860 iommu = device_to_iommu(dev, NULL, NULL); 5861 if (!iommu || dmar_disabled) 5862 return -EINVAL; 5863 5864 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5865 return -EINVAL; 5866 5867 ret = intel_iommu_enable_pasid(iommu, dev); 5868 if (ret) 5869 
return -ENODEV; 5870 5871 spin_lock_irqsave(&device_domain_lock, flags); 5872 info = get_domain_info(dev); 5873 info->auxd_enabled = 1; 5874 spin_unlock_irqrestore(&device_domain_lock, flags); 5875 5876 return 0; 5877 } 5878 5879 static int intel_iommu_disable_auxd(struct device *dev) 5880 { 5881 struct device_domain_info *info; 5882 unsigned long flags; 5883 5884 spin_lock_irqsave(&device_domain_lock, flags); 5885 info = get_domain_info(dev); 5886 if (!WARN_ON(!info)) 5887 info->auxd_enabled = 0; 5888 spin_unlock_irqrestore(&device_domain_lock, flags); 5889 5890 return 0; 5891 } 5892 5893 /* 5894 * A PCI express designated vendor specific extended capability is defined 5895 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5896 * for system software and tools to detect endpoint devices supporting the 5897 * Intel scalable IO virtualization without host driver dependency. 5898 * 5899 * Returns the address of the matching extended capability structure within 5900 * the device's PCI configuration space or 0 if the device does not support 5901 * it. 
5902 */ 5903 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5904 { 5905 int pos; 5906 u16 vendor, id; 5907 5908 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5909 while (pos) { 5910 pci_read_config_word(pdev, pos + 4, &vendor); 5911 pci_read_config_word(pdev, pos + 8, &id); 5912 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5913 return pos; 5914 5915 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5916 } 5917 5918 return 0; 5919 } 5920 5921 static bool 5922 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5923 { 5924 if (feat == IOMMU_DEV_FEAT_AUX) { 5925 int ret; 5926 5927 if (!dev_is_pci(dev) || dmar_disabled || 5928 !scalable_mode_support() || !iommu_pasid_support()) 5929 return false; 5930 5931 ret = pci_pasid_features(to_pci_dev(dev)); 5932 if (ret < 0) 5933 return false; 5934 5935 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5936 } 5937 5938 if (feat == IOMMU_DEV_FEAT_SVA) { 5939 struct device_domain_info *info = get_domain_info(dev); 5940 5941 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5942 info->pasid_supported && info->pri_supported && 5943 info->ats_supported; 5944 } 5945 5946 return false; 5947 } 5948 5949 static int 5950 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5951 { 5952 if (feat == IOMMU_DEV_FEAT_AUX) 5953 return intel_iommu_enable_auxd(dev); 5954 5955 if (feat == IOMMU_DEV_FEAT_SVA) { 5956 struct device_domain_info *info = get_domain_info(dev); 5957 5958 if (!info) 5959 return -EINVAL; 5960 5961 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5962 return 0; 5963 } 5964 5965 return -ENODEV; 5966 } 5967 5968 static int 5969 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5970 { 5971 if (feat == IOMMU_DEV_FEAT_AUX) 5972 return intel_iommu_disable_auxd(dev); 5973 5974 return -ENODEV; 5975 } 5976 5977 static bool 5978 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5979 { 5980 struct 
device_domain_info *info = get_domain_info(dev); 5981 5982 if (feat == IOMMU_DEV_FEAT_AUX) 5983 return scalable_mode_support() && info && info->auxd_enabled; 5984 5985 return false; 5986 } 5987 5988 static int 5989 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5990 { 5991 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5992 5993 return dmar_domain->default_pasid > 0 ? 5994 dmar_domain->default_pasid : -EINVAL; 5995 } 5996 5997 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5998 struct device *dev) 5999 { 6000 return attach_deferred(dev); 6001 } 6002 6003 static int 6004 intel_iommu_domain_set_attr(struct iommu_domain *domain, 6005 enum iommu_attr attr, void *data) 6006 { 6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 6008 unsigned long flags; 6009 int ret = 0; 6010 6011 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 6012 return -EINVAL; 6013 6014 switch (attr) { 6015 case DOMAIN_ATTR_NESTING: 6016 spin_lock_irqsave(&device_domain_lock, flags); 6017 if (nested_mode_support() && 6018 list_empty(&dmar_domain->devices)) { 6019 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 6020 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 6021 } else { 6022 ret = -ENODEV; 6023 } 6024 spin_unlock_irqrestore(&device_domain_lock, flags); 6025 break; 6026 default: 6027 ret = -EINVAL; 6028 break; 6029 } 6030 6031 return ret; 6032 } 6033 6034 /* 6035 * Check that the device does not live on an external facing PCI port that is 6036 * marked as untrusted. Such devices should not be able to apply quirks and 6037 * thus not be able to bypass the IOMMU restrictions. 
6038 */ 6039 static bool risky_device(struct pci_dev *pdev) 6040 { 6041 if (pdev->untrusted) { 6042 pci_info(pdev, 6043 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 6044 pdev->vendor, pdev->device); 6045 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 6046 return true; 6047 } 6048 return false; 6049 } 6050 6051 const struct iommu_ops intel_iommu_ops = { 6052 .capable = intel_iommu_capable, 6053 .domain_alloc = intel_iommu_domain_alloc, 6054 .domain_free = intel_iommu_domain_free, 6055 .domain_set_attr = intel_iommu_domain_set_attr, 6056 .attach_dev = intel_iommu_attach_device, 6057 .detach_dev = intel_iommu_detach_device, 6058 .aux_attach_dev = intel_iommu_aux_attach_device, 6059 .aux_detach_dev = intel_iommu_aux_detach_device, 6060 .aux_get_pasid = intel_iommu_aux_get_pasid, 6061 .map = intel_iommu_map, 6062 .unmap = intel_iommu_unmap, 6063 .iova_to_phys = intel_iommu_iova_to_phys, 6064 .probe_device = intel_iommu_probe_device, 6065 .probe_finalize = intel_iommu_probe_finalize, 6066 .release_device = intel_iommu_release_device, 6067 .get_resv_regions = intel_iommu_get_resv_regions, 6068 .put_resv_regions = generic_iommu_put_resv_regions, 6069 .apply_resv_region = intel_iommu_apply_resv_region, 6070 .device_group = intel_iommu_device_group, 6071 .dev_has_feat = intel_iommu_dev_has_feat, 6072 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 6073 .dev_enable_feat = intel_iommu_dev_enable_feat, 6074 .dev_disable_feat = intel_iommu_dev_disable_feat, 6075 .is_attach_deferred = intel_iommu_is_attach_deferred, 6076 .def_domain_type = device_def_domain_type, 6077 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 6078 #ifdef CONFIG_INTEL_IOMMU_SVM 6079 .cache_invalidate = intel_iommu_sva_invalidate, 6080 .sva_bind_gpasid = intel_svm_bind_gpasid, 6081 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 6082 .sva_bind = intel_svm_bind, 6083 .sva_unbind = intel_svm_unbind, 6084 .sva_get_pasid = intel_svm_get_pasid, 6085 .page_response = 
intel_svm_page_response, 6086 #endif 6087 }; 6088 6089 static void quirk_iommu_igfx(struct pci_dev *dev) 6090 { 6091 if (risky_device(dev)) 6092 return; 6093 6094 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 6095 dmar_map_gfx = 0; 6096 } 6097 6098 /* G4x/GM45 integrated gfx dmar support is totally busted. */ 6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 6106 6107 /* Broadwell igfx malfunctions with dmar */ 6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 
0x162B, quirk_iommu_igfx); 6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 6129 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 6130 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 6131 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 6132 6133 static void quirk_iommu_rwbf(struct pci_dev *dev) 6134 { 6135 if (risky_device(dev)) 6136 return; 6137 6138 /* 6139 * Mobile 4 Series Chipset neglects to set RWBF capability, 6140 * but needs it. Same seems to hold for the desktop versions. 
6141 */ 6142 pci_info(dev, "Forcing write-buffer flush capability\n"); 6143 rwbf_quirk = 1; 6144 } 6145 6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); 6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); 6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); 6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); 6150 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); 6151 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); 6152 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); 6153 6154 #define GGC 0x52 6155 #define GGC_MEMORY_SIZE_MASK (0xf << 8) 6156 #define GGC_MEMORY_SIZE_NONE (0x0 << 8) 6157 #define GGC_MEMORY_SIZE_1M (0x1 << 8) 6158 #define GGC_MEMORY_SIZE_2M (0x3 << 8) 6159 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) 6160 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) 6161 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) 6162 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) 6163 6164 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) 6165 { 6166 unsigned short ggc; 6167 6168 if (risky_device(dev)) 6169 return; 6170 6171 if (pci_read_config_word(dev, GGC, &ggc)) 6172 return; 6173 6174 if (!(ggc & GGC_MEMORY_VT_ENABLED)) { 6175 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); 6176 dmar_map_gfx = 0; 6177 } else if (dmar_map_gfx) { 6178 /* we have to ensure the gfx device is idle before we flush */ 6179 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); 6180 intel_iommu_strict = 1; 6181 } 6182 } 6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); 6184 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); 6185 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); 6186 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); 6187 6188 static 
void quirk_igfx_skip_te_disable(struct pci_dev *dev) 6189 { 6190 unsigned short ver; 6191 6192 if (!IS_GFX_DEVICE(dev)) 6193 return; 6194 6195 ver = (dev->device >> 8) & 0xff; 6196 if (ver != 0x45 && ver != 0x46 && ver != 0x4c && 6197 ver != 0x4e && ver != 0x8a && ver != 0x98 && 6198 ver != 0x9a) 6199 return; 6200 6201 if (risky_device(dev)) 6202 return; 6203 6204 pci_info(dev, "Skip IOMMU disabling for graphics\n"); 6205 iommu_skip_te_disable = 1; 6206 } 6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); 6208 6209 /* On Tylersburg chipsets, some BIOSes have been known to enable the 6210 ISOCH DMAR unit for the Azalia sound device, but not give it any 6211 TLB entries, which causes it to deadlock. Check for that. We do 6212 this in a function called from init_dmars(), instead of in a PCI 6213 quirk, because we don't want to print the obnoxious "BIOS broken" 6214 message if VT-d is actually disabled. 6215 */ 6216 static void __init check_tylersburg_isoch(void) 6217 { 6218 struct pci_dev *pdev; 6219 uint32_t vtisochctrl; 6220 6221 /* If there's no Azalia in the system anyway, forget it. */ 6222 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); 6223 if (!pdev) 6224 return; 6225 6226 if (risky_device(pdev)) { 6227 pci_dev_put(pdev); 6228 return; 6229 } 6230 6231 pci_dev_put(pdev); 6232 6233 /* System Management Registers. Might be hidden, in which case 6234 we can't do the sanity check. But that's OK, because the 6235 known-broken BIOSes _don't_ actually hide it, so far. */ 6236 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); 6237 if (!pdev) 6238 return; 6239 6240 if (risky_device(pdev)) { 6241 pci_dev_put(pdev); 6242 return; 6243 } 6244 6245 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { 6246 pci_dev_put(pdev); 6247 return; 6248 } 6249 6250 pci_dev_put(pdev); 6251 6252 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. 
*/ 6253 if (vtisochctrl & 1) 6254 return; 6255 6256 /* Drop all bits other than the number of TLB entries */ 6257 vtisochctrl &= 0x1c; 6258 6259 /* If we have the recommended number of TLB entries (16), fine. */ 6260 if (vtisochctrl == 0x10) 6261 return; 6262 6263 /* Zero TLB entries? You get to ride the short bus to school. */ 6264 if (!vtisochctrl) { 6265 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" 6266 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 6267 dmi_get_system_info(DMI_BIOS_VENDOR), 6268 dmi_get_system_info(DMI_BIOS_VERSION), 6269 dmi_get_system_info(DMI_PRODUCT_VERSION)); 6270 iommu_identity_mapping |= IDENTMAP_AZALIA; 6271 return; 6272 } 6273 6274 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", 6275 vtisochctrl); 6276 } 6277