// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <linux/swiotlb.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "../irq_remapping.h"
#include "intel-pasid.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
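/*
 * AGAW ("adjusted guest address width") encodes the page-table depth used
 * by the helpers below: an AGAW of 1/2/3 corresponds to a 3/4/5-level
 * table covering a 39/48/57-bit address width (30 + agaw * LEVEL_STRIDE
 * bits, capped at MAX_AGAW_WIDTH).
 */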
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}
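/*
 * In legacy mode the low half of a root entry points at a single context
 * table covering all 256 devfns. In scalable mode context entries are
 * twice as large, so the low half (LCTP) covers devfn 0-127 and the high
 * half (UCTP) covers devfn 128-255, which is how iommu_context_addr()
 * indexes them.
 */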
/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
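/*
 * Summary of the legacy context-entry fields touched by the helpers above:
 * lo bit 0 is Present, bit 1 is Fault Processing Disable (cleared by
 * context_set_fault_enable()), bits 3:2 are the Translation Type and
 * bits 63:12 hold the second-level page-table address; hi bits 2:0 are
 * the Address Width and bits 23:8 the Domain ID.
 */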
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_no_bounce;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev->archdata.iommu;
	if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
		     info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)
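/*
 * Every device_domain_info in the system is linked into
 * device_domain_list; additions, removals and walks of that list are
 * serialized by device_domain_lock.
 */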
/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else if (!strncmp(str, "nobounce", 8)) {
			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
			intel_no_bounce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
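/*
 * Example: booting with "intel_iommu=on,sm_on" enables DMA remapping and
 * opts in to scalable mode; options are comma separated and unrecognized
 * tokens are silently skipped by the parser above.
 */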
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
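/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH of 57, iommu_calculate_agaw()
 * starts the search at agaw 3 (5-level tables); if SAGAW only advertises
 * 4-level support, it falls back to agaw 2, i.e. a 48-bit address width.
 */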
/* This function only returns one iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
}

struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}

static bool attach_deferred(struct device *dev)
{
	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
}
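/*
 * Devices marked DEFER_DEVICE_DOMAIN_INFO are not attached to a domain at
 * probe time; their domain is set up lazily on first use of the DMA API.
 */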
/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
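/*
 * device_to_iommu() maps a device to the DMAR unit that translates it and
 * reports the bus/devfn to use with that unit: PCI VFs are looked up via
 * their PF (VFs are not listed in scope tables), ACPI devices via their
 * companion, and devices behind a listed bridge match the bridge's entry.
 */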
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);

	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
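/*
 * pfn_to_dma_pte() walks the domain's page table down to *target_level,
 * allocating any missing intermediate tables on the way; a *target_level
 * of 0 means "stop at the first superpage or non-present entry" and the
 * level actually reached is passed back through *target_level.
 */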
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}

/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}

static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}

static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
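/*
 * Page-table pages collected for freeing are chained through
 * page->freelist; dma_free_pagelist() releases the whole chain once the
 * IOTLB flush guarantees hardware can no longer walk them.
 */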
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}

static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}

static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}

static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}

static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						 DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}

/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(domain))
			domain_flush_piotlb(iommu, domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}

static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
		ioasid_unregister_allocator(&iommu->pasid_allocator);

#endif
}
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	static int first_level_support = -1;

	if (likely(first_level_support != -1))
		return first_level_support;

	first_level_support = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
			first_level_support = 0;
			break;
		}
	}
	rcu_read_unlock();

	return first_level_support;
}

static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}

/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pci_err(pdev, "Reserve iova for %pR failed\n", r);
				return -ENODEV;
			}
		}
	}
	return 0;
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}

static void domain_exit(struct dmar_domain *domain)
{

	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	if (domain->domain.type == IOMMU_DOMAIN_DMA)
		put_iova_domain(&domain->iovad);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}

/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}

/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}
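/*
 * Worked example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
 * directory entry): a pasid_table with max_pasid = 0x10000 (64K PASIDs)
 * gives max_pde = 0x400, so context_get_sm_pds() returns 10 - 7 = 3 and
 * the PDTS field encodes a PASID directory of 2^(3 + 7) = 1024 entries.
 */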
/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)

static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}

static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}

/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
2MiB/1GiB/etc. Lower bits set in either 2219 of them will mean we have to use smaller pages. So just 2220 merge them and check both at once. */ 2221 pfnmerge = iov_pfn | phy_pfn; 2222 2223 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2224 pages >>= VTD_STRIDE_SHIFT; 2225 if (!pages) 2226 break; 2227 pfnmerge >>= VTD_STRIDE_SHIFT; 2228 level++; 2229 support--; 2230 } 2231 return level; 2232 } 2233 2234 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2235 struct scatterlist *sg, unsigned long phys_pfn, 2236 unsigned long nr_pages, int prot) 2237 { 2238 struct dma_pte *first_pte = NULL, *pte = NULL; 2239 phys_addr_t uninitialized_var(pteval); 2240 unsigned long sg_res = 0; 2241 unsigned int largepage_lvl = 0; 2242 unsigned long lvl_pages = 0; 2243 u64 attr; 2244 2245 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2246 2247 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2248 return -EINVAL; 2249 2250 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2251 if (domain_use_first_level(domain)) 2252 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2253 2254 if (!sg) { 2255 sg_res = nr_pages; 2256 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2257 } 2258 2259 while (nr_pages > 0) { 2260 uint64_t tmp; 2261 2262 if (!sg_res) { 2263 unsigned int pgoff = sg->offset & ~PAGE_MASK; 2264 2265 sg_res = aligned_nrpages(sg->offset, sg->length); 2266 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; 2267 sg->dma_length = sg->length; 2268 pteval = (sg_phys(sg) - pgoff) | attr; 2269 phys_pfn = pteval >> VTD_PAGE_SHIFT; 2270 } 2271 2272 if (!pte) { 2273 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); 2274 2275 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2276 if (!pte) 2277 return -ENOMEM; 2278 /* It is large page*/ 2279 if (largepage_lvl > 1) { 2280 unsigned long nr_superpages, end_pfn; 2281 2282 pteval |= DMA_PTE_LARGE_PAGE; 2283 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2284 2285 nr_superpages = sg_res / lvl_pages; 2286 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2287 2288 /* 2289 * Ensure that old small page tables are 2290 * removed to make room for superpage(s). 2291 * We're adding new large pages, so make sure 2292 * we don't remove their parent tables. 2293 */ 2294 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2295 largepage_lvl + 1); 2296 } else { 2297 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2298 } 2299 2300 } 2301 /* We don't need lock here, nobody else 2302 * touches the iova range 2303 */ 2304 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2305 if (tmp) { 2306 static int dumps = 5; 2307 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2308 iov_pfn, tmp, (unsigned long long)pteval); 2309 if (dumps) { 2310 dumps--; 2311 debug_dma_dump_mappings(NULL); 2312 } 2313 WARN_ON(1); 2314 } 2315 2316 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2317 2318 BUG_ON(nr_pages < lvl_pages); 2319 BUG_ON(sg_res < lvl_pages); 2320 2321 nr_pages -= lvl_pages; 2322 iov_pfn += lvl_pages; 2323 phys_pfn += lvl_pages; 2324 pteval += lvl_pages * VTD_PAGE_SIZE; 2325 sg_res -= lvl_pages; 2326 2327 /* If the next PTE would be the first in a new page, then we 2328 need to flush the cache on the entries we've just written. 2329 And then we'll need to recalculate 'pte', so clear it and 2330 let it get set again in the if (!pte) block above. 2331 2332 If we're done (!nr_pages) we need to flush the cache too. 
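   (domain_flush_cache() is only a clflush of the PTEs just written,
   and only when the domain contains a non-coherent IOMMU; on coherent
   hardware it is a no-op.)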
2333 2334 Also if we've been setting superpages, we may need to 2335 recalculate 'pte' and switch back to smaller pages for the 2336 end of the mapping, if the trailing size is not enough to 2337 use another superpage (i.e. sg_res < lvl_pages). */ 2338 pte++; 2339 if (!nr_pages || first_pte_in_page(pte) || 2340 (largepage_lvl > 1 && sg_res < lvl_pages)) { 2341 domain_flush_cache(domain, first_pte, 2342 (void *)pte - (void *)first_pte); 2343 pte = NULL; 2344 } 2345 2346 if (!sg_res && nr_pages) 2347 sg = sg_next(sg); 2348 } 2349 return 0; 2350 } 2351 2352 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2353 struct scatterlist *sg, unsigned long phys_pfn, 2354 unsigned long nr_pages, int prot) 2355 { 2356 int iommu_id, ret; 2357 struct intel_iommu *iommu; 2358 2359 /* Do the real mapping first */ 2360 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); 2361 if (ret) 2362 return ret; 2363 2364 for_each_domain_iommu(iommu_id, domain) { 2365 iommu = g_iommus[iommu_id]; 2366 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2367 } 2368 2369 return 0; 2370 } 2371 2372 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2373 struct scatterlist *sg, unsigned long nr_pages, 2374 int prot) 2375 { 2376 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); 2377 } 2378 2379 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2380 unsigned long phys_pfn, unsigned long nr_pages, 2381 int prot) 2382 { 2383 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); 2384 } 2385 2386 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2387 { 2388 unsigned long flags; 2389 struct context_entry *context; 2390 u16 did_old; 2391 2392 if (!iommu) 2393 return; 2394 2395 spin_lock_irqsave(&iommu->lock, flags); 2396 context = iommu_context_addr(iommu, bus, devfn, 0); 2397 if (!context) { 2398 spin_unlock_irqrestore(&iommu->lock, flags); 2399 return; 2400 } 2401 did_old = context_domain_id(context); 2402 context_clear_entry(context); 2403 __iommu_flush_cache(iommu, context, sizeof(*context)); 2404 spin_unlock_irqrestore(&iommu->lock, flags); 2405 iommu->flush.flush_context(iommu, 2406 did_old, 2407 (((u16)bus) << 8) | devfn, 2408 DMA_CCMD_MASK_NOBIT, 2409 DMA_CCMD_DEVICE_INVL); 2410 iommu->flush.flush_iotlb(iommu, 2411 did_old, 2412 0, 2413 0, 2414 DMA_TLB_DSI_FLUSH); 2415 } 2416 2417 static inline void unlink_domain_info(struct device_domain_info *info) 2418 { 2419 assert_spin_locked(&device_domain_lock); 2420 list_del(&info->link); 2421 list_del(&info->global); 2422 if (info->dev) 2423 info->dev->archdata.iommu = NULL; 2424 } 2425 2426 static void domain_remove_dev_info(struct dmar_domain *domain) 2427 { 2428 struct device_domain_info *info, *tmp; 2429 unsigned long flags; 2430 2431 spin_lock_irqsave(&device_domain_lock, flags); 2432 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2433 __dmar_remove_one_dev_info(info); 2434 spin_unlock_irqrestore(&device_domain_lock, flags); 2435 } 2436 2437 struct dmar_domain *find_domain(struct device *dev) 2438 { 2439 struct device_domain_info *info; 2440 2441 if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) 2442 return NULL; 2443 2444 /* No lock here, assumes no domain exit in normal case */ 2445 info = get_domain_info(dev); 2446 if (likely(info)) 2447 return info->domain; 2448 2449 return NULL; 2450 } 2451 2452 static void do_deferred_attach(struct device *dev) 2453 { 2454 struct 
iommu_domain *domain; 2455 2456 dev->archdata.iommu = NULL; 2457 domain = iommu_get_domain_for_dev(dev); 2458 if (domain) 2459 intel_iommu_attach_device(domain, dev); 2460 } 2461 2462 static inline struct device_domain_info * 2463 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2464 { 2465 struct device_domain_info *info; 2466 2467 list_for_each_entry(info, &device_domain_list, global) 2468 if (info->segment == segment && info->bus == bus && 2469 info->devfn == devfn) 2470 return info; 2471 2472 return NULL; 2473 } 2474 2475 static int domain_setup_first_level(struct intel_iommu *iommu, 2476 struct dmar_domain *domain, 2477 struct device *dev, 2478 int pasid) 2479 { 2480 int flags = PASID_FLAG_SUPERVISOR_MODE; 2481 struct dma_pte *pgd = domain->pgd; 2482 int agaw, level; 2483 2484 /* 2485 * Skip top levels of page tables for iommu which has 2486 * less agaw than default. Unnecessary for PT mode. 2487 */ 2488 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2489 pgd = phys_to_virt(dma_pte_addr(pgd)); 2490 if (!dma_pte_present(pgd)) 2491 return -ENOMEM; 2492 } 2493 2494 level = agaw_to_level(agaw); 2495 if (level != 4 && level != 5) 2496 return -EINVAL; 2497 2498 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; 2499 2500 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2501 domain->iommu_did[iommu->seq_id], 2502 flags); 2503 } 2504 2505 static bool dev_is_real_dma_subdevice(struct device *dev) 2506 { 2507 return dev && dev_is_pci(dev) && 2508 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2509 } 2510 2511 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2512 int bus, int devfn, 2513 struct device *dev, 2514 struct dmar_domain *domain) 2515 { 2516 struct dmar_domain *found = NULL; 2517 struct device_domain_info *info; 2518 unsigned long flags; 2519 int ret; 2520 2521 info = alloc_devinfo_mem(); 2522 if (!info) 2523 return NULL; 2524 2525 if (!dev_is_real_dma_subdevice(dev)) { 2526 info->bus = bus; 2527 info->devfn = devfn; 2528 info->segment = iommu->segment; 2529 } else { 2530 struct pci_dev *pdev = to_pci_dev(dev); 2531 2532 info->bus = pdev->bus->number; 2533 info->devfn = pdev->devfn; 2534 info->segment = pci_domain_nr(pdev->bus); 2535 } 2536 2537 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2538 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2539 info->ats_qdep = 0; 2540 info->dev = dev; 2541 info->domain = domain; 2542 info->iommu = iommu; 2543 info->pasid_table = NULL; 2544 info->auxd_enabled = 0; 2545 INIT_LIST_HEAD(&info->auxiliary_domains); 2546 2547 if (dev && dev_is_pci(dev)) { 2548 struct pci_dev *pdev = to_pci_dev(info->dev); 2549 2550 if (ecap_dev_iotlb_support(iommu->ecap) && 2551 pci_ats_supported(pdev) && 2552 dmar_find_matched_atsr_unit(pdev)) 2553 info->ats_supported = 1; 2554 2555 if (sm_supported(iommu)) { 2556 if (pasid_supported(iommu)) { 2557 int features = pci_pasid_features(pdev); 2558 if (features >= 0) 2559 info->pasid_supported = features | 1; 2560 } 2561 2562 if (info->ats_supported && ecap_prs(iommu->ecap) && 2563 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI)) 2564 info->pri_supported = 1; 2565 } 2566 } 2567 2568 spin_lock_irqsave(&device_domain_lock, flags); 2569 if (dev) 2570 found = find_domain(dev); 2571 2572 if (!found) { 2573 struct device_domain_info *info2; 2574 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2575 info->devfn); 2576 if (info2) { 2577 found = info2->domain; 2578 info2->dev = dev; 2579 } 2580 } 
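	/*
	 * Another thread may have attached a domain to this device (or to
	 * an alias with the same segment/bus/devfn) while we were setting
	 * up @info; if so, discard the new info and return the existing
	 * domain instead.
	 */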
2581 2582 if (found) { 2583 spin_unlock_irqrestore(&device_domain_lock, flags); 2584 free_devinfo_mem(info); 2585 /* Caller must free the original domain */ 2586 return found; 2587 } 2588 2589 spin_lock(&iommu->lock); 2590 ret = domain_attach_iommu(domain, iommu); 2591 spin_unlock(&iommu->lock); 2592 2593 if (ret) { 2594 spin_unlock_irqrestore(&device_domain_lock, flags); 2595 free_devinfo_mem(info); 2596 return NULL; 2597 } 2598 2599 list_add(&info->link, &domain->devices); 2600 list_add(&info->global, &device_domain_list); 2601 if (dev) 2602 dev->archdata.iommu = info; 2603 spin_unlock_irqrestore(&device_domain_lock, flags); 2604 2605 /* PASID table is mandatory for a PCI device in scalable mode. */ 2606 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2607 ret = intel_pasid_alloc_table(dev); 2608 if (ret) { 2609 dev_err(dev, "PASID table allocation failed\n"); 2610 dmar_remove_one_dev_info(dev); 2611 return NULL; 2612 } 2613 2614 /* Setup the PASID entry for requests without PASID: */ 2615 spin_lock(&iommu->lock); 2616 if (hw_pass_through && domain_type_is_si(domain)) 2617 ret = intel_pasid_setup_pass_through(iommu, domain, 2618 dev, PASID_RID2PASID); 2619 else if (domain_use_first_level(domain)) 2620 ret = domain_setup_first_level(iommu, domain, dev, 2621 PASID_RID2PASID); 2622 else 2623 ret = intel_pasid_setup_second_level(iommu, domain, 2624 dev, PASID_RID2PASID); 2625 spin_unlock(&iommu->lock); 2626 if (ret) { 2627 dev_err(dev, "Setup RID2PASID failed\n"); 2628 dmar_remove_one_dev_info(dev); 2629 return NULL; 2630 } 2631 } 2632 2633 if (dev && domain_context_mapping(domain, dev)) { 2634 dev_err(dev, "Domain context map failed\n"); 2635 dmar_remove_one_dev_info(dev); 2636 return NULL; 2637 } 2638 2639 return domain; 2640 } 2641 2642 static int iommu_domain_identity_map(struct dmar_domain *domain, 2643 unsigned long first_vpfn, 2644 unsigned long last_vpfn) 2645 { 2646 /* 2647 * RMRR range might have overlap with physical memory range, 2648 * clear it first 2649 */ 2650 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2651 2652 return __domain_mapping(domain, first_vpfn, NULL, 2653 first_vpfn, last_vpfn - first_vpfn + 1, 2654 DMA_PTE_READ|DMA_PTE_WRITE); 2655 } 2656 2657 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2658 2659 static int __init si_domain_init(int hw) 2660 { 2661 struct dmar_rmrr_unit *rmrr; 2662 struct device *dev; 2663 int i, nid, ret; 2664 2665 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2666 if (!si_domain) 2667 return -EFAULT; 2668 2669 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2670 domain_exit(si_domain); 2671 return -EFAULT; 2672 } 2673 2674 if (hw) 2675 return 0; 2676 2677 for_each_online_node(nid) { 2678 unsigned long start_pfn, end_pfn; 2679 int i; 2680 2681 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2682 ret = iommu_domain_identity_map(si_domain, 2683 mm_to_dma_pfn(start_pfn), 2684 mm_to_dma_pfn(end_pfn)); 2685 if (ret) 2686 return ret; 2687 } 2688 } 2689 2690 /* 2691 * Identity map the RMRRs so that devices with RMRRs could also use 2692 * the si_domain. 
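 * The RMRR base/end values come from the ACPI table as byte addresses;
 * they are converted to MM page frames and then to VT-d page frames
 * with mm_to_dma_pfn() before being mapped 1:1 below.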
2693 */ 2694 for_each_rmrr_units(rmrr) { 2695 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2696 i, dev) { 2697 unsigned long long start = rmrr->base_address; 2698 unsigned long long end = rmrr->end_address; 2699 2700 if (WARN_ON(end < start || 2701 end >> agaw_to_width(si_domain->agaw))) 2702 continue; 2703 2704 ret = iommu_domain_identity_map(si_domain, 2705 mm_to_dma_pfn(start >> PAGE_SHIFT), 2706 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2707 if (ret) 2708 return ret; 2709 } 2710 } 2711 2712 return 0; 2713 } 2714 2715 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2716 { 2717 struct dmar_domain *ndomain; 2718 struct intel_iommu *iommu; 2719 u8 bus, devfn; 2720 2721 iommu = device_to_iommu(dev, &bus, &devfn); 2722 if (!iommu) 2723 return -ENODEV; 2724 2725 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2726 if (ndomain != domain) 2727 return -EBUSY; 2728 2729 return 0; 2730 } 2731 2732 static bool device_has_rmrr(struct device *dev) 2733 { 2734 struct dmar_rmrr_unit *rmrr; 2735 struct device *tmp; 2736 int i; 2737 2738 rcu_read_lock(); 2739 for_each_rmrr_units(rmrr) { 2740 /* 2741 * Return TRUE if this RMRR contains the device that 2742 * is passed in. 2743 */ 2744 for_each_active_dev_scope(rmrr->devices, 2745 rmrr->devices_cnt, i, tmp) 2746 if (tmp == dev || 2747 is_downstream_to_pci_bridge(dev, tmp)) { 2748 rcu_read_unlock(); 2749 return true; 2750 } 2751 } 2752 rcu_read_unlock(); 2753 return false; 2754 } 2755 2756 /** 2757 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2758 * is relaxable (ie. is allowed to be not enforced under some conditions) 2759 * @dev: device handle 2760 * 2761 * We assume that PCI USB devices with RMRRs have them largely 2762 * for historical reasons and that the RMRR space is not actively used post 2763 * boot. This exclusion may change if vendors begin to abuse it. 2764 * 2765 * The same exception is made for graphics devices, with the requirement that 2766 * any use of the RMRR regions will be torn down before assigning the device 2767 * to a guest. 2768 * 2769 * Return: true if the RMRR is relaxable, false otherwise 2770 */ 2771 static bool device_rmrr_is_relaxable(struct device *dev) 2772 { 2773 struct pci_dev *pdev; 2774 2775 if (!dev_is_pci(dev)) 2776 return false; 2777 2778 pdev = to_pci_dev(dev); 2779 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2780 return true; 2781 else 2782 return false; 2783 } 2784 2785 /* 2786 * There are a couple cases where we need to restrict the functionality of 2787 * devices associated with RMRRs. The first is when evaluating a device for 2788 * identity mapping because problems exist when devices are moved in and out 2789 * of domains and their respective RMRR information is lost. This means that 2790 * a device with associated RMRRs will never be in a "passthrough" domain. 2791 * The second is use of the device through the IOMMU API. This interface 2792 * expects to have full control of the IOVA space for the device. We cannot 2793 * satisfy both the requirement that RMRR access is maintained and have an 2794 * unencumbered IOVA space. We also have no ability to quiesce the device's 2795 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2796 * We therefore prevent devices associated with an RMRR from participating in 2797 * the IOMMU API, which eliminates them from device assignment. 2798 * 2799 * In both cases, devices which have relaxable RMRRs are not concerned by this 2800 * restriction. 
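Currently that means USB controllers and graphics devices, the only classes device_rmrr_is_relaxable() treats as relaxable.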
See device_rmrr_is_relaxable comment. 2801 */ 2802 static bool device_is_rmrr_locked(struct device *dev) 2803 { 2804 if (!device_has_rmrr(dev)) 2805 return false; 2806 2807 if (device_rmrr_is_relaxable(dev)) 2808 return false; 2809 2810 return true; 2811 } 2812 2813 /* 2814 * Return the required default domain type for a specific device. 2815 * 2816 * @dev: the device in query 2817 * @startup: true if this is during early boot 2818 * 2819 * Returns: 2820 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2821 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2822 * - 0: both identity and dynamic domains work for this device 2823 */ 2824 static int device_def_domain_type(struct device *dev) 2825 { 2826 if (dev_is_pci(dev)) { 2827 struct pci_dev *pdev = to_pci_dev(dev); 2828 2829 /* 2830 * Prevent any device marked as untrusted from getting 2831 * placed into the statically identity mapping domain. 2832 */ 2833 if (pdev->untrusted) 2834 return IOMMU_DOMAIN_DMA; 2835 2836 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2837 return IOMMU_DOMAIN_IDENTITY; 2838 2839 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2840 return IOMMU_DOMAIN_IDENTITY; 2841 } 2842 2843 return 0; 2844 } 2845 2846 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2847 { 2848 /* 2849 * Start from the sane iommu hardware state. 2850 * If the queued invalidation is already initialized by us 2851 * (for example, while enabling interrupt-remapping) then 2852 * we got the things already rolling from a sane state. 2853 */ 2854 if (!iommu->qi) { 2855 /* 2856 * Clear any previous faults. 2857 */ 2858 dmar_fault(-1, iommu); 2859 /* 2860 * Disable queued invalidation if supported and already enabled 2861 * before OS handover. 2862 */ 2863 dmar_disable_qi(iommu); 2864 } 2865 2866 if (dmar_enable_qi(iommu)) { 2867 /* 2868 * Queued Invalidate not enabled, use Register Based Invalidate 2869 */ 2870 iommu->flush.flush_context = __iommu_flush_context; 2871 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2872 pr_info("%s: Using Register based invalidation\n", 2873 iommu->name); 2874 } else { 2875 iommu->flush.flush_context = qi_flush_context; 2876 iommu->flush.flush_iotlb = qi_flush_iotlb; 2877 pr_info("%s: Using Queued invalidation\n", iommu->name); 2878 } 2879 } 2880 2881 static int copy_context_table(struct intel_iommu *iommu, 2882 struct root_entry *old_re, 2883 struct context_entry **tbl, 2884 int bus, bool ext) 2885 { 2886 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2887 struct context_entry *new_ce = NULL, ce; 2888 struct context_entry *old_ce = NULL; 2889 struct root_entry re; 2890 phys_addr_t old_ce_phys; 2891 2892 tbl_idx = ext ? bus * 2 : bus; 2893 memcpy(&re, old_re, sizeof(re)); 2894 2895 for (devfn = 0; devfn < 256; devfn++) { 2896 /* First calculate the correct index */ 2897 idx = (ext ? 
devfn * 2 : devfn) % 256; 2898 2899 if (idx == 0) { 2900 /* First save what we may have and clean up */ 2901 if (new_ce) { 2902 tbl[tbl_idx] = new_ce; 2903 __iommu_flush_cache(iommu, new_ce, 2904 VTD_PAGE_SIZE); 2905 pos = 1; 2906 } 2907 2908 if (old_ce) 2909 memunmap(old_ce); 2910 2911 ret = 0; 2912 if (devfn < 0x80) 2913 old_ce_phys = root_entry_lctp(&re); 2914 else 2915 old_ce_phys = root_entry_uctp(&re); 2916 2917 if (!old_ce_phys) { 2918 if (ext && devfn == 0) { 2919 /* No LCTP, try UCTP */ 2920 devfn = 0x7f; 2921 continue; 2922 } else { 2923 goto out; 2924 } 2925 } 2926 2927 ret = -ENOMEM; 2928 old_ce = memremap(old_ce_phys, PAGE_SIZE, 2929 MEMREMAP_WB); 2930 if (!old_ce) 2931 goto out; 2932 2933 new_ce = alloc_pgtable_page(iommu->node); 2934 if (!new_ce) 2935 goto out_unmap; 2936 2937 ret = 0; 2938 } 2939 2940 /* Now copy the context entry */ 2941 memcpy(&ce, old_ce + idx, sizeof(ce)); 2942 2943 if (!__context_present(&ce)) 2944 continue; 2945 2946 did = context_domain_id(&ce); 2947 if (did >= 0 && did < cap_ndoms(iommu->cap)) 2948 set_bit(did, iommu->domain_ids); 2949 2950 /* 2951 * We need a marker for copied context entries. This 2952 * marker needs to work for the old format as well as 2953 * for extended context entries. 2954 * 2955 * Bit 67 of the context entry is used. In the old 2956 * format this bit is available to software, in the 2957 * extended format it is the PGE bit, but PGE is ignored 2958 * by HW if PASIDs are disabled (and thus still 2959 * available). 2960 * 2961 * So disable PASIDs first and then mark the entry 2962 * copied. This means that we don't copy PASID 2963 * translations from the old kernel, but this is fine as 2964 * faults there are not fatal. 2965 */ 2966 context_clear_pasid_enable(&ce); 2967 context_set_copied(&ce); 2968 2969 new_ce[idx] = ce; 2970 } 2971 2972 tbl[tbl_idx + pos] = new_ce; 2973 2974 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 2975 2976 out_unmap: 2977 memunmap(old_ce); 2978 2979 out: 2980 return ret; 2981 } 2982 2983 static int copy_translation_tables(struct intel_iommu *iommu) 2984 { 2985 struct context_entry **ctxt_tbls; 2986 struct root_entry *old_rt; 2987 phys_addr_t old_rt_phys; 2988 int ctxt_table_entries; 2989 unsigned long flags; 2990 u64 rtaddr_reg; 2991 int bus, ret; 2992 bool new_ext, ext; 2993 2994 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 2995 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 2996 new_ext = !!ecap_ecs(iommu->ecap); 2997 2998 /* 2999 * The RTT bit can only be changed when translation is disabled, 3000 * but disabling translation means to open a window for data 3001 * corruption. So bail out and don't copy anything if we would 3002 * have to change the bit. 3003 */ 3004 if (new_ext != ext) 3005 return -EINVAL; 3006 3007 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3008 if (!old_rt_phys) 3009 return -EINVAL; 3010 3011 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3012 if (!old_rt) 3013 return -ENOMEM; 3014 3015 /* This is too big for the stack - allocate it from slab */ 3016 ctxt_table_entries = ext ? 
512 : 256; 3017 ret = -ENOMEM; 3018 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3019 if (!ctxt_tbls) 3020 goto out_unmap; 3021 3022 for (bus = 0; bus < 256; bus++) { 3023 ret = copy_context_table(iommu, &old_rt[bus], 3024 ctxt_tbls, bus, ext); 3025 if (ret) { 3026 pr_err("%s: Failed to copy context table for bus %d\n", 3027 iommu->name, bus); 3028 continue; 3029 } 3030 } 3031 3032 spin_lock_irqsave(&iommu->lock, flags); 3033 3034 /* Context tables are copied, now write them to the root_entry table */ 3035 for (bus = 0; bus < 256; bus++) { 3036 int idx = ext ? bus * 2 : bus; 3037 u64 val; 3038 3039 if (ctxt_tbls[idx]) { 3040 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3041 iommu->root_entry[bus].lo = val; 3042 } 3043 3044 if (!ext || !ctxt_tbls[idx + 1]) 3045 continue; 3046 3047 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3048 iommu->root_entry[bus].hi = val; 3049 } 3050 3051 spin_unlock_irqrestore(&iommu->lock, flags); 3052 3053 kfree(ctxt_tbls); 3054 3055 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3056 3057 ret = 0; 3058 3059 out_unmap: 3060 memunmap(old_rt); 3061 3062 return ret; 3063 } 3064 3065 #ifdef CONFIG_INTEL_IOMMU_SVM 3066 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3067 { 3068 struct intel_iommu *iommu = data; 3069 ioasid_t ioasid; 3070 3071 if (!iommu) 3072 return INVALID_IOASID; 3073 /* 3074 * VT-d virtual command interface always uses the full 20 bit 3075 * PASID range. Host can partition guest PASID range based on 3076 * policies but it is out of guest's control. 3077 */ 3078 if (min < PASID_MIN || max > intel_pasid_max_id) 3079 return INVALID_IOASID; 3080 3081 if (vcmd_alloc_pasid(iommu, &ioasid)) 3082 return INVALID_IOASID; 3083 3084 return ioasid; 3085 } 3086 3087 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3088 { 3089 struct intel_iommu *iommu = data; 3090 3091 if (!iommu) 3092 return; 3093 /* 3094 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3095 * We can only free the PASID when all the devices are unbound. 3096 */ 3097 if (ioasid_find(NULL, ioasid, NULL)) { 3098 pr_alert("Cannot free active IOASID %d\n", ioasid); 3099 return; 3100 } 3101 vcmd_free_pasid(iommu, ioasid); 3102 } 3103 3104 static void register_pasid_allocator(struct intel_iommu *iommu) 3105 { 3106 /* 3107 * If we are running in the host, no need for custom allocator 3108 * in that PASIDs are allocated from the host system-wide. 3109 */ 3110 if (!cap_caching_mode(iommu->cap)) 3111 return; 3112 3113 if (!sm_supported(iommu)) { 3114 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3115 return; 3116 } 3117 3118 /* 3119 * Register a custom PASID allocator if we are running in a guest, 3120 * guest PASID must be obtained via virtual command interface. 3121 * There can be multiple vIOMMUs in each guest but only one allocator 3122 * is active. All vIOMMU allocators will eventually be calling the same 3123 * host allocator. 3124 */ 3125 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) 3126 return; 3127 3128 pr_info("Register custom PASID allocator\n"); 3129 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3130 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3131 iommu->pasid_allocator.pdata = (void *)iommu; 3132 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3133 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3134 /* 3135 * Disable scalable mode on this IOMMU if there 3136 * is no custom allocator. 
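Note that clearing intel_iommu_sm below is global: sm_supported() tests that flag, so scalable mode ends up disabled on every IOMMU.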
Mixing SM capable vIOMMU 3137 * and non-SM vIOMMU are not supported. 3138 */ 3139 intel_iommu_sm = 0; 3140 } 3141 } 3142 #endif 3143 3144 static int __init init_dmars(void) 3145 { 3146 struct dmar_drhd_unit *drhd; 3147 struct intel_iommu *iommu; 3148 int ret; 3149 3150 /* 3151 * for each drhd 3152 * allocate root 3153 * initialize and program root entry to not present 3154 * endfor 3155 */ 3156 for_each_drhd_unit(drhd) { 3157 /* 3158 * lock not needed as this is only incremented in the single 3159 * threaded kernel __init code path all other access are read 3160 * only 3161 */ 3162 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3163 g_num_of_iommus++; 3164 continue; 3165 } 3166 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3167 } 3168 3169 /* Preallocate enough resources for IOMMU hot-addition */ 3170 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3171 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3172 3173 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3174 GFP_KERNEL); 3175 if (!g_iommus) { 3176 pr_err("Allocating global iommu array failed\n"); 3177 ret = -ENOMEM; 3178 goto error; 3179 } 3180 3181 for_each_iommu(iommu, drhd) { 3182 if (drhd->ignored) { 3183 iommu_disable_translation(iommu); 3184 continue; 3185 } 3186 3187 /* 3188 * Find the max pasid size of all IOMMU's in the system. 3189 * We need to ensure the system pasid table is no bigger 3190 * than the smallest supported. 3191 */ 3192 if (pasid_supported(iommu)) { 3193 u32 temp = 2 << ecap_pss(iommu->ecap); 3194 3195 intel_pasid_max_id = min_t(u32, temp, 3196 intel_pasid_max_id); 3197 } 3198 3199 g_iommus[iommu->seq_id] = iommu; 3200 3201 intel_iommu_init_qi(iommu); 3202 3203 ret = iommu_init_domains(iommu); 3204 if (ret) 3205 goto free_iommu; 3206 3207 init_translation_status(iommu); 3208 3209 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3210 iommu_disable_translation(iommu); 3211 clear_translation_pre_enabled(iommu); 3212 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3213 iommu->name); 3214 } 3215 3216 /* 3217 * TBD: 3218 * we could share the same root & context tables 3219 * among all IOMMU's. Need to Split it later. 3220 */ 3221 ret = iommu_alloc_root_entry(iommu); 3222 if (ret) 3223 goto free_iommu; 3224 3225 if (translation_pre_enabled(iommu)) { 3226 pr_info("Translation already enabled - trying to copy translation structures\n"); 3227 3228 ret = copy_translation_tables(iommu); 3229 if (ret) { 3230 /* 3231 * We found the IOMMU with translation 3232 * enabled - but failed to copy over the 3233 * old root-entry table. Try to proceed 3234 * by disabling translation now and 3235 * allocating a clean root-entry table. 3236 * This might cause DMAR faults, but 3237 * probably the dump will still succeed. 3238 */ 3239 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3240 iommu->name); 3241 iommu_disable_translation(iommu); 3242 clear_translation_pre_enabled(iommu); 3243 } else { 3244 pr_info("Copied translation tables from previous kernel for %s\n", 3245 iommu->name); 3246 } 3247 } 3248 3249 if (!ecap_pass_through(iommu->ecap)) 3250 hw_pass_through = 0; 3251 intel_svm_check(iommu); 3252 } 3253 3254 /* 3255 * Now that qi is enabled on all iommus, set the root entry and flush 3256 * caches. This is required on some Intel X58 chipsets, otherwise the 3257 * flush_context function will loop forever and the boot hangs. 
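 * After iommu_set_root_entry() below, a global context-cache
 * invalidation followed by a global IOTLB invalidation ensures no
 * stale translations survive the switch to the new root table.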
3258 */ 3259 for_each_active_iommu(iommu, drhd) { 3260 iommu_flush_write_buffer(iommu); 3261 #ifdef CONFIG_INTEL_IOMMU_SVM 3262 register_pasid_allocator(iommu); 3263 #endif 3264 iommu_set_root_entry(iommu); 3265 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3266 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3267 } 3268 3269 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3270 dmar_map_gfx = 0; 3271 #endif 3272 3273 if (!dmar_map_gfx) 3274 iommu_identity_mapping |= IDENTMAP_GFX; 3275 3276 check_tylersburg_isoch(); 3277 3278 ret = si_domain_init(hw_pass_through); 3279 if (ret) 3280 goto free_iommu; 3281 3282 /* 3283 * for each drhd 3284 * enable fault log 3285 * global invalidate context cache 3286 * global invalidate iotlb 3287 * enable translation 3288 */ 3289 for_each_iommu(iommu, drhd) { 3290 if (drhd->ignored) { 3291 /* 3292 * we always have to disable PMRs or DMA may fail on 3293 * this device 3294 */ 3295 if (force_on) 3296 iommu_disable_protect_mem_regions(iommu); 3297 continue; 3298 } 3299 3300 iommu_flush_write_buffer(iommu); 3301 3302 #ifdef CONFIG_INTEL_IOMMU_SVM 3303 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3304 /* 3305 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3306 * could cause possible lock race condition. 3307 */ 3308 up_write(&dmar_global_lock); 3309 ret = intel_svm_enable_prq(iommu); 3310 down_write(&dmar_global_lock); 3311 if (ret) 3312 goto free_iommu; 3313 } 3314 #endif 3315 ret = dmar_set_interrupt(iommu); 3316 if (ret) 3317 goto free_iommu; 3318 } 3319 3320 return 0; 3321 3322 free_iommu: 3323 for_each_active_iommu(iommu, drhd) { 3324 disable_dmar_iommu(iommu); 3325 free_dmar_iommu(iommu); 3326 } 3327 3328 kfree(g_iommus); 3329 3330 error: 3331 return ret; 3332 } 3333 3334 /* This takes a number of _MM_ pages, not VTD pages */ 3335 static unsigned long intel_alloc_iova(struct device *dev, 3336 struct dmar_domain *domain, 3337 unsigned long nrpages, uint64_t dma_mask) 3338 { 3339 unsigned long iova_pfn; 3340 3341 /* 3342 * Restrict dma_mask to the width that the iommu can handle. 3343 * First-level translation restricts the input-address to a 3344 * canonical address (i.e., address bits 63:N have the same 3345 * value as address bit [N-1], where N is 48-bits with 4-level 3346 * paging and 57-bits with 5-level paging). Hence, skip bit 3347 * [N-1]. 
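 * E.g. for a 48-bit first-level domain the mask is clamped to
 * DOMAIN_MAX_ADDR(47), so allocated IOVAs keep bit 47 clear and
 * remain canonical.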
3348 */ 3349 if (domain_use_first_level(domain)) 3350 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), 3351 dma_mask); 3352 else 3353 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), 3354 dma_mask); 3355 3356 /* Ensure we reserve the whole size-aligned region */ 3357 nrpages = __roundup_pow_of_two(nrpages); 3358 3359 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { 3360 /* 3361 * First try to allocate an io virtual address in 3362 * DMA_BIT_MASK(32) and if that fails then try allocating 3363 * from higher range 3364 */ 3365 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3366 IOVA_PFN(DMA_BIT_MASK(32)), false); 3367 if (iova_pfn) 3368 return iova_pfn; 3369 } 3370 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3371 IOVA_PFN(dma_mask), true); 3372 if (unlikely(!iova_pfn)) { 3373 dev_err_once(dev, "Allocating %ld-page iova failed\n", 3374 nrpages); 3375 return 0; 3376 } 3377 3378 return iova_pfn; 3379 } 3380 3381 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, 3382 size_t size, int dir, u64 dma_mask) 3383 { 3384 struct dmar_domain *domain; 3385 phys_addr_t start_paddr; 3386 unsigned long iova_pfn; 3387 int prot = 0; 3388 int ret; 3389 struct intel_iommu *iommu; 3390 unsigned long paddr_pfn = paddr >> PAGE_SHIFT; 3391 3392 BUG_ON(dir == DMA_NONE); 3393 3394 if (unlikely(attach_deferred(dev))) 3395 do_deferred_attach(dev); 3396 3397 domain = find_domain(dev); 3398 if (!domain) 3399 return DMA_MAPPING_ERROR; 3400 3401 iommu = domain_get_iommu(domain); 3402 size = aligned_nrpages(paddr, size); 3403 3404 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); 3405 if (!iova_pfn) 3406 goto error; 3407 3408 /* 3409 * Check if DMAR supports zero-length reads on write only 3410 * mappings.. 3411 */ 3412 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3413 !cap_zlr(iommu->cap)) 3414 prot |= DMA_PTE_READ; 3415 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3416 prot |= DMA_PTE_WRITE; 3417 /* 3418 * paddr - (paddr + size) might be partial page, we should map the whole 3419 * page. 
Note: if two part of one page are separately mapped, we 3420 * might have two guest_addr mapping to the same host paddr, but this 3421 * is not a big problem 3422 */ 3423 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3424 mm_to_dma_pfn(paddr_pfn), size, prot); 3425 if (ret) 3426 goto error; 3427 3428 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; 3429 start_paddr += paddr & ~PAGE_MASK; 3430 3431 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); 3432 3433 return start_paddr; 3434 3435 error: 3436 if (iova_pfn) 3437 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3438 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", 3439 size, (unsigned long long)paddr, dir); 3440 return DMA_MAPPING_ERROR; 3441 } 3442 3443 static dma_addr_t intel_map_page(struct device *dev, struct page *page, 3444 unsigned long offset, size_t size, 3445 enum dma_data_direction dir, 3446 unsigned long attrs) 3447 { 3448 return __intel_map_single(dev, page_to_phys(page) + offset, 3449 size, dir, *dev->dma_mask); 3450 } 3451 3452 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, 3453 size_t size, enum dma_data_direction dir, 3454 unsigned long attrs) 3455 { 3456 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); 3457 } 3458 3459 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) 3460 { 3461 struct dmar_domain *domain; 3462 unsigned long start_pfn, last_pfn; 3463 unsigned long nrpages; 3464 unsigned long iova_pfn; 3465 struct intel_iommu *iommu; 3466 struct page *freelist; 3467 struct pci_dev *pdev = NULL; 3468 3469 domain = find_domain(dev); 3470 BUG_ON(!domain); 3471 3472 iommu = domain_get_iommu(domain); 3473 3474 iova_pfn = IOVA_PFN(dev_addr); 3475 3476 nrpages = aligned_nrpages(dev_addr, size); 3477 start_pfn = mm_to_dma_pfn(iova_pfn); 3478 last_pfn = start_pfn + nrpages - 1; 3479 3480 if (dev_is_pci(dev)) 3481 pdev = to_pci_dev(dev); 3482 3483 freelist = domain_unmap(domain, start_pfn, last_pfn); 3484 if (intel_iommu_strict || (pdev && pdev->untrusted) || 3485 !has_iova_flush_queue(&domain->iovad)) { 3486 iommu_flush_iotlb_psi(iommu, domain, start_pfn, 3487 nrpages, !freelist, 0); 3488 /* free iova */ 3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3490 dma_free_pagelist(freelist); 3491 } else { 3492 queue_iova(&domain->iovad, iova_pfn, nrpages, 3493 (unsigned long)freelist); 3494 /* 3495 * queue up the release of the unmap to save the 1/6th of the 3496 * cpu used up by the iotlb flush operation... 
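 * The freelist of page-table pages returned by domain_unmap() is
 * stashed as the flush-queue entry's private data so it can be freed
 * once the batched IOTLB invalidation has actually run.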
3497 */ 3498 } 3499 3500 trace_unmap_single(dev, dev_addr, size); 3501 } 3502 3503 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, 3504 size_t size, enum dma_data_direction dir, 3505 unsigned long attrs) 3506 { 3507 intel_unmap(dev, dev_addr, size); 3508 } 3509 3510 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, 3511 size_t size, enum dma_data_direction dir, unsigned long attrs) 3512 { 3513 intel_unmap(dev, dev_addr, size); 3514 } 3515 3516 static void *intel_alloc_coherent(struct device *dev, size_t size, 3517 dma_addr_t *dma_handle, gfp_t flags, 3518 unsigned long attrs) 3519 { 3520 struct page *page = NULL; 3521 int order; 3522 3523 if (unlikely(attach_deferred(dev))) 3524 do_deferred_attach(dev); 3525 3526 size = PAGE_ALIGN(size); 3527 order = get_order(size); 3528 3529 if (gfpflags_allow_blocking(flags)) { 3530 unsigned int count = size >> PAGE_SHIFT; 3531 3532 page = dma_alloc_from_contiguous(dev, count, order, 3533 flags & __GFP_NOWARN); 3534 } 3535 3536 if (!page) 3537 page = alloc_pages(flags, order); 3538 if (!page) 3539 return NULL; 3540 memset(page_address(page), 0, size); 3541 3542 *dma_handle = __intel_map_single(dev, page_to_phys(page), size, 3543 DMA_BIDIRECTIONAL, 3544 dev->coherent_dma_mask); 3545 if (*dma_handle != DMA_MAPPING_ERROR) 3546 return page_address(page); 3547 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3548 __free_pages(page, order); 3549 3550 return NULL; 3551 } 3552 3553 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, 3554 dma_addr_t dma_handle, unsigned long attrs) 3555 { 3556 int order; 3557 struct page *page = virt_to_page(vaddr); 3558 3559 size = PAGE_ALIGN(size); 3560 order = get_order(size); 3561 3562 intel_unmap(dev, dma_handle, size); 3563 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3564 __free_pages(page, order); 3565 } 3566 3567 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, 3568 int nelems, enum dma_data_direction dir, 3569 unsigned long attrs) 3570 { 3571 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; 3572 unsigned long nrpages = 0; 3573 struct scatterlist *sg; 3574 int i; 3575 3576 for_each_sg(sglist, sg, nelems, i) { 3577 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); 3578 } 3579 3580 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3581 3582 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3583 } 3584 3585 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3586 enum dma_data_direction dir, unsigned long attrs) 3587 { 3588 int i; 3589 struct dmar_domain *domain; 3590 size_t size = 0; 3591 int prot = 0; 3592 unsigned long iova_pfn; 3593 int ret; 3594 struct scatterlist *sg; 3595 unsigned long start_vpfn; 3596 struct intel_iommu *iommu; 3597 3598 BUG_ON(dir == DMA_NONE); 3599 3600 if (unlikely(attach_deferred(dev))) 3601 do_deferred_attach(dev); 3602 3603 domain = find_domain(dev); 3604 if (!domain) 3605 return 0; 3606 3607 iommu = domain_get_iommu(domain); 3608 3609 for_each_sg(sglist, sg, nelems, i) 3610 size += aligned_nrpages(sg->offset, sg->length); 3611 3612 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), 3613 *dev->dma_mask); 3614 if (!iova_pfn) { 3615 sglist->dma_length = 0; 3616 return 0; 3617 } 3618 3619 /* 3620 * Check if DMAR supports zero-length reads on write only 3621 * mappings.. 
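 * If it does not (cap_zlr() clear), read permission is granted even on
 * write-only (DMA_FROM_DEVICE) mappings so that a zero-length read
 * issued by the device does not fault.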
3622 */ 3623 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3624 !cap_zlr(iommu->cap)) 3625 prot |= DMA_PTE_READ; 3626 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3627 prot |= DMA_PTE_WRITE; 3628 3629 start_vpfn = mm_to_dma_pfn(iova_pfn); 3630 3631 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); 3632 if (unlikely(ret)) { 3633 dma_pte_free_pagetable(domain, start_vpfn, 3634 start_vpfn + size - 1, 3635 agaw_to_level(domain->agaw) + 1); 3636 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3637 return 0; 3638 } 3639 3640 for_each_sg(sglist, sg, nelems, i) 3641 trace_map_sg(dev, i + 1, nelems, sg); 3642 3643 return nelems; 3644 } 3645 3646 static u64 intel_get_required_mask(struct device *dev) 3647 { 3648 return DMA_BIT_MASK(32); 3649 } 3650 3651 static const struct dma_map_ops intel_dma_ops = { 3652 .alloc = intel_alloc_coherent, 3653 .free = intel_free_coherent, 3654 .map_sg = intel_map_sg, 3655 .unmap_sg = intel_unmap_sg, 3656 .map_page = intel_map_page, 3657 .unmap_page = intel_unmap_page, 3658 .map_resource = intel_map_resource, 3659 .unmap_resource = intel_unmap_resource, 3660 .dma_supported = dma_direct_supported, 3661 .mmap = dma_common_mmap, 3662 .get_sgtable = dma_common_get_sgtable, 3663 .get_required_mask = intel_get_required_mask, 3664 }; 3665 3666 static void 3667 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size, 3668 enum dma_data_direction dir, enum dma_sync_target target) 3669 { 3670 struct dmar_domain *domain; 3671 phys_addr_t tlb_addr; 3672 3673 domain = find_domain(dev); 3674 if (WARN_ON(!domain)) 3675 return; 3676 3677 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr); 3678 if (is_swiotlb_buffer(tlb_addr)) 3679 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target); 3680 } 3681 3682 static dma_addr_t 3683 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, 3684 enum dma_data_direction dir, unsigned long attrs, 3685 u64 dma_mask) 3686 { 3687 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3688 struct dmar_domain *domain; 3689 struct intel_iommu *iommu; 3690 unsigned long iova_pfn; 3691 unsigned long nrpages; 3692 phys_addr_t tlb_addr; 3693 int prot = 0; 3694 int ret; 3695 3696 if (unlikely(attach_deferred(dev))) 3697 do_deferred_attach(dev); 3698 3699 domain = find_domain(dev); 3700 3701 if (WARN_ON(dir == DMA_NONE || !domain)) 3702 return DMA_MAPPING_ERROR; 3703 3704 iommu = domain_get_iommu(domain); 3705 if (WARN_ON(!iommu)) 3706 return DMA_MAPPING_ERROR; 3707 3708 nrpages = aligned_nrpages(0, size); 3709 iova_pfn = intel_alloc_iova(dev, domain, 3710 dma_to_mm_pfn(nrpages), dma_mask); 3711 if (!iova_pfn) 3712 return DMA_MAPPING_ERROR; 3713 3714 /* 3715 * Check if DMAR supports zero-length reads on write only 3716 * mappings.. 3717 */ 3718 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || 3719 !cap_zlr(iommu->cap)) 3720 prot |= DMA_PTE_READ; 3721 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3722 prot |= DMA_PTE_WRITE; 3723 3724 /* 3725 * If both the physical buffer start address and size are 3726 * page aligned, we don't need to use a bounce page. 3727 */ 3728 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { 3729 tlb_addr = swiotlb_tbl_map_single(dev, 3730 __phys_to_dma(dev, io_tlb_start), 3731 paddr, size, aligned_size, dir, attrs); 3732 if (tlb_addr == DMA_MAPPING_ERROR) { 3733 goto swiotlb_error; 3734 } else { 3735 /* Cleanup the padding area. 
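The bounce slot is VTD_PAGE_SIZE aligned; bytes that were not filled from the caller's buffer are zeroed so stale swiotlb contents are never exposed to the device.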
*/ 3736 void *padding_start = phys_to_virt(tlb_addr); 3737 size_t padding_size = aligned_size; 3738 3739 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && 3740 (dir == DMA_TO_DEVICE || 3741 dir == DMA_BIDIRECTIONAL)) { 3742 padding_start += size; 3743 padding_size -= size; 3744 } 3745 3746 memset(padding_start, 0, padding_size); 3747 } 3748 } else { 3749 tlb_addr = paddr; 3750 } 3751 3752 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3753 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot); 3754 if (ret) 3755 goto mapping_error; 3756 3757 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size); 3758 3759 return (phys_addr_t)iova_pfn << PAGE_SHIFT; 3760 3761 mapping_error: 3762 if (is_swiotlb_buffer(tlb_addr)) 3763 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3764 aligned_size, dir, attrs); 3765 swiotlb_error: 3766 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3767 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n", 3768 size, (unsigned long long)paddr, dir); 3769 3770 return DMA_MAPPING_ERROR; 3771 } 3772 3773 static void 3774 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, 3775 enum dma_data_direction dir, unsigned long attrs) 3776 { 3777 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3778 struct dmar_domain *domain; 3779 phys_addr_t tlb_addr; 3780 3781 domain = find_domain(dev); 3782 if (WARN_ON(!domain)) 3783 return; 3784 3785 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr); 3786 if (WARN_ON(!tlb_addr)) 3787 return; 3788 3789 intel_unmap(dev, dev_addr, size); 3790 if (is_swiotlb_buffer(tlb_addr)) 3791 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3792 aligned_size, dir, attrs); 3793 3794 trace_bounce_unmap_single(dev, dev_addr, size); 3795 } 3796 3797 static dma_addr_t 3798 bounce_map_page(struct device *dev, struct page *page, unsigned long offset, 3799 size_t size, enum dma_data_direction dir, unsigned long attrs) 3800 { 3801 return bounce_map_single(dev, page_to_phys(page) + offset, 3802 size, dir, attrs, *dev->dma_mask); 3803 } 3804 3805 static dma_addr_t 3806 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, 3807 enum dma_data_direction dir, unsigned long attrs) 3808 { 3809 return bounce_map_single(dev, phys_addr, size, 3810 dir, attrs, *dev->dma_mask); 3811 } 3812 3813 static void 3814 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, 3815 enum dma_data_direction dir, unsigned long attrs) 3816 { 3817 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 3818 } 3819 3820 static void 3821 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, 3822 enum dma_data_direction dir, unsigned long attrs) 3823 { 3824 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 3825 } 3826 3827 static void 3828 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3829 enum dma_data_direction dir, unsigned long attrs) 3830 { 3831 struct scatterlist *sg; 3832 int i; 3833 3834 for_each_sg(sglist, sg, nelems, i) 3835 bounce_unmap_page(dev, sg->dma_address, 3836 sg_dma_len(sg), dir, attrs); 3837 } 3838 3839 static int 3840 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3841 enum dma_data_direction dir, unsigned long attrs) 3842 { 3843 int i; 3844 struct scatterlist *sg; 3845 3846 for_each_sg(sglist, sg, nelems, i) { 3847 sg->dma_address = bounce_map_page(dev, sg_page(sg), 3848 sg->offset, sg->length, 3849 dir, attrs); 3850 if (sg->dma_address == DMA_MAPPING_ERROR) 3851 goto out_unmap; 3852 sg_dma_len(sg) = 
sg->length; 3853 } 3854 3855 for_each_sg(sglist, sg, nelems, i) 3856 trace_bounce_map_sg(dev, i + 1, nelems, sg); 3857 3858 return nelems; 3859 3860 out_unmap: 3861 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); 3862 return 0; 3863 } 3864 3865 static void 3866 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr, 3867 size_t size, enum dma_data_direction dir) 3868 { 3869 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU); 3870 } 3871 3872 static void 3873 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr, 3874 size_t size, enum dma_data_direction dir) 3875 { 3876 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE); 3877 } 3878 3879 static void 3880 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, 3881 int nelems, enum dma_data_direction dir) 3882 { 3883 struct scatterlist *sg; 3884 int i; 3885 3886 for_each_sg(sglist, sg, nelems, i) 3887 bounce_sync_single(dev, sg_dma_address(sg), 3888 sg_dma_len(sg), dir, SYNC_FOR_CPU); 3889 } 3890 3891 static void 3892 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, 3893 int nelems, enum dma_data_direction dir) 3894 { 3895 struct scatterlist *sg; 3896 int i; 3897 3898 for_each_sg(sglist, sg, nelems, i) 3899 bounce_sync_single(dev, sg_dma_address(sg), 3900 sg_dma_len(sg), dir, SYNC_FOR_DEVICE); 3901 } 3902 3903 static const struct dma_map_ops bounce_dma_ops = { 3904 .alloc = intel_alloc_coherent, 3905 .free = intel_free_coherent, 3906 .map_sg = bounce_map_sg, 3907 .unmap_sg = bounce_unmap_sg, 3908 .map_page = bounce_map_page, 3909 .unmap_page = bounce_unmap_page, 3910 .sync_single_for_cpu = bounce_sync_single_for_cpu, 3911 .sync_single_for_device = bounce_sync_single_for_device, 3912 .sync_sg_for_cpu = bounce_sync_sg_for_cpu, 3913 .sync_sg_for_device = bounce_sync_sg_for_device, 3914 .map_resource = bounce_map_resource, 3915 .unmap_resource = bounce_unmap_resource, 3916 .dma_supported = dma_direct_supported, 3917 }; 3918 3919 static inline int iommu_domain_cache_init(void) 3920 { 3921 int ret = 0; 3922 3923 iommu_domain_cache = kmem_cache_create("iommu_domain", 3924 sizeof(struct dmar_domain), 3925 0, 3926 SLAB_HWCACHE_ALIGN, 3927 3928 NULL); 3929 if (!iommu_domain_cache) { 3930 pr_err("Couldn't create iommu_domain cache\n"); 3931 ret = -ENOMEM; 3932 } 3933 3934 return ret; 3935 } 3936 3937 static inline int iommu_devinfo_cache_init(void) 3938 { 3939 int ret = 0; 3940 3941 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 3942 sizeof(struct device_domain_info), 3943 0, 3944 SLAB_HWCACHE_ALIGN, 3945 NULL); 3946 if (!iommu_devinfo_cache) { 3947 pr_err("Couldn't create devinfo cache\n"); 3948 ret = -ENOMEM; 3949 } 3950 3951 return ret; 3952 } 3953 3954 static int __init iommu_init_mempool(void) 3955 { 3956 int ret; 3957 ret = iova_cache_get(); 3958 if (ret) 3959 return ret; 3960 3961 ret = iommu_domain_cache_init(); 3962 if (ret) 3963 goto domain_error; 3964 3965 ret = iommu_devinfo_cache_init(); 3966 if (!ret) 3967 return ret; 3968 3969 kmem_cache_destroy(iommu_domain_cache); 3970 domain_error: 3971 iova_cache_put(); 3972 3973 return -ENOMEM; 3974 } 3975 3976 static void __init iommu_exit_mempool(void) 3977 { 3978 kmem_cache_destroy(iommu_devinfo_cache); 3979 kmem_cache_destroy(iommu_domain_cache); 3980 iova_cache_put(); 3981 } 3982 3983 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) 3984 { 3985 struct dmar_drhd_unit *drhd; 3986 u32 vtbar; 3987 int rc; 3988 3989 /* We know that this device on this chipset has its own IOMMU. 
3990 * If we find it under a different IOMMU, then the BIOS is lying 3991 * to us. Hope that the IOMMU for this device is actually 3992 * disabled, and it needs no translation... 3993 */ 3994 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); 3995 if (rc) { 3996 /* "can't" happen */ 3997 dev_info(&pdev->dev, "failed to run vt-d quirk\n"); 3998 return; 3999 } 4000 vtbar &= 0xffff0000; 4001 4002 /* we know that the this iommu should be at offset 0xa000 from vtbar */ 4003 drhd = dmar_find_matched_drhd_unit(pdev); 4004 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { 4005 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); 4006 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 4007 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; 4008 } 4009 } 4010 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu); 4011 4012 static void __init init_no_remapping_devices(void) 4013 { 4014 struct dmar_drhd_unit *drhd; 4015 struct device *dev; 4016 int i; 4017 4018 for_each_drhd_unit(drhd) { 4019 if (!drhd->include_all) { 4020 for_each_active_dev_scope(drhd->devices, 4021 drhd->devices_cnt, i, dev) 4022 break; 4023 /* ignore DMAR unit if no devices exist */ 4024 if (i == drhd->devices_cnt) 4025 drhd->ignored = 1; 4026 } 4027 } 4028 4029 for_each_active_drhd_unit(drhd) { 4030 if (drhd->include_all) 4031 continue; 4032 4033 for_each_active_dev_scope(drhd->devices, 4034 drhd->devices_cnt, i, dev) 4035 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 4036 break; 4037 if (i < drhd->devices_cnt) 4038 continue; 4039 4040 /* This IOMMU has *only* gfx devices. Either bypass it or 4041 set the gfx_mapped flag, as appropriate */ 4042 if (!dmar_map_gfx) { 4043 drhd->ignored = 1; 4044 for_each_active_dev_scope(drhd->devices, 4045 drhd->devices_cnt, i, dev) 4046 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO; 4047 } 4048 } 4049 } 4050 4051 #ifdef CONFIG_SUSPEND 4052 static int init_iommu_hw(void) 4053 { 4054 struct dmar_drhd_unit *drhd; 4055 struct intel_iommu *iommu = NULL; 4056 4057 for_each_active_iommu(iommu, drhd) 4058 if (iommu->qi) 4059 dmar_reenable_qi(iommu); 4060 4061 for_each_iommu(iommu, drhd) { 4062 if (drhd->ignored) { 4063 /* 4064 * we always have to disable PMRs or DMA may fail on 4065 * this device 4066 */ 4067 if (force_on) 4068 iommu_disable_protect_mem_regions(iommu); 4069 continue; 4070 } 4071 4072 iommu_flush_write_buffer(iommu); 4073 4074 iommu_set_root_entry(iommu); 4075 4076 iommu->flush.flush_context(iommu, 0, 0, 0, 4077 DMA_CCMD_GLOBAL_INVL); 4078 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 4079 iommu_enable_translation(iommu); 4080 iommu_disable_protect_mem_regions(iommu); 4081 } 4082 4083 return 0; 4084 } 4085 4086 static void iommu_flush_all(void) 4087 { 4088 struct dmar_drhd_unit *drhd; 4089 struct intel_iommu *iommu; 4090 4091 for_each_active_iommu(iommu, drhd) { 4092 iommu->flush.flush_context(iommu, 0, 0, 0, 4093 DMA_CCMD_GLOBAL_INVL); 4094 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 4095 DMA_TLB_GLOBAL_FLUSH); 4096 } 4097 } 4098 4099 static int iommu_suspend(void) 4100 { 4101 struct dmar_drhd_unit *drhd; 4102 struct intel_iommu *iommu = NULL; 4103 unsigned long flag; 4104 4105 for_each_active_iommu(iommu, drhd) { 4106 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 4107 GFP_ATOMIC); 4108 if (!iommu->iommu_state) 4109 goto nomem; 4110 } 4111 4112 iommu_flush_all(); 4113 4114 for_each_active_iommu(iommu, drhd) { 4115 
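		/*
		 * Translation is disabled before the fault-event registers
		 * are saved; iommu_resume() writes them back after
		 * init_iommu_hw() has re-enabled translation.
		 */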
iommu_disable_translation(iommu); 4116 4117 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4118 4119 iommu->iommu_state[SR_DMAR_FECTL_REG] = 4120 readl(iommu->reg + DMAR_FECTL_REG); 4121 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 4122 readl(iommu->reg + DMAR_FEDATA_REG); 4123 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 4124 readl(iommu->reg + DMAR_FEADDR_REG); 4125 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 4126 readl(iommu->reg + DMAR_FEUADDR_REG); 4127 4128 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4129 } 4130 return 0; 4131 4132 nomem: 4133 for_each_active_iommu(iommu, drhd) 4134 kfree(iommu->iommu_state); 4135 4136 return -ENOMEM; 4137 } 4138 4139 static void iommu_resume(void) 4140 { 4141 struct dmar_drhd_unit *drhd; 4142 struct intel_iommu *iommu = NULL; 4143 unsigned long flag; 4144 4145 if (init_iommu_hw()) { 4146 if (force_on) 4147 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 4148 else 4149 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 4150 return; 4151 } 4152 4153 for_each_active_iommu(iommu, drhd) { 4154 4155 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4156 4157 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 4158 iommu->reg + DMAR_FECTL_REG); 4159 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 4160 iommu->reg + DMAR_FEDATA_REG); 4161 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 4162 iommu->reg + DMAR_FEADDR_REG); 4163 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 4164 iommu->reg + DMAR_FEUADDR_REG); 4165 4166 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4167 } 4168 4169 for_each_active_iommu(iommu, drhd) 4170 kfree(iommu->iommu_state); 4171 } 4172 4173 static struct syscore_ops iommu_syscore_ops = { 4174 .resume = iommu_resume, 4175 .suspend = iommu_suspend, 4176 }; 4177 4178 static void __init init_iommu_pm_ops(void) 4179 { 4180 register_syscore_ops(&iommu_syscore_ops); 4181 } 4182 4183 #else 4184 static inline void init_iommu_pm_ops(void) {} 4185 #endif /* CONFIG_PM */ 4186 4187 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 4188 { 4189 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 4190 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 4191 rmrr->end_address <= rmrr->base_address || 4192 arch_rmrr_sanity_check(rmrr)) 4193 return -EINVAL; 4194 4195 return 0; 4196 } 4197 4198 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 4199 { 4200 struct acpi_dmar_reserved_memory *rmrr; 4201 struct dmar_rmrr_unit *rmrru; 4202 4203 rmrr = (struct acpi_dmar_reserved_memory *)header; 4204 if (rmrr_sanity_check(rmrr)) { 4205 pr_warn(FW_BUG 4206 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 4207 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4208 rmrr->base_address, rmrr->end_address, 4209 dmi_get_system_info(DMI_BIOS_VENDOR), 4210 dmi_get_system_info(DMI_BIOS_VERSION), 4211 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4212 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 4213 } 4214 4215 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 4216 if (!rmrru) 4217 goto out; 4218 4219 rmrru->hdr = header; 4220 4221 rmrru->base_address = rmrr->base_address; 4222 rmrru->end_address = rmrr->end_address; 4223 4224 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 4225 ((void *)rmrr) + rmrr->header.length, 4226 &rmrru->devices_cnt); 4227 if (rmrru->devices_cnt && rmrru->devices == NULL) 4228 goto free_rmrru; 4229 4230 list_add(&rmrru->list, &dmar_rmrr_units); 4231 4232 return 0; 4233 free_rmrru: 4234 kfree(rmrru); 4235 out: 4236 return -ENOMEM; 4237 } 4238 4239 static struct 
dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 4240 { 4241 struct dmar_atsr_unit *atsru; 4242 struct acpi_dmar_atsr *tmp; 4243 4244 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 4245 dmar_rcu_check()) { 4246 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 4247 if (atsr->segment != tmp->segment) 4248 continue; 4249 if (atsr->header.length != tmp->header.length) 4250 continue; 4251 if (memcmp(atsr, tmp, atsr->header.length) == 0) 4252 return atsru; 4253 } 4254 4255 return NULL; 4256 } 4257 4258 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4259 { 4260 struct acpi_dmar_atsr *atsr; 4261 struct dmar_atsr_unit *atsru; 4262 4263 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 4264 return 0; 4265 4266 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4267 atsru = dmar_find_atsr(atsr); 4268 if (atsru) 4269 return 0; 4270 4271 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 4272 if (!atsru) 4273 return -ENOMEM; 4274 4275 /* 4276 * If memory is allocated from slab by ACPI _DSM method, we need to 4277 * copy the memory content because the memory buffer will be freed 4278 * on return. 4279 */ 4280 atsru->hdr = (void *)(atsru + 1); 4281 memcpy(atsru->hdr, hdr, hdr->length); 4282 atsru->include_all = atsr->flags & 0x1; 4283 if (!atsru->include_all) { 4284 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 4285 (void *)atsr + atsr->header.length, 4286 &atsru->devices_cnt); 4287 if (atsru->devices_cnt && atsru->devices == NULL) { 4288 kfree(atsru); 4289 return -ENOMEM; 4290 } 4291 } 4292 4293 list_add_rcu(&atsru->list, &dmar_atsr_units); 4294 4295 return 0; 4296 } 4297 4298 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 4299 { 4300 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 4301 kfree(atsru); 4302 } 4303 4304 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4305 { 4306 struct acpi_dmar_atsr *atsr; 4307 struct dmar_atsr_unit *atsru; 4308 4309 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4310 atsru = dmar_find_atsr(atsr); 4311 if (atsru) { 4312 list_del_rcu(&atsru->list); 4313 synchronize_rcu(); 4314 intel_iommu_free_atsr(atsru); 4315 } 4316 4317 return 0; 4318 } 4319 4320 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4321 { 4322 int i; 4323 struct device *dev; 4324 struct acpi_dmar_atsr *atsr; 4325 struct dmar_atsr_unit *atsru; 4326 4327 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4328 atsru = dmar_find_atsr(atsr); 4329 if (!atsru) 4330 return 0; 4331 4332 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 4333 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 4334 i, dev) 4335 return -EBUSY; 4336 } 4337 4338 return 0; 4339 } 4340 4341 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 4342 { 4343 int sp, ret; 4344 struct intel_iommu *iommu = dmaru->iommu; 4345 4346 if (g_iommus[iommu->seq_id]) 4347 return 0; 4348 4349 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 4350 pr_warn("%s: Doesn't support hardware pass through.\n", 4351 iommu->name); 4352 return -ENXIO; 4353 } 4354 if (!ecap_sc_support(iommu->ecap) && 4355 domain_update_iommu_snooping(iommu)) { 4356 pr_warn("%s: Doesn't support snooping.\n", 4357 iommu->name); 4358 return -ENXIO; 4359 } 4360 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 4361 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 4362 pr_warn("%s: Doesn't support large page.\n", 4363 iommu->name); 4364 return -ENXIO; 4365 } 4366 4367 /* 4368 * Disable 
translation if already enabled prior to OS handover. 4369 */ 4370 if (iommu->gcmd & DMA_GCMD_TE) 4371 iommu_disable_translation(iommu); 4372 4373 g_iommus[iommu->seq_id] = iommu; 4374 ret = iommu_init_domains(iommu); 4375 if (ret == 0) 4376 ret = iommu_alloc_root_entry(iommu); 4377 if (ret) 4378 goto out; 4379 4380 intel_svm_check(iommu); 4381 4382 if (dmaru->ignored) { 4383 /* 4384 * we always have to disable PMRs or DMA may fail on this device 4385 */ 4386 if (force_on) 4387 iommu_disable_protect_mem_regions(iommu); 4388 return 0; 4389 } 4390 4391 intel_iommu_init_qi(iommu); 4392 iommu_flush_write_buffer(iommu); 4393 4394 #ifdef CONFIG_INTEL_IOMMU_SVM 4395 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 4396 ret = intel_svm_enable_prq(iommu); 4397 if (ret) 4398 goto disable_iommu; 4399 } 4400 #endif 4401 ret = dmar_set_interrupt(iommu); 4402 if (ret) 4403 goto disable_iommu; 4404 4405 iommu_set_root_entry(iommu); 4406 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 4407 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 4408 iommu_enable_translation(iommu); 4409 4410 iommu_disable_protect_mem_regions(iommu); 4411 return 0; 4412 4413 disable_iommu: 4414 disable_dmar_iommu(iommu); 4415 out: 4416 free_dmar_iommu(iommu); 4417 return ret; 4418 } 4419 4420 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 4421 { 4422 int ret = 0; 4423 struct intel_iommu *iommu = dmaru->iommu; 4424 4425 if (!intel_iommu_enabled) 4426 return 0; 4427 if (iommu == NULL) 4428 return -EINVAL; 4429 4430 if (insert) { 4431 ret = intel_iommu_add(dmaru); 4432 } else { 4433 disable_dmar_iommu(iommu); 4434 free_dmar_iommu(iommu); 4435 } 4436 4437 return ret; 4438 } 4439 4440 static void intel_iommu_free_dmars(void) 4441 { 4442 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4443 struct dmar_atsr_unit *atsru, *atsr_n; 4444 4445 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4446 list_del(&rmrru->list); 4447 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4448 kfree(rmrru); 4449 } 4450 4451 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4452 list_del(&atsru->list); 4453 intel_iommu_free_atsr(atsru); 4454 } 4455 } 4456 4457 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4458 { 4459 int i, ret = 1; 4460 struct pci_bus *bus; 4461 struct pci_dev *bridge = NULL; 4462 struct device *tmp; 4463 struct acpi_dmar_atsr *atsr; 4464 struct dmar_atsr_unit *atsru; 4465 4466 dev = pci_physfn(dev); 4467 for (bus = dev->bus; bus; bus = bus->parent) { 4468 bridge = bus->self; 4469 /* If it's an integrated device, allow ATS */ 4470 if (!bridge) 4471 return 1; 4472 /* Connected via non-PCIe: no ATS */ 4473 if (!pci_is_pcie(bridge) || 4474 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4475 return 0; 4476 /* If we found the root port, look it up in the ATSR */ 4477 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4478 break; 4479 } 4480 4481 rcu_read_lock(); 4482 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4483 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4484 if (atsr->segment != pci_domain_nr(dev->bus)) 4485 continue; 4486 4487 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4488 if (tmp == &bridge->dev) 4489 goto out; 4490 4491 if (atsru->include_all) 4492 goto out; 4493 } 4494 ret = 0; 4495 out: 4496 rcu_read_unlock(); 4497 4498 return ret; 4499 } 4500 4501 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4502 { 4503 int ret; 4504 struct dmar_rmrr_unit *rmrru; 4505 struct 
dmar_atsr_unit *atsru; 4506 struct acpi_dmar_atsr *atsr; 4507 struct acpi_dmar_reserved_memory *rmrr; 4508 4509 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4510 return 0; 4511 4512 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4513 rmrr = container_of(rmrru->hdr, 4514 struct acpi_dmar_reserved_memory, header); 4515 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4516 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4517 ((void *)rmrr) + rmrr->header.length, 4518 rmrr->segment, rmrru->devices, 4519 rmrru->devices_cnt); 4520 if (ret < 0) 4521 return ret; 4522 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4523 dmar_remove_dev_scope(info, rmrr->segment, 4524 rmrru->devices, rmrru->devices_cnt); 4525 } 4526 } 4527 4528 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4529 if (atsru->include_all) 4530 continue; 4531 4532 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4533 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4534 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4535 (void *)atsr + atsr->header.length, 4536 atsr->segment, atsru->devices, 4537 atsru->devices_cnt); 4538 if (ret > 0) 4539 break; 4540 else if (ret < 0) 4541 return ret; 4542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4543 if (dmar_remove_dev_scope(info, atsr->segment, 4544 atsru->devices, atsru->devices_cnt)) 4545 break; 4546 } 4547 } 4548 4549 return 0; 4550 } 4551 4552 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4553 unsigned long val, void *v) 4554 { 4555 struct memory_notify *mhp = v; 4556 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4557 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4558 mhp->nr_pages - 1); 4559 4560 switch (val) { 4561 case MEM_GOING_ONLINE: 4562 if (iommu_domain_identity_map(si_domain, 4563 start_vpfn, last_vpfn)) { 4564 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4565 start_vpfn, last_vpfn); 4566 return NOTIFY_BAD; 4567 } 4568 break; 4569 4570 case MEM_OFFLINE: 4571 case MEM_CANCEL_ONLINE: 4572 { 4573 struct dmar_drhd_unit *drhd; 4574 struct intel_iommu *iommu; 4575 struct page *freelist; 4576 4577 freelist = domain_unmap(si_domain, 4578 start_vpfn, last_vpfn); 4579 4580 rcu_read_lock(); 4581 for_each_active_iommu(iommu, drhd) 4582 iommu_flush_iotlb_psi(iommu, si_domain, 4583 start_vpfn, mhp->nr_pages, 4584 !freelist, 0); 4585 rcu_read_unlock(); 4586 dma_free_pagelist(freelist); 4587 } 4588 break; 4589 } 4590 4591 return NOTIFY_OK; 4592 } 4593 4594 static struct notifier_block intel_iommu_memory_nb = { 4595 .notifier_call = intel_iommu_memory_notifier, 4596 .priority = 0 4597 }; 4598 4599 static void free_all_cpu_cached_iovas(unsigned int cpu) 4600 { 4601 int i; 4602 4603 for (i = 0; i < g_num_of_iommus; i++) { 4604 struct intel_iommu *iommu = g_iommus[i]; 4605 struct dmar_domain *domain; 4606 int did; 4607 4608 if (!iommu) 4609 continue; 4610 4611 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 4612 domain = get_iommu_domain(iommu, (u16)did); 4613 4614 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 4615 continue; 4616 4617 free_cpu_cached_iovas(cpu, &domain->iovad); 4618 } 4619 } 4620 } 4621 4622 static int intel_iommu_cpu_dead(unsigned int cpu) 4623 { 4624 free_all_cpu_cached_iovas(cpu); 4625 return 0; 4626 } 4627 4628 static void intel_disable_iommus(void) 4629 { 4630 struct intel_iommu *iommu = NULL; 4631 struct dmar_drhd_unit *drhd; 4632 4633 for_each_iommu(iommu, drhd) 4634 iommu_disable_translation(iommu); 4635 } 4636 4637 void intel_iommu_shutdown(void) 
4638 { 4639 struct dmar_drhd_unit *drhd; 4640 struct intel_iommu *iommu = NULL; 4641 4642 if (no_iommu || dmar_disabled) 4643 return; 4644 4645 down_write(&dmar_global_lock); 4646 4647 /* Disable PMRs explicitly here. */ 4648 for_each_iommu(iommu, drhd) 4649 iommu_disable_protect_mem_regions(iommu); 4650 4651 /* Make sure the IOMMUs are switched off */ 4652 intel_disable_iommus(); 4653 4654 up_write(&dmar_global_lock); 4655 } 4656 4657 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4658 { 4659 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4660 4661 return container_of(iommu_dev, struct intel_iommu, iommu); 4662 } 4663 4664 static ssize_t intel_iommu_show_version(struct device *dev, 4665 struct device_attribute *attr, 4666 char *buf) 4667 { 4668 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4669 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4670 return sprintf(buf, "%d:%d\n", 4671 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4672 } 4673 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4674 4675 static ssize_t intel_iommu_show_address(struct device *dev, 4676 struct device_attribute *attr, 4677 char *buf) 4678 { 4679 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4680 return sprintf(buf, "%llx\n", iommu->reg_phys); 4681 } 4682 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4683 4684 static ssize_t intel_iommu_show_cap(struct device *dev, 4685 struct device_attribute *attr, 4686 char *buf) 4687 { 4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4689 return sprintf(buf, "%llx\n", iommu->cap); 4690 } 4691 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4692 4693 static ssize_t intel_iommu_show_ecap(struct device *dev, 4694 struct device_attribute *attr, 4695 char *buf) 4696 { 4697 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4698 return sprintf(buf, "%llx\n", iommu->ecap); 4699 } 4700 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4701 4702 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4703 struct device_attribute *attr, 4704 char *buf) 4705 { 4706 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4707 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4708 } 4709 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4710 4711 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4712 struct device_attribute *attr, 4713 char *buf) 4714 { 4715 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4716 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4717 cap_ndoms(iommu->cap))); 4718 } 4719 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4720 4721 static struct attribute *intel_iommu_attrs[] = { 4722 &dev_attr_version.attr, 4723 &dev_attr_address.attr, 4724 &dev_attr_cap.attr, 4725 &dev_attr_ecap.attr, 4726 &dev_attr_domains_supported.attr, 4727 &dev_attr_domains_used.attr, 4728 NULL, 4729 }; 4730 4731 static struct attribute_group intel_iommu_group = { 4732 .name = "intel-iommu", 4733 .attrs = intel_iommu_attrs, 4734 }; 4735 4736 const struct attribute_group *intel_iommu_groups[] = { 4737 &intel_iommu_group, 4738 NULL, 4739 }; 4740 4741 static inline bool has_untrusted_dev(void) 4742 { 4743 struct pci_dev *pdev = NULL; 4744 4745 for_each_pci_dev(pdev) 4746 if (pdev->untrusted) 4747 return true; 4748 4749 return false; 4750 } 4751 4752 static int __init platform_optin_force_iommu(void) 4753 { 4754 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev()) 
4755 return 0; 4756 4757 if (no_iommu || dmar_disabled) 4758 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4759 4760 /* 4761 * If Intel-IOMMU is disabled by default, we will apply identity 4762 * map for all devices except those marked as being untrusted. 4763 */ 4764 if (dmar_disabled) 4765 iommu_set_default_passthrough(false); 4766 4767 dmar_disabled = 0; 4768 no_iommu = 0; 4769 4770 return 1; 4771 } 4772 4773 static int __init probe_acpi_namespace_devices(void) 4774 { 4775 struct dmar_drhd_unit *drhd; 4776 /* To avoid a -Wunused-but-set-variable warning. */ 4777 struct intel_iommu *iommu __maybe_unused; 4778 struct device *dev; 4779 int i, ret = 0; 4780 4781 for_each_active_iommu(iommu, drhd) { 4782 for_each_active_dev_scope(drhd->devices, 4783 drhd->devices_cnt, i, dev) { 4784 struct acpi_device_physical_node *pn; 4785 struct iommu_group *group; 4786 struct acpi_device *adev; 4787 4788 if (dev->bus != &acpi_bus_type) 4789 continue; 4790 4791 adev = to_acpi_device(dev); 4792 mutex_lock(&adev->physical_node_lock); 4793 list_for_each_entry(pn, 4794 &adev->physical_node_list, node) { 4795 group = iommu_group_get(pn->dev); 4796 if (group) { 4797 iommu_group_put(group); 4798 continue; 4799 } 4800 4801 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4802 ret = iommu_probe_device(pn->dev); 4803 if (ret) 4804 break; 4805 } 4806 mutex_unlock(&adev->physical_node_lock); 4807 4808 if (ret) 4809 return ret; 4810 } 4811 } 4812 4813 return 0; 4814 } 4815 4816 int __init intel_iommu_init(void) 4817 { 4818 int ret = -ENODEV; 4819 struct dmar_drhd_unit *drhd; 4820 struct intel_iommu *iommu; 4821 4822 /* 4823 * Intel IOMMU is required for a TXT/tboot launch or platform 4824 * opt in, so enforce that. 4825 */ 4826 force_on = tboot_force_iommu() || platform_optin_force_iommu(); 4827 4828 if (iommu_init_mempool()) { 4829 if (force_on) 4830 panic("tboot: Failed to initialize iommu memory\n"); 4831 return -ENOMEM; 4832 } 4833 4834 down_write(&dmar_global_lock); 4835 if (dmar_table_init()) { 4836 if (force_on) 4837 panic("tboot: Failed to initialize DMAR table\n"); 4838 goto out_free_dmar; 4839 } 4840 4841 if (dmar_dev_scope_init() < 0) { 4842 if (force_on) 4843 panic("tboot: Failed to initialize DMAR device scope\n"); 4844 goto out_free_dmar; 4845 } 4846 4847 up_write(&dmar_global_lock); 4848 4849 /* 4850 * The bus notifier takes the dmar_global_lock, so lockdep will 4851 * complain later when we register it under the lock. 4852 */ 4853 dmar_register_bus_notifier(); 4854 4855 down_write(&dmar_global_lock); 4856 4857 if (!no_iommu) 4858 intel_iommu_debugfs_init(); 4859 4860 if (no_iommu || dmar_disabled) { 4861 /* 4862 * We exit the function here to ensure IOMMU's remapping and 4863 * mempool aren't setup, which means that the IOMMU's PMRs 4864 * won't be disabled via the call to init_dmars(). So disable 4865 * it explicitly here. The PMRs were setup by tboot prior to 4866 * calling SENTER, but the kernel is expected to reset/tear 4867 * down the PMRs. 
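 * As a rough sketch, the explicit disable amounts to the following (see
 * iommu_disable_protect_mem_regions() for the real sequence, which also
 * checks PLMR/PHMR support and takes the register lock):
 *
 *	pmen = readl(iommu->reg + DMAR_PMEN_REG);
 *	writel(pmen & ~DMA_PMEN_EPM, iommu->reg + DMAR_PMEN_REG);
 *	... and then poll DMAR_PMEN_REG until DMA_PMEN_PRS clears.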
4868 */ 4869 if (intel_iommu_tboot_noforce) { 4870 for_each_iommu(iommu, drhd) 4871 iommu_disable_protect_mem_regions(iommu); 4872 } 4873 4874 /* 4875 * Make sure the IOMMUs are switched off, even when we 4876 * boot into a kexec kernel and the previous kernel left 4877 * them enabled 4878 */ 4879 intel_disable_iommus(); 4880 goto out_free_dmar; 4881 } 4882 4883 if (list_empty(&dmar_rmrr_units)) 4884 pr_info("No RMRR found\n"); 4885 4886 if (list_empty(&dmar_atsr_units)) 4887 pr_info("No ATSR found\n"); 4888 4889 if (dmar_init_reserved_ranges()) { 4890 if (force_on) 4891 panic("tboot: Failed to reserve iommu ranges\n"); 4892 goto out_free_reserved_range; 4893 } 4894 4895 if (dmar_map_gfx) 4896 intel_iommu_gfx_mapped = 1; 4897 4898 init_no_remapping_devices(); 4899 4900 ret = init_dmars(); 4901 if (ret) { 4902 if (force_on) 4903 panic("tboot: Failed to initialize DMARs\n"); 4904 pr_err("Initialization failed\n"); 4905 goto out_free_reserved_range; 4906 } 4907 up_write(&dmar_global_lock); 4908 4909 init_iommu_pm_ops(); 4910 4911 down_read(&dmar_global_lock); 4912 for_each_active_iommu(iommu, drhd) { 4913 iommu_device_sysfs_add(&iommu->iommu, NULL, 4914 intel_iommu_groups, 4915 "%s", iommu->name); 4916 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4917 iommu_device_register(&iommu->iommu); 4918 } 4919 up_read(&dmar_global_lock); 4920 4921 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4922 if (si_domain && !hw_pass_through) 4923 register_memory_notifier(&intel_iommu_memory_nb); 4924 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4925 intel_iommu_cpu_dead); 4926 4927 down_read(&dmar_global_lock); 4928 if (probe_acpi_namespace_devices()) 4929 pr_warn("ACPI name space devices didn't probe correctly\n"); 4930 4931 /* Finally, we enable the DMA remapping hardware. */ 4932 for_each_iommu(iommu, drhd) { 4933 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4934 iommu_enable_translation(iommu); 4935 4936 iommu_disable_protect_mem_regions(iommu); 4937 } 4938 up_read(&dmar_global_lock); 4939 4940 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 4941 4942 intel_iommu_enabled = 1; 4943 4944 return 0; 4945 4946 out_free_reserved_range: 4947 put_iova_domain(&reserved_iova_list); 4948 out_free_dmar: 4949 intel_iommu_free_dmars(); 4950 up_write(&dmar_global_lock); 4951 iommu_exit_mempool(); 4952 return ret; 4953 } 4954 4955 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 4956 { 4957 struct intel_iommu *iommu = opaque; 4958 4959 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 4960 return 0; 4961 } 4962 4963 /* 4964 * NB - intel-iommu lacks any sort of reference counting for the users of 4965 * dependent devices. If multiple endpoints have intersecting dependent 4966 * devices, unbinding the driver from any one of them will possibly leave 4967 * the others unable to operate. 
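 * For example, two endpoints behind the same PCIe-to-PCI bridge share the
 * bridge's DMA alias; clearing the context entries for one of them via
 * domain_context_clear() below also tears down translation for the other.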
4968 */ 4969 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 4970 { 4971 if (!iommu || !dev || !dev_is_pci(dev)) 4972 return; 4973 4974 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 4975 } 4976 4977 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 4978 { 4979 struct dmar_domain *domain; 4980 struct intel_iommu *iommu; 4981 unsigned long flags; 4982 4983 assert_spin_locked(&device_domain_lock); 4984 4985 if (WARN_ON(!info)) 4986 return; 4987 4988 iommu = info->iommu; 4989 domain = info->domain; 4990 4991 if (info->dev) { 4992 if (dev_is_pci(info->dev) && sm_supported(iommu)) 4993 intel_pasid_tear_down_entry(iommu, info->dev, 4994 PASID_RID2PASID, false); 4995 4996 iommu_disable_dev_iotlb(info); 4997 if (!dev_is_real_dma_subdevice(info->dev)) 4998 domain_context_clear(iommu, info->dev); 4999 intel_pasid_free_table(info->dev); 5000 } 5001 5002 unlink_domain_info(info); 5003 5004 spin_lock_irqsave(&iommu->lock, flags); 5005 domain_detach_iommu(domain, iommu); 5006 spin_unlock_irqrestore(&iommu->lock, flags); 5007 5008 free_devinfo_mem(info); 5009 } 5010 5011 static void dmar_remove_one_dev_info(struct device *dev) 5012 { 5013 struct device_domain_info *info; 5014 unsigned long flags; 5015 5016 spin_lock_irqsave(&device_domain_lock, flags); 5017 info = get_domain_info(dev); 5018 if (info) 5019 __dmar_remove_one_dev_info(info); 5020 spin_unlock_irqrestore(&device_domain_lock, flags); 5021 } 5022 5023 static int md_domain_init(struct dmar_domain *domain, int guest_width) 5024 { 5025 int adjust_width; 5026 5027 /* calculate AGAW */ 5028 domain->gaw = guest_width; 5029 adjust_width = guestwidth_to_adjustwidth(guest_width); 5030 domain->agaw = width_to_agaw(adjust_width); 5031 5032 domain->iommu_coherency = 0; 5033 domain->iommu_snooping = 0; 5034 domain->iommu_superpage = 0; 5035 domain->max_addr = 0; 5036 5037 /* always allocate the top pgd */ 5038 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 5039 if (!domain->pgd) 5040 return -ENOMEM; 5041 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 5042 return 0; 5043 } 5044 5045 static void intel_init_iova_domain(struct dmar_domain *dmar_domain) 5046 { 5047 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); 5048 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); 5049 5050 if (!intel_iommu_strict && 5051 init_iova_flush_queue(&dmar_domain->iovad, 5052 iommu_flush_iova, iova_entry_free)) 5053 pr_info("iova flush queue initialization failed\n"); 5054 } 5055 5056 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 5057 { 5058 struct dmar_domain *dmar_domain; 5059 struct iommu_domain *domain; 5060 5061 switch (type) { 5062 case IOMMU_DOMAIN_DMA: 5063 /* fallthrough */ 5064 case IOMMU_DOMAIN_UNMANAGED: 5065 dmar_domain = alloc_domain(0); 5066 if (!dmar_domain) { 5067 pr_err("Can't allocate dmar_domain\n"); 5068 return NULL; 5069 } 5070 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 5071 pr_err("Domain initialization failed\n"); 5072 domain_exit(dmar_domain); 5073 return NULL; 5074 } 5075 5076 if (type == IOMMU_DOMAIN_DMA) 5077 intel_init_iova_domain(dmar_domain); 5078 5079 domain_update_iommu_cap(dmar_domain); 5080 5081 domain = &dmar_domain->domain; 5082 domain->geometry.aperture_start = 0; 5083 domain->geometry.aperture_end = 5084 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 5085 domain->geometry.force_aperture = true; 5086 5087 return domain; 5088 case IOMMU_DOMAIN_IDENTITY: 5089 return 
&si_domain->domain; 5090 default: 5091 return NULL; 5092 } 5093 5094 return NULL; 5095 } 5096 5097 static void intel_iommu_domain_free(struct iommu_domain *domain) 5098 { 5099 if (domain != &si_domain->domain) 5100 domain_exit(to_dmar_domain(domain)); 5101 } 5102 5103 /* 5104 * Check whether a @domain could be attached to the @dev through the 5105 * aux-domain attach/detach APIs. 5106 */ 5107 static inline bool 5108 is_aux_domain(struct device *dev, struct iommu_domain *domain) 5109 { 5110 struct device_domain_info *info = get_domain_info(dev); 5111 5112 return info && info->auxd_enabled && 5113 domain->type == IOMMU_DOMAIN_UNMANAGED; 5114 } 5115 5116 static void auxiliary_link_device(struct dmar_domain *domain, 5117 struct device *dev) 5118 { 5119 struct device_domain_info *info = get_domain_info(dev); 5120 5121 assert_spin_locked(&device_domain_lock); 5122 if (WARN_ON(!info)) 5123 return; 5124 5125 domain->auxd_refcnt++; 5126 list_add(&domain->auxd, &info->auxiliary_domains); 5127 } 5128 5129 static void auxiliary_unlink_device(struct dmar_domain *domain, 5130 struct device *dev) 5131 { 5132 struct device_domain_info *info = get_domain_info(dev); 5133 5134 assert_spin_locked(&device_domain_lock); 5135 if (WARN_ON(!info)) 5136 return; 5137 5138 list_del(&domain->auxd); 5139 domain->auxd_refcnt--; 5140 5141 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5142 ioasid_free(domain->default_pasid); 5143 } 5144 5145 static int aux_domain_add_dev(struct dmar_domain *domain, 5146 struct device *dev) 5147 { 5148 int ret; 5149 u8 bus, devfn; 5150 unsigned long flags; 5151 struct intel_iommu *iommu; 5152 5153 iommu = device_to_iommu(dev, &bus, &devfn); 5154 if (!iommu) 5155 return -ENODEV; 5156 5157 if (domain->default_pasid <= 0) { 5158 int pasid; 5159 5160 /* No private data needed for the default pasid */ 5161 pasid = ioasid_alloc(NULL, PASID_MIN, 5162 pci_max_pasids(to_pci_dev(dev)) - 1, 5163 NULL); 5164 if (pasid == INVALID_IOASID) { 5165 pr_err("Can't allocate default pasid\n"); 5166 return -ENODEV; 5167 } 5168 domain->default_pasid = pasid; 5169 } 5170 5171 spin_lock_irqsave(&device_domain_lock, flags); 5172 /* 5173 * iommu->lock must be held to attach domain to iommu and setup the 5174 * pasid entry for second level translation. 
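 * A minimal sketch of the lock nesting used here (and mirrored in
 * aux_domain_remove_dev()):
 *
 *	spin_lock_irqsave(&device_domain_lock, flags);
 *	spin_lock(&iommu->lock);
 *	... domain_attach_iommu() + PASID entry setup ...
 *	spin_unlock(&iommu->lock);
 *	spin_unlock_irqrestore(&device_domain_lock, flags);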
5175 */ 5176 spin_lock(&iommu->lock); 5177 ret = domain_attach_iommu(domain, iommu); 5178 if (ret) 5179 goto attach_failed; 5180 5181 /* Setup the PASID entry for mediated devices: */ 5182 if (domain_use_first_level(domain)) 5183 ret = domain_setup_first_level(iommu, domain, dev, 5184 domain->default_pasid); 5185 else 5186 ret = intel_pasid_setup_second_level(iommu, domain, dev, 5187 domain->default_pasid); 5188 if (ret) 5189 goto table_failed; 5190 spin_unlock(&iommu->lock); 5191 5192 auxiliary_link_device(domain, dev); 5193 5194 spin_unlock_irqrestore(&device_domain_lock, flags); 5195 5196 return 0; 5197 5198 table_failed: 5199 domain_detach_iommu(domain, iommu); 5200 attach_failed: 5201 spin_unlock(&iommu->lock); 5202 spin_unlock_irqrestore(&device_domain_lock, flags); 5203 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5204 ioasid_free(domain->default_pasid); 5205 5206 return ret; 5207 } 5208 5209 static void aux_domain_remove_dev(struct dmar_domain *domain, 5210 struct device *dev) 5211 { 5212 struct device_domain_info *info; 5213 struct intel_iommu *iommu; 5214 unsigned long flags; 5215 5216 if (!is_aux_domain(dev, &domain->domain)) 5217 return; 5218 5219 spin_lock_irqsave(&device_domain_lock, flags); 5220 info = get_domain_info(dev); 5221 iommu = info->iommu; 5222 5223 auxiliary_unlink_device(domain, dev); 5224 5225 spin_lock(&iommu->lock); 5226 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); 5227 domain_detach_iommu(domain, iommu); 5228 spin_unlock(&iommu->lock); 5229 5230 spin_unlock_irqrestore(&device_domain_lock, flags); 5231 } 5232 5233 static int prepare_domain_attach_device(struct iommu_domain *domain, 5234 struct device *dev) 5235 { 5236 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5237 struct intel_iommu *iommu; 5238 int addr_width; 5239 u8 bus, devfn; 5240 5241 iommu = device_to_iommu(dev, &bus, &devfn); 5242 if (!iommu) 5243 return -ENODEV; 5244 5245 /* check if this iommu agaw is sufficient for max mapped address */ 5246 addr_width = agaw_to_width(iommu->agaw); 5247 if (addr_width > cap_mgaw(iommu->cap)) 5248 addr_width = cap_mgaw(iommu->cap); 5249 5250 if (dmar_domain->max_addr > (1LL << addr_width)) { 5251 dev_err(dev, "%s: iommu width (%d) is not " 5252 "sufficient for the mapped address (%llx)\n", 5253 __func__, addr_width, dmar_domain->max_addr); 5254 return -EFAULT; 5255 } 5256 dmar_domain->gaw = addr_width; 5257 5258 /* 5259 * Knock out extra levels of page tables if necessary 5260 */ 5261 while (iommu->agaw < dmar_domain->agaw) { 5262 struct dma_pte *pte; 5263 5264 pte = dmar_domain->pgd; 5265 if (dma_pte_present(pte)) { 5266 dmar_domain->pgd = (struct dma_pte *) 5267 phys_to_virt(dma_pte_addr(pte)); 5268 free_pgtable_page(pte); 5269 } 5270 dmar_domain->agaw--; 5271 } 5272 5273 return 0; 5274 } 5275 5276 static int intel_iommu_attach_device(struct iommu_domain *domain, 5277 struct device *dev) 5278 { 5279 int ret; 5280 5281 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 5282 device_is_rmrr_locked(dev)) { 5283 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 5284 return -EPERM; 5285 } 5286 5287 if (is_aux_domain(dev, domain)) 5288 return -EPERM; 5289 5290 /* normally dev is not mapped */ 5291 if (unlikely(domain_context_mapped(dev))) { 5292 struct dmar_domain *old_domain; 5293 5294 old_domain = find_domain(dev); 5295 if (old_domain) 5296 dmar_remove_one_dev_info(dev); 5297 } 5298 5299 ret = prepare_domain_attach_device(domain, dev); 5300 if (ret) 5301 return ret; 5302 5303 return domain_add_dev_info(to_dmar_domain(domain), dev); 5304 } 5305 5306 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 5307 struct device *dev) 5308 { 5309 int ret; 5310 5311 if (!is_aux_domain(dev, domain)) 5312 return -EPERM; 5313 5314 ret = prepare_domain_attach_device(domain, dev); 5315 if (ret) 5316 return ret; 5317 5318 return aux_domain_add_dev(to_dmar_domain(domain), dev); 5319 } 5320 5321 static void intel_iommu_detach_device(struct iommu_domain *domain, 5322 struct device *dev) 5323 { 5324 dmar_remove_one_dev_info(dev); 5325 } 5326 5327 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 5328 struct device *dev) 5329 { 5330 aux_domain_remove_dev(to_dmar_domain(domain), dev); 5331 } 5332 5333 /* 5334 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 5335 * VT-d granularity. Invalidation is typically included in the unmap operation 5336 * as a result of DMA or VFIO unmap. However, for assigned devices guest 5337 * owns the first level page tables. Invalidations of translation caches in the 5338 * guest are trapped and passed down to the host. 5339 * 5340 * vIOMMU in the guest will only expose first level page tables, therefore 5341 * we do not support IOTLB granularity for request without PASID (second level). 5342 * 5343 * For example, to find the VT-d granularity encoding for IOTLB 5344 * type and page selective granularity within PASID: 5345 * X: indexed by iommu cache type 5346 * Y: indexed by enum iommu_inv_granularity 5347 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 5348 */ 5349 5350 static const int 5351 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 5352 /* 5353 * PASID based IOTLB invalidation: PASID selective (per PASID), 5354 * page selective (address granularity) 5355 */ 5356 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 5357 /* PASID based dev TLBs */ 5358 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 5359 /* PASID cache */ 5360 {-EINVAL, -EINVAL, -EINVAL} 5361 }; 5362 5363 static inline int to_vtd_granularity(int type, int granu) 5364 { 5365 return inv_type_granu_table[type][granu]; 5366 } 5367 5368 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 5369 { 5370 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 5371 5372 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 5373 * IOMMU cache invalidate API passes granu_size in bytes, and number of 5374 * granu size in contiguous memory. 
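 * Worked example: granu_size = 4KiB and nr_granules = 512 gives
 * nr_pages = (4096 * 512) >> VTD_PAGE_SHIFT = 512, and
 * order_base_2(512) = 9, i.e. the 2MiB encoding.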
5375 */ 5376 return order_base_2(nr_pages); 5377 } 5378 5379 #ifdef CONFIG_INTEL_IOMMU_SVM 5380 static int 5381 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5382 struct iommu_cache_invalidate_info *inv_info) 5383 { 5384 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5385 struct device_domain_info *info; 5386 struct intel_iommu *iommu; 5387 unsigned long flags; 5388 int cache_type; 5389 u8 bus, devfn; 5390 u16 did, sid; 5391 int ret = 0; 5392 u64 size = 0; 5393 5394 if (!inv_info || !dmar_domain || 5395 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) 5396 return -EINVAL; 5397 5398 if (!dev || !dev_is_pci(dev)) 5399 return -ENODEV; 5400 5401 iommu = device_to_iommu(dev, &bus, &devfn); 5402 if (!iommu) 5403 return -ENODEV; 5404 5405 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5406 return -EINVAL; 5407 5408 spin_lock_irqsave(&device_domain_lock, flags); 5409 spin_lock(&iommu->lock); 5410 info = get_domain_info(dev); 5411 if (!info) { 5412 ret = -EINVAL; 5413 goto out_unlock; 5414 } 5415 did = dmar_domain->iommu_did[iommu->seq_id]; 5416 sid = PCI_DEVID(bus, devfn); 5417 5418 /* Size is only valid in address selective invalidation */ 5419 if (inv_info->granularity != IOMMU_INV_GRANU_PASID) 5420 size = to_vtd_size(inv_info->addr_info.granule_size, 5421 inv_info->addr_info.nb_granules); 5422 5423 for_each_set_bit(cache_type, 5424 (unsigned long *)&inv_info->cache, 5425 IOMMU_CACHE_INV_TYPE_NR) { 5426 int granu = 0; 5427 u64 pasid = 0; 5428 5429 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5430 if (granu == -EINVAL) { 5431 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5432 cache_type, inv_info->granularity); 5433 break; 5434 } 5435 5436 /* 5437 * PASID is stored in different locations based on the 5438 * granularity. 5439 */ 5440 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5441 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5442 pasid = inv_info->pasid_info.pasid; 5443 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5444 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5445 pasid = inv_info->addr_info.pasid; 5446 5447 switch (BIT(cache_type)) { 5448 case IOMMU_CACHE_INV_TYPE_IOTLB: 5449 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5450 size && 5451 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5452 pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n", 5453 inv_info->addr_info.addr, size); 5454 ret = -ERANGE; 5455 goto out_unlock; 5456 } 5457 5458 /* 5459 * If granu is PASID-selective, address is ignored. 5460 * We use npages = -1 to indicate that. 5461 */ 5462 qi_flush_piotlb(iommu, did, pasid, 5463 mm_to_dma_pfn(inv_info->addr_info.addr), 5464 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5465 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5466 5467 /* 5468 * Always flush device IOTLB if ATS is enabled. vIOMMU 5469 * in the guest may assume IOTLB flush is inclusive, 5470 * which is more efficient. 
5471 */ 5472 if (info->ats_enabled) 5473 qi_flush_dev_iotlb_pasid(iommu, sid, 5474 info->pfsid, pasid, 5475 info->ats_qdep, 5476 inv_info->addr_info.addr, 5477 size, granu); 5478 break; 5479 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5480 if (info->ats_enabled) 5481 qi_flush_dev_iotlb_pasid(iommu, sid, 5482 info->pfsid, pasid, 5483 info->ats_qdep, 5484 inv_info->addr_info.addr, 5485 size, granu); 5486 else 5487 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5488 break; 5489 default: 5490 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5491 cache_type); 5492 ret = -EINVAL; 5493 } 5494 } 5495 out_unlock: 5496 spin_unlock(&iommu->lock); 5497 spin_unlock_irqrestore(&device_domain_lock, flags); 5498 5499 return ret; 5500 } 5501 #endif 5502 5503 static int intel_iommu_map(struct iommu_domain *domain, 5504 unsigned long iova, phys_addr_t hpa, 5505 size_t size, int iommu_prot, gfp_t gfp) 5506 { 5507 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5508 u64 max_addr; 5509 int prot = 0; 5510 int ret; 5511 5512 if (iommu_prot & IOMMU_READ) 5513 prot |= DMA_PTE_READ; 5514 if (iommu_prot & IOMMU_WRITE) 5515 prot |= DMA_PTE_WRITE; 5516 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5517 prot |= DMA_PTE_SNP; 5518 5519 max_addr = iova + size; 5520 if (dmar_domain->max_addr < max_addr) { 5521 u64 end; 5522 5523 /* check if minimum agaw is sufficient for mapped address */ 5524 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5525 if (end < max_addr) { 5526 pr_err("%s: iommu width (%d) is not " 5527 "sufficient for the mapped address (%llx)\n", 5528 __func__, dmar_domain->gaw, max_addr); 5529 return -EFAULT; 5530 } 5531 dmar_domain->max_addr = max_addr; 5532 } 5533 /* Round up size to next multiple of PAGE_SIZE, if it and 5534 the low bits of hpa would take us onto the next page */ 5535 size = aligned_nrpages(hpa, size); 5536 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5537 hpa >> VTD_PAGE_SHIFT, size, prot); 5538 return ret; 5539 } 5540 5541 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5542 unsigned long iova, size_t size, 5543 struct iommu_iotlb_gather *gather) 5544 { 5545 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5546 struct page *freelist = NULL; 5547 unsigned long start_pfn, last_pfn; 5548 unsigned int npages; 5549 int iommu_id, level = 0; 5550 5551 /* Cope with horrid API which requires us to unmap more than the 5552 size argument if it happens to be a large-page mapping. 
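   For example, if the IOVA is covered by a 2MiB superpage PTE (level 2),
   even a 4KiB unmap request is rounded up below to
   VTD_PAGE_SIZE << level_to_offset_bits(2) == 4KiB << 9 == 2MiB.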
*/ 5553 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5554 5555 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5556 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5557 5558 start_pfn = iova >> VTD_PAGE_SHIFT; 5559 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5560 5561 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); 5562 5563 npages = last_pfn - start_pfn + 1; 5564 5565 for_each_domain_iommu(iommu_id, dmar_domain) 5566 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5567 start_pfn, npages, !freelist, 0); 5568 5569 dma_free_pagelist(freelist); 5570 5571 if (dmar_domain->max_addr == iova + size) 5572 dmar_domain->max_addr = iova; 5573 5574 return size; 5575 } 5576 5577 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5578 dma_addr_t iova) 5579 { 5580 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5581 struct dma_pte *pte; 5582 int level = 0; 5583 u64 phys = 0; 5584 5585 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5586 if (pte && dma_pte_present(pte)) 5587 phys = dma_pte_addr(pte) + 5588 (iova & (BIT_MASK(level_to_offset_bits(level) + 5589 VTD_PAGE_SHIFT) - 1)); 5590 5591 return phys; 5592 } 5593 5594 static inline bool scalable_mode_support(void) 5595 { 5596 struct dmar_drhd_unit *drhd; 5597 struct intel_iommu *iommu; 5598 bool ret = true; 5599 5600 rcu_read_lock(); 5601 for_each_active_iommu(iommu, drhd) { 5602 if (!sm_supported(iommu)) { 5603 ret = false; 5604 break; 5605 } 5606 } 5607 rcu_read_unlock(); 5608 5609 return ret; 5610 } 5611 5612 static inline bool iommu_pasid_support(void) 5613 { 5614 struct dmar_drhd_unit *drhd; 5615 struct intel_iommu *iommu; 5616 bool ret = true; 5617 5618 rcu_read_lock(); 5619 for_each_active_iommu(iommu, drhd) { 5620 if (!pasid_supported(iommu)) { 5621 ret = false; 5622 break; 5623 } 5624 } 5625 rcu_read_unlock(); 5626 5627 return ret; 5628 } 5629 5630 static inline bool nested_mode_support(void) 5631 { 5632 struct dmar_drhd_unit *drhd; 5633 struct intel_iommu *iommu; 5634 bool ret = true; 5635 5636 rcu_read_lock(); 5637 for_each_active_iommu(iommu, drhd) { 5638 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5639 ret = false; 5640 break; 5641 } 5642 } 5643 rcu_read_unlock(); 5644 5645 return ret; 5646 } 5647 5648 static bool intel_iommu_capable(enum iommu_cap cap) 5649 { 5650 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5651 return domain_update_iommu_snooping(NULL) == 1; 5652 if (cap == IOMMU_CAP_INTR_REMAP) 5653 return irq_remapping_enabled == 1; 5654 5655 return false; 5656 } 5657 5658 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5659 { 5660 struct intel_iommu *iommu; 5661 u8 bus, devfn; 5662 5663 iommu = device_to_iommu(dev, &bus, &devfn); 5664 if (!iommu) 5665 return ERR_PTR(-ENODEV); 5666 5667 if (translation_pre_enabled(iommu)) 5668 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; 5669 5670 return &iommu->iommu; 5671 } 5672 5673 static void intel_iommu_release_device(struct device *dev) 5674 { 5675 struct intel_iommu *iommu; 5676 u8 bus, devfn; 5677 5678 iommu = device_to_iommu(dev, &bus, &devfn); 5679 if (!iommu) 5680 return; 5681 5682 dmar_remove_one_dev_info(dev); 5683 5684 set_dma_ops(dev, NULL); 5685 } 5686 5687 static void intel_iommu_probe_finalize(struct device *dev) 5688 { 5689 struct iommu_domain *domain; 5690 5691 domain = iommu_get_domain_for_dev(dev); 5692 if (device_needs_bounce(dev)) 5693 set_dma_ops(dev, &bounce_dma_ops); 5694 else if (domain && domain->type == IOMMU_DOMAIN_DMA) 5695 
set_dma_ops(dev, &intel_dma_ops); 5696 else 5697 set_dma_ops(dev, NULL); 5698 } 5699 5700 static void intel_iommu_get_resv_regions(struct device *device, 5701 struct list_head *head) 5702 { 5703 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5704 struct iommu_resv_region *reg; 5705 struct dmar_rmrr_unit *rmrr; 5706 struct device *i_dev; 5707 int i; 5708 5709 down_read(&dmar_global_lock); 5710 for_each_rmrr_units(rmrr) { 5711 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5712 i, i_dev) { 5713 struct iommu_resv_region *resv; 5714 enum iommu_resv_type type; 5715 size_t length; 5716 5717 if (i_dev != device && 5718 !is_downstream_to_pci_bridge(device, i_dev)) 5719 continue; 5720 5721 length = rmrr->end_address - rmrr->base_address + 1; 5722 5723 type = device_rmrr_is_relaxable(device) ? 5724 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5725 5726 resv = iommu_alloc_resv_region(rmrr->base_address, 5727 length, prot, type); 5728 if (!resv) 5729 break; 5730 5731 list_add_tail(&resv->list, head); 5732 } 5733 } 5734 up_read(&dmar_global_lock); 5735 5736 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5737 if (dev_is_pci(device)) { 5738 struct pci_dev *pdev = to_pci_dev(device); 5739 5740 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5741 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5742 IOMMU_RESV_DIRECT_RELAXABLE); 5743 if (reg) 5744 list_add_tail(®->list, head); 5745 } 5746 } 5747 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5748 5749 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5750 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5751 0, IOMMU_RESV_MSI); 5752 if (!reg) 5753 return; 5754 list_add_tail(®->list, head); 5755 } 5756 5757 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5758 { 5759 struct device_domain_info *info; 5760 struct context_entry *context; 5761 struct dmar_domain *domain; 5762 unsigned long flags; 5763 u64 ctx_lo; 5764 int ret; 5765 5766 domain = find_domain(dev); 5767 if (!domain) 5768 return -EINVAL; 5769 5770 spin_lock_irqsave(&device_domain_lock, flags); 5771 spin_lock(&iommu->lock); 5772 5773 ret = -EINVAL; 5774 info = get_domain_info(dev); 5775 if (!info || !info->pasid_supported) 5776 goto out; 5777 5778 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5779 if (WARN_ON(!context)) 5780 goto out; 5781 5782 ctx_lo = context[0].lo; 5783 5784 if (!(ctx_lo & CONTEXT_PASIDE)) { 5785 ctx_lo |= CONTEXT_PASIDE; 5786 context[0].lo = ctx_lo; 5787 wmb(); 5788 iommu->flush.flush_context(iommu, 5789 domain->iommu_did[iommu->seq_id], 5790 PCI_DEVID(info->bus, info->devfn), 5791 DMA_CCMD_MASK_NOBIT, 5792 DMA_CCMD_DEVICE_INVL); 5793 } 5794 5795 /* Enable PASID support in the device, if it wasn't already */ 5796 if (!info->pasid_enabled) 5797 iommu_enable_dev_iotlb(info); 5798 5799 ret = 0; 5800 5801 out: 5802 spin_unlock(&iommu->lock); 5803 spin_unlock_irqrestore(&device_domain_lock, flags); 5804 5805 return ret; 5806 } 5807 5808 static void intel_iommu_apply_resv_region(struct device *dev, 5809 struct iommu_domain *domain, 5810 struct iommu_resv_region *region) 5811 { 5812 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5813 unsigned long start, end; 5814 5815 start = IOVA_PFN(region->start); 5816 end = IOVA_PFN(region->start + region->length - 1); 5817 5818 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); 5819 } 5820 5821 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5822 { 5823 if (dev_is_pci(dev)) 5824 return pci_device_group(dev); 5825 return generic_device_group(dev); 5826 } 5827 5828 
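/*
 * Illustrative sketch (not part of the driver): how a caller could list the
 * reserved regions that intel_iommu_get_resv_regions() reports, using the
 * generic IOMMU API. The helper name is made up for the example.
 *
 *	static void example_dump_resv_regions(struct device *dev)
 *	{
 *		struct iommu_resv_region *region;
 *		LIST_HEAD(resv_regions);
 *
 *		iommu_get_resv_regions(dev, &resv_regions);
 *		list_for_each_entry(region, &resv_regions, list)
 *			dev_info(dev, "resv [%pa + %zx] type %d\n",
 *				 &region->start, region->length,
 *				 region->type);
 *		iommu_put_resv_regions(dev, &resv_regions);
 *	}
 */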
#ifdef CONFIG_INTEL_IOMMU_SVM 5829 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) 5830 { 5831 struct intel_iommu *iommu; 5832 u8 bus, devfn; 5833 5834 if (iommu_dummy(dev)) { 5835 dev_warn(dev, 5836 "No IOMMU translation for device; cannot enable SVM\n"); 5837 return NULL; 5838 } 5839 5840 iommu = device_to_iommu(dev, &bus, &devfn); 5841 if ((!iommu)) { 5842 dev_err(dev, "No IOMMU for device; cannot enable SVM\n"); 5843 return NULL; 5844 } 5845 5846 return iommu; 5847 } 5848 #endif /* CONFIG_INTEL_IOMMU_SVM */ 5849 5850 static int intel_iommu_enable_auxd(struct device *dev) 5851 { 5852 struct device_domain_info *info; 5853 struct intel_iommu *iommu; 5854 unsigned long flags; 5855 u8 bus, devfn; 5856 int ret; 5857 5858 iommu = device_to_iommu(dev, &bus, &devfn); 5859 if (!iommu || dmar_disabled) 5860 return -EINVAL; 5861 5862 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5863 return -EINVAL; 5864 5865 ret = intel_iommu_enable_pasid(iommu, dev); 5866 if (ret) 5867 return -ENODEV; 5868 5869 spin_lock_irqsave(&device_domain_lock, flags); 5870 info = get_domain_info(dev); 5871 info->auxd_enabled = 1; 5872 spin_unlock_irqrestore(&device_domain_lock, flags); 5873 5874 return 0; 5875 } 5876 5877 static int intel_iommu_disable_auxd(struct device *dev) 5878 { 5879 struct device_domain_info *info; 5880 unsigned long flags; 5881 5882 spin_lock_irqsave(&device_domain_lock, flags); 5883 info = get_domain_info(dev); 5884 if (!WARN_ON(!info)) 5885 info->auxd_enabled = 0; 5886 spin_unlock_irqrestore(&device_domain_lock, flags); 5887 5888 return 0; 5889 } 5890 5891 /* 5892 * A PCI express designated vendor specific extended capability is defined 5893 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5894 * for system software and tools to detect endpoint devices supporting the 5895 * Intel scalable IO virtualization without host driver dependency. 5896 * 5897 * Returns the address of the matching extended capability structure within 5898 * the device's PCI configuration space or 0 if the device does not support 5899 * it. 
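 * As a sketch of the config-space layout the lookup below relies on
 * (offsets relative to the extended capability position):
 *
 *	pos + 0: extended capability header (ID 0x23, DVSEC)
 *	pos + 4: DVSEC vendor ID -> must be PCI_VENDOR_ID_INTEL
 *	pos + 8: DVSEC ID        -> 5 identifies Scalable IOV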
5900 */ 5901 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5902 { 5903 int pos; 5904 u16 vendor, id; 5905 5906 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5907 while (pos) { 5908 pci_read_config_word(pdev, pos + 4, &vendor); 5909 pci_read_config_word(pdev, pos + 8, &id); 5910 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5911 return pos; 5912 5913 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5914 } 5915 5916 return 0; 5917 } 5918 5919 static bool 5920 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5921 { 5922 if (feat == IOMMU_DEV_FEAT_AUX) { 5923 int ret; 5924 5925 if (!dev_is_pci(dev) || dmar_disabled || 5926 !scalable_mode_support() || !iommu_pasid_support()) 5927 return false; 5928 5929 ret = pci_pasid_features(to_pci_dev(dev)); 5930 if (ret < 0) 5931 return false; 5932 5933 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5934 } 5935 5936 if (feat == IOMMU_DEV_FEAT_SVA) { 5937 struct device_domain_info *info = get_domain_info(dev); 5938 5939 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5940 info->pasid_supported && info->pri_supported && 5941 info->ats_supported; 5942 } 5943 5944 return false; 5945 } 5946 5947 static int 5948 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5949 { 5950 if (feat == IOMMU_DEV_FEAT_AUX) 5951 return intel_iommu_enable_auxd(dev); 5952 5953 if (feat == IOMMU_DEV_FEAT_SVA) { 5954 struct device_domain_info *info = get_domain_info(dev); 5955 5956 if (!info) 5957 return -EINVAL; 5958 5959 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5960 return 0; 5961 } 5962 5963 return -ENODEV; 5964 } 5965 5966 static int 5967 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 5968 { 5969 if (feat == IOMMU_DEV_FEAT_AUX) 5970 return intel_iommu_disable_auxd(dev); 5971 5972 return -ENODEV; 5973 } 5974 5975 static bool 5976 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 5977 { 5978 struct device_domain_info *info = get_domain_info(dev); 5979 5980 if (feat == IOMMU_DEV_FEAT_AUX) 5981 return scalable_mode_support() && info && info->auxd_enabled; 5982 5983 return false; 5984 } 5985 5986 static int 5987 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 5988 { 5989 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5990 5991 return dmar_domain->default_pasid > 0 ? 5992 dmar_domain->default_pasid : -EINVAL; 5993 } 5994 5995 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 5996 struct device *dev) 5997 { 5998 return attach_deferred(dev); 5999 } 6000 6001 static int 6002 intel_iommu_domain_set_attr(struct iommu_domain *domain, 6003 enum iommu_attr attr, void *data) 6004 { 6005 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 6006 unsigned long flags; 6007 int ret = 0; 6008 6009 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 6010 return -EINVAL; 6011 6012 switch (attr) { 6013 case DOMAIN_ATTR_NESTING: 6014 spin_lock_irqsave(&device_domain_lock, flags); 6015 if (nested_mode_support() && 6016 list_empty(&dmar_domain->devices)) { 6017 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 6018 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 6019 } else { 6020 ret = -ENODEV; 6021 } 6022 spin_unlock_irqrestore(&device_domain_lock, flags); 6023 break; 6024 default: 6025 ret = -EINVAL; 6026 break; 6027 } 6028 6029 return ret; 6030 } 6031 6032 /* 6033 * Check that the device does not live on an external facing PCI port that is 6034 * marked as untrusted. 
Such devices should not be able to apply quirks and 6035 * thus not be able to bypass the IOMMU restrictions. 6036 */ 6037 static bool risky_device(struct pci_dev *pdev) 6038 { 6039 if (pdev->untrusted) { 6040 pci_info(pdev, 6041 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 6042 pdev->vendor, pdev->device); 6043 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 6044 return true; 6045 } 6046 return false; 6047 } 6048 6049 const struct iommu_ops intel_iommu_ops = { 6050 .capable = intel_iommu_capable, 6051 .domain_alloc = intel_iommu_domain_alloc, 6052 .domain_free = intel_iommu_domain_free, 6053 .domain_set_attr = intel_iommu_domain_set_attr, 6054 .attach_dev = intel_iommu_attach_device, 6055 .detach_dev = intel_iommu_detach_device, 6056 .aux_attach_dev = intel_iommu_aux_attach_device, 6057 .aux_detach_dev = intel_iommu_aux_detach_device, 6058 .aux_get_pasid = intel_iommu_aux_get_pasid, 6059 .map = intel_iommu_map, 6060 .unmap = intel_iommu_unmap, 6061 .iova_to_phys = intel_iommu_iova_to_phys, 6062 .probe_device = intel_iommu_probe_device, 6063 .probe_finalize = intel_iommu_probe_finalize, 6064 .release_device = intel_iommu_release_device, 6065 .get_resv_regions = intel_iommu_get_resv_regions, 6066 .put_resv_regions = generic_iommu_put_resv_regions, 6067 .apply_resv_region = intel_iommu_apply_resv_region, 6068 .device_group = intel_iommu_device_group, 6069 .dev_has_feat = intel_iommu_dev_has_feat, 6070 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 6071 .dev_enable_feat = intel_iommu_dev_enable_feat, 6072 .dev_disable_feat = intel_iommu_dev_disable_feat, 6073 .is_attach_deferred = intel_iommu_is_attach_deferred, 6074 .def_domain_type = device_def_domain_type, 6075 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 6076 #ifdef CONFIG_INTEL_IOMMU_SVM 6077 .cache_invalidate = intel_iommu_sva_invalidate, 6078 .sva_bind_gpasid = intel_svm_bind_gpasid, 6079 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 6080 .sva_bind = intel_svm_bind, 6081 .sva_unbind = intel_svm_unbind, 6082 .sva_get_pasid = intel_svm_get_pasid, 6083 #endif 6084 }; 6085 6086 static void quirk_iommu_igfx(struct pci_dev *dev) 6087 { 6088 if (risky_device(dev)) 6089 return; 6090 6091 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); 6092 dmar_map_gfx = 0; 6093 } 6094 6095 /* G4x/GM45 integrated gfx dmar support is totally busted. 
*/ 6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); 6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); 6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); 6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); 6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); 6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); 6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); 6103 6104 /* Broadwell igfx malfunctions with dmar */ 6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); 6106 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); 6107 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); 6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); 6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); 6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); 6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); 6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); 6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); 6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); 6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); 6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); 6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); 6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); 6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); 6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); 6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); 6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); 6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); 6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); 6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); 6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); 6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); 6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); 6129 6130 static void quirk_iommu_rwbf(struct pci_dev *dev) 6131 { 6132 if (risky_device(dev)) 6133 return; 6134 6135 /* 6136 * Mobile 4 Series Chipset neglects to set RWBF capability, 6137 * but needs it. Same seems to hold for the desktop versions. 
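 * Setting rwbf_quirk below makes iommu_flush_write_buffer() perform the
 * flush even though the capability register does not advertise RWBF;
 * roughly, it short-circuits a check of the form:
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;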
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
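
/*
 * GGC appears to be a graphics control word in the config space of the
 * host bridges quirked below; the field masked by GGC_MEMORY_SIZE_MASK
 * (bits 11:8) encodes how much stolen memory the BIOS reserved for the
 * GTT and whether one of the VT-enabled sizes (*_VT above) was chosen.
 * quirk_calpella_no_shadow_gtt() reads it to decide whether the IOMMU
 * can safely remap graphics DMA at all.
 */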
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}
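
/*
 * Note: IDENTMAP_AZALIA set above is honoured when a device's default
 * domain type is chosen (see device_def_domain_type() in the ops table),
 * putting the Azalia codec into an identity-mapped domain as the
 * workaround for the missing isoch TLB space warned about here.
 */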