// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <linux/swiotlb.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "../irq_remapping.h"
#include "pasid.h"

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
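
/*
 * Illustrative note (not in the original source): ~0xFFFUL leaves every
 * bit from bit 12 upward set, so the bitmap above advertises 4KiB, 8KiB,
 * 16KiB, ... i.e. every power-of-two size that is a multiple of 4KiB,
 * matching the "all page sizes that are an order of 4KiB" behaviour
 * described in the comment above.
 */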

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}
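
/*
 * Illustrative note, inferred from the scalable-mode handling in
 * iommu_context_addr() below: when scalable mode is enabled, the lower
 * half of a root entry (LCTP) covers devfn 0x00-0x7f and the upper half
 * (UCTP) covers devfn 0x80-0xff, which is why both accessors exist.
 */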

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
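
/*
 * Illustrative summary, derived from the helpers above (legacy mode
 * context entry):
 *	lo[0]		present
 *	lo[1]		fault processing disable (cleared by
 *			context_set_fault_enable())
 *	lo[3:2]		translation type
 *	lo[63:12]	address root (page-table pointer)
 *	hi[2:0]		address width
 *	hi[23:8]	domain id
 */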

/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_no_bounce;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}
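
/*
 * Illustrative note (an assumption based on attach_deferred() further
 * down): DEFER_DEVICE_DOMAIN_INFO is a sentinel stored in the device's
 * iommu private data meaning "domain attachment is deferred", so
 * get_domain_info() deliberately reports such a device as having no
 * info yet.
 */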

DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
				to_pci_dev(d)->untrusted)

/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		} else if (!strncmp(str, "nobounce", 8)) {
			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
			intel_no_bounce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
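
/*
 * Illustrative usage (not from the original source): the parser above
 * takes a comma-separated option list on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, turns on scalable mode and disables batched
 * IOTLB flushing.
 */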

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}

void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
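
/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57,
 * width_to_agaw(57) == DIV_ROUND_UP(57 - 30, 9) == 3, i.e. a 5-level
 * table. If an iommu's SAGAW capability does not advertise agaw 3, the
 * loop in __iommu_calculate_agaw() falls back to agaw 2, which is
 * agaw_to_width(2) == 48 bits and agaw_to_level(2) == 4 levels.
 */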

/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage)
		return 0;

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;

	assert_spin_locked(&device_domain_lock);

	if (list_empty(&domain->devices))
		return NUMA_NO_NODE;

	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev)
			continue;

		/*
		 * There could possibly be multiple device numa nodes as
		 * devices within the same domain may sit behind different
		 * IOMMUs. There isn't a perfect answer in such a situation,
		 * so we select a first-come-first-served policy.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}
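
/*
 * Illustrative note on domain_update_iommu_superpage() above: mask
 * starts as 0x3 (bit 0 = 2MiB, bit 1 = 1GiB super-pages, per the VT-d
 * capability layout). If every active iommu supports both, fls(0x3) == 2
 * is returned; a unit without 1GiB support (or, for first-level use,
 * without FL1GP) reduces this, and a unit with no super-page support
 * makes it return 0.
 */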

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);
}

struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

static bool attach_deferred(struct device *dev)
{
	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
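
/*
 * Example (illustrative): if the bridge's subordinate bus range is
 * [0x04, 0x07], a device on bus 0x05 is reported as downstream of it,
 * while a device on bus 0x08 is not.
 */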

static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}

static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}
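
/*
 * Illustrative note: an IOMMU is treated as "dummy" either because its
 * DRHD unit is marked ignored or because of the Sandy Bridge IOAT
 * firmware quirk above; device_to_iommu() below then reports no IOMMU
 * for the device, so it runs untranslated.
 */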

struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs.
*/ 1016 free_pgtable_page(tmp_page); 1017 else 1018 domain_flush_cache(domain, pte, sizeof(*pte)); 1019 } 1020 if (level == 1) 1021 break; 1022 1023 parent = phys_to_virt(dma_pte_addr(pte)); 1024 level--; 1025 } 1026 1027 if (!*target_level) 1028 *target_level = level; 1029 1030 return pte; 1031 } 1032 1033 /* return address's pte at specific level */ 1034 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, 1035 unsigned long pfn, 1036 int level, int *large_page) 1037 { 1038 struct dma_pte *parent, *pte; 1039 int total = agaw_to_level(domain->agaw); 1040 int offset; 1041 1042 parent = domain->pgd; 1043 while (level <= total) { 1044 offset = pfn_level_offset(pfn, total); 1045 pte = &parent[offset]; 1046 if (level == total) 1047 return pte; 1048 1049 if (!dma_pte_present(pte)) { 1050 *large_page = total; 1051 break; 1052 } 1053 1054 if (dma_pte_superpage(pte)) { 1055 *large_page = total; 1056 return pte; 1057 } 1058 1059 parent = phys_to_virt(dma_pte_addr(pte)); 1060 total--; 1061 } 1062 return NULL; 1063 } 1064 1065 /* clear last level pte, a tlb flush should be followed */ 1066 static void dma_pte_clear_range(struct dmar_domain *domain, 1067 unsigned long start_pfn, 1068 unsigned long last_pfn) 1069 { 1070 unsigned int large_page; 1071 struct dma_pte *first_pte, *pte; 1072 1073 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1074 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1075 BUG_ON(start_pfn > last_pfn); 1076 1077 /* we don't need lock here; nobody else touches the iova range */ 1078 do { 1079 large_page = 1; 1080 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); 1081 if (!pte) { 1082 start_pfn = align_to_level(start_pfn + 1, large_page + 1); 1083 continue; 1084 } 1085 do { 1086 dma_clear_pte(pte); 1087 start_pfn += lvl_to_nr_pages(large_page); 1088 pte++; 1089 } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); 1090 1091 domain_flush_cache(domain, first_pte, 1092 (void *)pte - (void *)first_pte); 1093 1094 } while (start_pfn && start_pfn <= last_pfn); 1095 } 1096 1097 static void dma_pte_free_level(struct dmar_domain *domain, int level, 1098 int retain_level, struct dma_pte *pte, 1099 unsigned long pfn, unsigned long start_pfn, 1100 unsigned long last_pfn) 1101 { 1102 pfn = max(start_pfn, pfn); 1103 pte = &pte[pfn_level_offset(pfn, level)]; 1104 1105 do { 1106 unsigned long level_pfn; 1107 struct dma_pte *level_pte; 1108 1109 if (!dma_pte_present(pte) || dma_pte_superpage(pte)) 1110 goto next; 1111 1112 level_pfn = pfn & level_mask(level); 1113 level_pte = phys_to_virt(dma_pte_addr(pte)); 1114 1115 if (level > 2) { 1116 dma_pte_free_level(domain, level - 1, retain_level, 1117 level_pte, level_pfn, start_pfn, 1118 last_pfn); 1119 } 1120 1121 /* 1122 * Free the page table if we're below the level we want to 1123 * retain and the range covers the entire table. 1124 */ 1125 if (level < retain_level && !(start_pfn > level_pfn || 1126 last_pfn < level_pfn + level_size(level) - 1)) { 1127 dma_clear_pte(pte); 1128 domain_flush_cache(domain, pte, sizeof(*pte)); 1129 free_pgtable_page(level_pte); 1130 } 1131 next: 1132 pfn += level_size(level); 1133 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1134 } 1135 1136 /* 1137 * clear last level (leaf) ptes and free page table pages below the 1138 * level we wish to keep intact. 
1139 */ 1140 static void dma_pte_free_pagetable(struct dmar_domain *domain, 1141 unsigned long start_pfn, 1142 unsigned long last_pfn, 1143 int retain_level) 1144 { 1145 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1146 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1147 BUG_ON(start_pfn > last_pfn); 1148 1149 dma_pte_clear_range(domain, start_pfn, last_pfn); 1150 1151 /* We don't need lock here; nobody else touches the iova range */ 1152 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, 1153 domain->pgd, 0, start_pfn, last_pfn); 1154 1155 /* free pgd */ 1156 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1157 free_pgtable_page(domain->pgd); 1158 domain->pgd = NULL; 1159 } 1160 } 1161 1162 /* When a page at a given level is being unlinked from its parent, we don't 1163 need to *modify* it at all. All we need to do is make a list of all the 1164 pages which can be freed just as soon as we've flushed the IOTLB and we 1165 know the hardware page-walk will no longer touch them. 1166 The 'pte' argument is the *parent* PTE, pointing to the page that is to 1167 be freed. */ 1168 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, 1169 int level, struct dma_pte *pte, 1170 struct page *freelist) 1171 { 1172 struct page *pg; 1173 1174 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); 1175 pg->freelist = freelist; 1176 freelist = pg; 1177 1178 if (level == 1) 1179 return freelist; 1180 1181 pte = page_address(pg); 1182 do { 1183 if (dma_pte_present(pte) && !dma_pte_superpage(pte)) 1184 freelist = dma_pte_list_pagetables(domain, level - 1, 1185 pte, freelist); 1186 pte++; 1187 } while (!first_pte_in_page(pte)); 1188 1189 return freelist; 1190 } 1191 1192 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, 1193 struct dma_pte *pte, unsigned long pfn, 1194 unsigned long start_pfn, 1195 unsigned long last_pfn, 1196 struct page *freelist) 1197 { 1198 struct dma_pte *first_pte = NULL, *last_pte = NULL; 1199 1200 pfn = max(start_pfn, pfn); 1201 pte = &pte[pfn_level_offset(pfn, level)]; 1202 1203 do { 1204 unsigned long level_pfn; 1205 1206 if (!dma_pte_present(pte)) 1207 goto next; 1208 1209 level_pfn = pfn & level_mask(level); 1210 1211 /* If range covers entire pagetable, free it */ 1212 if (start_pfn <= level_pfn && 1213 last_pfn >= level_pfn + level_size(level) - 1) { 1214 /* These suborbinate page tables are going away entirely. Don't 1215 bother to clear them; we're just going to *free* them. */ 1216 if (level > 1 && !dma_pte_superpage(pte)) 1217 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist); 1218 1219 dma_clear_pte(pte); 1220 if (!first_pte) 1221 first_pte = pte; 1222 last_pte = pte; 1223 } else if (level > 1) { 1224 /* Recurse down into a level that isn't *entirely* obsolete */ 1225 freelist = dma_pte_clear_level(domain, level - 1, 1226 phys_to_virt(dma_pte_addr(pte)), 1227 level_pfn, start_pfn, last_pfn, 1228 freelist); 1229 } 1230 next: 1231 pfn += level_size(level); 1232 } while (!first_pte_in_page(++pte) && pfn <= last_pfn); 1233 1234 if (first_pte) 1235 domain_flush_cache(domain, first_pte, 1236 (void *)++last_pte - (void *)first_pte); 1237 1238 return freelist; 1239 } 1240 1241 /* We can't just free the pages because the IOMMU may still be walking 1242 the page tables, and may have cached the intermediate levels. The 1243 pages can only be freed after the IOTLB flush has been done. 
*/ 1244 static struct page *domain_unmap(struct dmar_domain *domain, 1245 unsigned long start_pfn, 1246 unsigned long last_pfn) 1247 { 1248 struct page *freelist; 1249 1250 BUG_ON(!domain_pfn_supported(domain, start_pfn)); 1251 BUG_ON(!domain_pfn_supported(domain, last_pfn)); 1252 BUG_ON(start_pfn > last_pfn); 1253 1254 /* we don't need lock here; nobody else touches the iova range */ 1255 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw), 1256 domain->pgd, 0, start_pfn, last_pfn, NULL); 1257 1258 /* free pgd */ 1259 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { 1260 struct page *pgd_page = virt_to_page(domain->pgd); 1261 pgd_page->freelist = freelist; 1262 freelist = pgd_page; 1263 1264 domain->pgd = NULL; 1265 } 1266 1267 return freelist; 1268 } 1269 1270 static void dma_free_pagelist(struct page *freelist) 1271 { 1272 struct page *pg; 1273 1274 while ((pg = freelist)) { 1275 freelist = pg->freelist; 1276 free_pgtable_page(page_address(pg)); 1277 } 1278 } 1279 1280 static void iova_entry_free(unsigned long data) 1281 { 1282 struct page *freelist = (struct page *)data; 1283 1284 dma_free_pagelist(freelist); 1285 } 1286 1287 /* iommu handling */ 1288 static int iommu_alloc_root_entry(struct intel_iommu *iommu) 1289 { 1290 struct root_entry *root; 1291 unsigned long flags; 1292 1293 root = (struct root_entry *)alloc_pgtable_page(iommu->node); 1294 if (!root) { 1295 pr_err("Allocating root entry for %s failed\n", 1296 iommu->name); 1297 return -ENOMEM; 1298 } 1299 1300 __iommu_flush_cache(iommu, root, ROOT_SIZE); 1301 1302 spin_lock_irqsave(&iommu->lock, flags); 1303 iommu->root_entry = root; 1304 spin_unlock_irqrestore(&iommu->lock, flags); 1305 1306 return 0; 1307 } 1308 1309 static void iommu_set_root_entry(struct intel_iommu *iommu) 1310 { 1311 u64 addr; 1312 u32 sts; 1313 unsigned long flag; 1314 1315 addr = virt_to_phys(iommu->root_entry); 1316 if (sm_supported(iommu)) 1317 addr |= DMA_RTADDR_SMT; 1318 1319 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1320 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 1321 1322 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 1323 1324 /* Make sure hardware complete it */ 1325 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1326 readl, (sts & DMA_GSTS_RTPS), sts); 1327 1328 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1329 } 1330 1331 void iommu_flush_write_buffer(struct intel_iommu *iommu) 1332 { 1333 u32 val; 1334 unsigned long flag; 1335 1336 if (!rwbf_quirk && !cap_rwbf(iommu->cap)) 1337 return; 1338 1339 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1340 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); 1341 1342 /* Make sure hardware complete it */ 1343 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1344 readl, (!(val & DMA_GSTS_WBFS)), val); 1345 1346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1347 } 1348 1349 /* return value determine if we need a write buffer flush */ 1350 static void __iommu_flush_context(struct intel_iommu *iommu, 1351 u16 did, u16 source_id, u8 function_mask, 1352 u64 type) 1353 { 1354 u64 val = 0; 1355 unsigned long flag; 1356 1357 switch (type) { 1358 case DMA_CCMD_GLOBAL_INVL: 1359 val = DMA_CCMD_GLOBAL_INVL; 1360 break; 1361 case DMA_CCMD_DOMAIN_INVL: 1362 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); 1363 break; 1364 case DMA_CCMD_DEVICE_INVL: 1365 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) 1366 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); 1367 break; 1368 default: 1369 BUG(); 1370 } 1371 val |= DMA_CCMD_ICC; 1372 1373 
raw_spin_lock_irqsave(&iommu->register_lock, flag); 1374 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 1375 1376 /* Make sure hardware complete it */ 1377 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 1378 dmar_readq, (!(val & DMA_CCMD_ICC)), val); 1379 1380 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1381 } 1382 1383 /* return value determine if we need a write buffer flush */ 1384 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, 1385 u64 addr, unsigned int size_order, u64 type) 1386 { 1387 int tlb_offset = ecap_iotlb_offset(iommu->ecap); 1388 u64 val = 0, val_iva = 0; 1389 unsigned long flag; 1390 1391 switch (type) { 1392 case DMA_TLB_GLOBAL_FLUSH: 1393 /* global flush doesn't need set IVA_REG */ 1394 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; 1395 break; 1396 case DMA_TLB_DSI_FLUSH: 1397 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1398 break; 1399 case DMA_TLB_PSI_FLUSH: 1400 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); 1401 /* IH bit is passed in as part of address */ 1402 val_iva = size_order | addr; 1403 break; 1404 default: 1405 BUG(); 1406 } 1407 /* Note: set drain read/write */ 1408 #if 0 1409 /* 1410 * This is probably to be super secure.. Looks like we can 1411 * ignore it without any impact. 1412 */ 1413 if (cap_read_drain(iommu->cap)) 1414 val |= DMA_TLB_READ_DRAIN; 1415 #endif 1416 if (cap_write_drain(iommu->cap)) 1417 val |= DMA_TLB_WRITE_DRAIN; 1418 1419 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1420 /* Note: Only uses first TLB reg currently */ 1421 if (val_iva) 1422 dmar_writeq(iommu->reg + tlb_offset, val_iva); 1423 dmar_writeq(iommu->reg + tlb_offset + 8, val); 1424 1425 /* Make sure hardware complete it */ 1426 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 1427 dmar_readq, (!(val & DMA_TLB_IVT)), val); 1428 1429 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1430 1431 /* check IOTLB invalidation granularity */ 1432 if (DMA_TLB_IAIG(val) == 0) 1433 pr_err("Flush IOTLB failed\n"); 1434 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) 1435 pr_debug("TLB flush request %Lx, actual %Lx\n", 1436 (unsigned long long)DMA_TLB_IIRG(type), 1437 (unsigned long long)DMA_TLB_IAIG(val)); 1438 } 1439 1440 static struct device_domain_info * 1441 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, 1442 u8 bus, u8 devfn) 1443 { 1444 struct device_domain_info *info; 1445 1446 assert_spin_locked(&device_domain_lock); 1447 1448 if (!iommu->qi) 1449 return NULL; 1450 1451 list_for_each_entry(info, &domain->devices, link) 1452 if (info->iommu == iommu && info->bus == bus && 1453 info->devfn == devfn) { 1454 if (info->ats_supported && info->dev) 1455 return info; 1456 break; 1457 } 1458 1459 return NULL; 1460 } 1461 1462 static void domain_update_iotlb(struct dmar_domain *domain) 1463 { 1464 struct device_domain_info *info; 1465 bool has_iotlb_device = false; 1466 1467 assert_spin_locked(&device_domain_lock); 1468 1469 list_for_each_entry(info, &domain->devices, link) { 1470 struct pci_dev *pdev; 1471 1472 if (!info->dev || !dev_is_pci(info->dev)) 1473 continue; 1474 1475 pdev = to_pci_dev(info->dev); 1476 if (pdev->ats_enabled) { 1477 has_iotlb_device = true; 1478 break; 1479 } 1480 } 1481 1482 domain->has_iotlb_device = has_iotlb_device; 1483 } 1484 1485 static void iommu_enable_dev_iotlb(struct device_domain_info *info) 1486 { 1487 struct pci_dev *pdev; 1488 1489 assert_spin_locked(&device_domain_lock); 1490 1491 if (!info || !dev_is_pci(info->dev)) 1492 return; 1493 1494 pdev = to_pci_dev(info->dev); 1495 /* For IOMMU 
that supports device IOTLB throttling (DIT), we assign 1496 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge 1497 * queue depth at PF level. If DIT is not set, PFSID will be treated as 1498 * reserved, which should be set to 0. 1499 */ 1500 if (!ecap_dit(info->iommu->ecap)) 1501 info->pfsid = 0; 1502 else { 1503 struct pci_dev *pf_pdev; 1504 1505 /* pdev will be returned if device is not a vf */ 1506 pf_pdev = pci_physfn(pdev); 1507 info->pfsid = pci_dev_id(pf_pdev); 1508 } 1509 1510 #ifdef CONFIG_INTEL_IOMMU_SVM 1511 /* The PCIe spec, in its wisdom, declares that the behaviour of 1512 the device if you enable PASID support after ATS support is 1513 undefined. So always enable PASID support on devices which 1514 have it, even if we can't yet know if we're ever going to 1515 use it. */ 1516 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) 1517 info->pasid_enabled = 1; 1518 1519 if (info->pri_supported && 1520 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && 1521 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) 1522 info->pri_enabled = 1; 1523 #endif 1524 if (info->ats_supported && pci_ats_page_aligned(pdev) && 1525 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { 1526 info->ats_enabled = 1; 1527 domain_update_iotlb(info->domain); 1528 info->ats_qdep = pci_ats_queue_depth(pdev); 1529 } 1530 } 1531 1532 static void iommu_disable_dev_iotlb(struct device_domain_info *info) 1533 { 1534 struct pci_dev *pdev; 1535 1536 assert_spin_locked(&device_domain_lock); 1537 1538 if (!dev_is_pci(info->dev)) 1539 return; 1540 1541 pdev = to_pci_dev(info->dev); 1542 1543 if (info->ats_enabled) { 1544 pci_disable_ats(pdev); 1545 info->ats_enabled = 0; 1546 domain_update_iotlb(info->domain); 1547 } 1548 #ifdef CONFIG_INTEL_IOMMU_SVM 1549 if (info->pri_enabled) { 1550 pci_disable_pri(pdev); 1551 info->pri_enabled = 0; 1552 } 1553 if (info->pasid_enabled) { 1554 pci_disable_pasid(pdev); 1555 info->pasid_enabled = 0; 1556 } 1557 #endif 1558 } 1559 1560 static void iommu_flush_dev_iotlb(struct dmar_domain *domain, 1561 u64 addr, unsigned mask) 1562 { 1563 u16 sid, qdep; 1564 unsigned long flags; 1565 struct device_domain_info *info; 1566 1567 if (!domain->has_iotlb_device) 1568 return; 1569 1570 spin_lock_irqsave(&device_domain_lock, flags); 1571 list_for_each_entry(info, &domain->devices, link) { 1572 if (!info->ats_enabled) 1573 continue; 1574 1575 sid = info->bus << 8 | info->devfn; 1576 qdep = info->ats_qdep; 1577 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, 1578 qdep, addr, mask); 1579 } 1580 spin_unlock_irqrestore(&device_domain_lock, flags); 1581 } 1582 1583 static void domain_flush_piotlb(struct intel_iommu *iommu, 1584 struct dmar_domain *domain, 1585 u64 addr, unsigned long npages, bool ih) 1586 { 1587 u16 did = domain->iommu_did[iommu->seq_id]; 1588 1589 if (domain->default_pasid) 1590 qi_flush_piotlb(iommu, did, domain->default_pasid, 1591 addr, npages, ih); 1592 1593 if (!list_empty(&domain->devices)) 1594 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); 1595 } 1596 1597 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, 1598 struct dmar_domain *domain, 1599 unsigned long pfn, unsigned int pages, 1600 int ih, int map) 1601 { 1602 unsigned int mask = ilog2(__roundup_pow_of_two(pages)); 1603 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT; 1604 u16 did = domain->iommu_did[iommu->seq_id]; 1605 1606 BUG_ON(pages == 0); 1607 1608 if (ih) 1609 ih = 1 << 6; 1610 1611 if (domain_use_first_level(domain)) { 1612 
domain_flush_piotlb(iommu, domain, addr, pages, ih); 1613 } else { 1614 /* 1615 * Fallback to domain selective flush if no PSI support or 1616 * the size is too big. PSI requires page size to be 2 ^ x, 1617 * and the base address is naturally aligned to the size. 1618 */ 1619 if (!cap_pgsel_inv(iommu->cap) || 1620 mask > cap_max_amask_val(iommu->cap)) 1621 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1622 DMA_TLB_DSI_FLUSH); 1623 else 1624 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, 1625 DMA_TLB_PSI_FLUSH); 1626 } 1627 1628 /* 1629 * In caching mode, changes of pages from non-present to present require 1630 * flush. However, device IOTLB doesn't need to be flushed in this case. 1631 */ 1632 if (!cap_caching_mode(iommu->cap) || !map) 1633 iommu_flush_dev_iotlb(domain, addr, mask); 1634 } 1635 1636 /* Notification for newly created mappings */ 1637 static inline void __mapping_notify_one(struct intel_iommu *iommu, 1638 struct dmar_domain *domain, 1639 unsigned long pfn, unsigned int pages) 1640 { 1641 /* 1642 * It's a non-present to present mapping. Only flush if caching mode 1643 * and second level. 1644 */ 1645 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) 1646 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); 1647 else 1648 iommu_flush_write_buffer(iommu); 1649 } 1650 1651 static void iommu_flush_iova(struct iova_domain *iovad) 1652 { 1653 struct dmar_domain *domain; 1654 int idx; 1655 1656 domain = container_of(iovad, struct dmar_domain, iovad); 1657 1658 for_each_domain_iommu(idx, domain) { 1659 struct intel_iommu *iommu = g_iommus[idx]; 1660 u16 did = domain->iommu_did[iommu->seq_id]; 1661 1662 if (domain_use_first_level(domain)) 1663 domain_flush_piotlb(iommu, domain, 0, -1, 0); 1664 else 1665 iommu->flush.flush_iotlb(iommu, did, 0, 0, 1666 DMA_TLB_DSI_FLUSH); 1667 1668 if (!cap_caching_mode(iommu->cap)) 1669 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), 1670 0, MAX_AGAW_PFN_WIDTH); 1671 } 1672 } 1673 1674 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) 1675 { 1676 u32 pmen; 1677 unsigned long flags; 1678 1679 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) 1680 return; 1681 1682 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1683 pmen = readl(iommu->reg + DMAR_PMEN_REG); 1684 pmen &= ~DMA_PMEN_EPM; 1685 writel(pmen, iommu->reg + DMAR_PMEN_REG); 1686 1687 /* wait for the protected region status bit to clear */ 1688 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, 1689 readl, !(pmen & DMA_PMEN_PRS), pmen); 1690 1691 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1692 } 1693 1694 static void iommu_enable_translation(struct intel_iommu *iommu) 1695 { 1696 u32 sts; 1697 unsigned long flags; 1698 1699 raw_spin_lock_irqsave(&iommu->register_lock, flags); 1700 iommu->gcmd |= DMA_GCMD_TE; 1701 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1702 1703 /* Make sure hardware complete it */ 1704 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1705 readl, (sts & DMA_GSTS_TES), sts); 1706 1707 raw_spin_unlock_irqrestore(&iommu->register_lock, flags); 1708 } 1709 1710 static void iommu_disable_translation(struct intel_iommu *iommu) 1711 { 1712 u32 sts; 1713 unsigned long flag; 1714 1715 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && 1716 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) 1717 return; 1718 1719 raw_spin_lock_irqsave(&iommu->register_lock, flag); 1720 iommu->gcmd &= ~DMA_GCMD_TE; 1721 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); 1722 1723 /* Make sure hardware complete it */ 1724 
IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, 1725 readl, (!(sts & DMA_GSTS_TES)), sts); 1726 1727 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 1728 } 1729 1730 static int iommu_init_domains(struct intel_iommu *iommu) 1731 { 1732 u32 ndomains, nlongs; 1733 size_t size; 1734 1735 ndomains = cap_ndoms(iommu->cap); 1736 pr_debug("%s: Number of Domains supported <%d>\n", 1737 iommu->name, ndomains); 1738 nlongs = BITS_TO_LONGS(ndomains); 1739 1740 spin_lock_init(&iommu->lock); 1741 1742 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); 1743 if (!iommu->domain_ids) { 1744 pr_err("%s: Allocating domain id array failed\n", 1745 iommu->name); 1746 return -ENOMEM; 1747 } 1748 1749 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); 1750 iommu->domains = kzalloc(size, GFP_KERNEL); 1751 1752 if (iommu->domains) { 1753 size = 256 * sizeof(struct dmar_domain *); 1754 iommu->domains[0] = kzalloc(size, GFP_KERNEL); 1755 } 1756 1757 if (!iommu->domains || !iommu->domains[0]) { 1758 pr_err("%s: Allocating domain array failed\n", 1759 iommu->name); 1760 kfree(iommu->domain_ids); 1761 kfree(iommu->domains); 1762 iommu->domain_ids = NULL; 1763 iommu->domains = NULL; 1764 return -ENOMEM; 1765 } 1766 1767 /* 1768 * If Caching mode is set, then invalid translations are tagged 1769 * with domain-id 0, hence we need to pre-allocate it. We also 1770 * use domain-id 0 as a marker for non-allocated domain-id, so 1771 * make sure it is not used for a real domain. 1772 */ 1773 set_bit(0, iommu->domain_ids); 1774 1775 /* 1776 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid 1777 * entry for first-level or pass-through translation modes should 1778 * be programmed with a domain id different from those used for 1779 * second-level or nested translation. We reserve a domain id for 1780 * this purpose. 1781 */ 1782 if (sm_supported(iommu)) 1783 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); 1784 1785 return 0; 1786 } 1787 1788 static void disable_dmar_iommu(struct intel_iommu *iommu) 1789 { 1790 struct device_domain_info *info, *tmp; 1791 unsigned long flags; 1792 1793 if (!iommu->domains || !iommu->domain_ids) 1794 return; 1795 1796 spin_lock_irqsave(&device_domain_lock, flags); 1797 list_for_each_entry_safe(info, tmp, &device_domain_list, global) { 1798 if (info->iommu != iommu) 1799 continue; 1800 1801 if (!info->dev || !info->domain) 1802 continue; 1803 1804 __dmar_remove_one_dev_info(info); 1805 } 1806 spin_unlock_irqrestore(&device_domain_lock, flags); 1807 1808 if (iommu->gcmd & DMA_GCMD_TE) 1809 iommu_disable_translation(iommu); 1810 } 1811 1812 static void free_dmar_iommu(struct intel_iommu *iommu) 1813 { 1814 if ((iommu->domains) && (iommu->domain_ids)) { 1815 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; 1816 int i; 1817 1818 for (i = 0; i < elems; i++) 1819 kfree(iommu->domains[i]); 1820 kfree(iommu->domains); 1821 kfree(iommu->domain_ids); 1822 iommu->domains = NULL; 1823 iommu->domain_ids = NULL; 1824 } 1825 1826 g_iommus[iommu->seq_id] = NULL; 1827 1828 /* free context mapping */ 1829 free_context_table(iommu); 1830 1831 #ifdef CONFIG_INTEL_IOMMU_SVM 1832 if (pasid_supported(iommu)) { 1833 if (ecap_prs(iommu->ecap)) 1834 intel_svm_finish_prq(iommu); 1835 } 1836 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) 1837 ioasid_unregister_allocator(&iommu->pasid_allocator); 1838 1839 #endif 1840 } 1841 1842 /* 1843 * Check and return whether first level is used by default for 1844 * DMA translation. 
1845 */ 1846 static bool first_level_by_default(void) 1847 { 1848 struct dmar_drhd_unit *drhd; 1849 struct intel_iommu *iommu; 1850 static int first_level_support = -1; 1851 1852 if (likely(first_level_support != -1)) 1853 return first_level_support; 1854 1855 first_level_support = 1; 1856 1857 rcu_read_lock(); 1858 for_each_active_iommu(iommu, drhd) { 1859 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { 1860 first_level_support = 0; 1861 break; 1862 } 1863 } 1864 rcu_read_unlock(); 1865 1866 return first_level_support; 1867 } 1868 1869 static struct dmar_domain *alloc_domain(int flags) 1870 { 1871 struct dmar_domain *domain; 1872 1873 domain = alloc_domain_mem(); 1874 if (!domain) 1875 return NULL; 1876 1877 memset(domain, 0, sizeof(*domain)); 1878 domain->nid = NUMA_NO_NODE; 1879 domain->flags = flags; 1880 if (first_level_by_default()) 1881 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; 1882 domain->has_iotlb_device = false; 1883 INIT_LIST_HEAD(&domain->devices); 1884 1885 return domain; 1886 } 1887 1888 /* Must be called with iommu->lock */ 1889 static int domain_attach_iommu(struct dmar_domain *domain, 1890 struct intel_iommu *iommu) 1891 { 1892 unsigned long ndomains; 1893 int num; 1894 1895 assert_spin_locked(&device_domain_lock); 1896 assert_spin_locked(&iommu->lock); 1897 1898 domain->iommu_refcnt[iommu->seq_id] += 1; 1899 domain->iommu_count += 1; 1900 if (domain->iommu_refcnt[iommu->seq_id] == 1) { 1901 ndomains = cap_ndoms(iommu->cap); 1902 num = find_first_zero_bit(iommu->domain_ids, ndomains); 1903 1904 if (num >= ndomains) { 1905 pr_err("%s: No free domain ids\n", iommu->name); 1906 domain->iommu_refcnt[iommu->seq_id] -= 1; 1907 domain->iommu_count -= 1; 1908 return -ENOSPC; 1909 } 1910 1911 set_bit(num, iommu->domain_ids); 1912 set_iommu_domain(iommu, num, domain); 1913 1914 domain->iommu_did[iommu->seq_id] = num; 1915 domain->nid = iommu->node; 1916 1917 domain_update_iommu_cap(domain); 1918 } 1919 1920 return 0; 1921 } 1922 1923 static int domain_detach_iommu(struct dmar_domain *domain, 1924 struct intel_iommu *iommu) 1925 { 1926 int num, count; 1927 1928 assert_spin_locked(&device_domain_lock); 1929 assert_spin_locked(&iommu->lock); 1930 1931 domain->iommu_refcnt[iommu->seq_id] -= 1; 1932 count = --domain->iommu_count; 1933 if (domain->iommu_refcnt[iommu->seq_id] == 0) { 1934 num = domain->iommu_did[iommu->seq_id]; 1935 clear_bit(num, iommu->domain_ids); 1936 set_iommu_domain(iommu, num, NULL); 1937 1938 domain_update_iommu_cap(domain); 1939 domain->iommu_did[iommu->seq_id] = 0; 1940 } 1941 1942 return count; 1943 } 1944 1945 static struct iova_domain reserved_iova_list; 1946 static struct lock_class_key reserved_rbtree_key; 1947 1948 static int dmar_init_reserved_ranges(void) 1949 { 1950 struct pci_dev *pdev = NULL; 1951 struct iova *iova; 1952 int i; 1953 1954 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN); 1955 1956 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, 1957 &reserved_rbtree_key); 1958 1959 /* IOAPIC ranges shouldn't be accessed by DMA */ 1960 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START), 1961 IOVA_PFN(IOAPIC_RANGE_END)); 1962 if (!iova) { 1963 pr_err("Reserve IOAPIC range failed\n"); 1964 return -ENODEV; 1965 } 1966 1967 /* Reserve all PCI MMIO to avoid peer-to-peer access */ 1968 for_each_pci_dev(pdev) { 1969 struct resource *r; 1970 1971 for (i = 0; i < PCI_NUM_RESOURCES; i++) { 1972 r = &pdev->resource[i]; 1973 if (!r->flags || !(r->flags & IORESOURCE_MEM)) 1974 continue; 1975 iova = 
reserve_iova(&reserved_iova_list, 1976 IOVA_PFN(r->start), 1977 IOVA_PFN(r->end)); 1978 if (!iova) { 1979 pci_err(pdev, "Reserve iova for %pR failed\n", r); 1980 return -ENODEV; 1981 } 1982 } 1983 } 1984 return 0; 1985 } 1986 1987 static inline int guestwidth_to_adjustwidth(int gaw) 1988 { 1989 int agaw; 1990 int r = (gaw - 12) % 9; 1991 1992 if (r == 0) 1993 agaw = gaw; 1994 else 1995 agaw = gaw + 9 - r; 1996 if (agaw > 64) 1997 agaw = 64; 1998 return agaw; 1999 } 2000 2001 static void domain_exit(struct dmar_domain *domain) 2002 { 2003 2004 /* Remove associated devices and clear attached or cached domains */ 2005 domain_remove_dev_info(domain); 2006 2007 /* destroy iovas */ 2008 if (domain->domain.type == IOMMU_DOMAIN_DMA) 2009 put_iova_domain(&domain->iovad); 2010 2011 if (domain->pgd) { 2012 struct page *freelist; 2013 2014 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw)); 2015 dma_free_pagelist(freelist); 2016 } 2017 2018 free_domain_mem(domain); 2019 } 2020 2021 /* 2022 * Get the PASID directory size for scalable mode context entry. 2023 * Value of X in the PDTS field of a scalable mode context entry 2024 * indicates PASID directory with 2^(X + 7) entries. 2025 */ 2026 static inline unsigned long context_get_sm_pds(struct pasid_table *table) 2027 { 2028 int pds, max_pde; 2029 2030 max_pde = table->max_pasid >> PASID_PDE_SHIFT; 2031 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS); 2032 if (pds < 7) 2033 return 0; 2034 2035 return pds - 7; 2036 } 2037 2038 /* 2039 * Set the RID_PASID field of a scalable mode context entry. The 2040 * IOMMU hardware will use the PASID value set in this field for 2041 * DMA translations of DMA requests without PASID. 2042 */ 2043 static inline void 2044 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) 2045 { 2046 context->hi |= pasid & ((1 << 20) - 1); 2047 } 2048 2049 /* 2050 * Set the DTE(Device-TLB Enable) field of a scalable mode context 2051 * entry. 2052 */ 2053 static inline void context_set_sm_dte(struct context_entry *context) 2054 { 2055 context->lo |= (1 << 2); 2056 } 2057 2058 /* 2059 * Set the PRE(Page Request Enable) field of a scalable mode context 2060 * entry. 2061 */ 2062 static inline void context_set_sm_pre(struct context_entry *context) 2063 { 2064 context->lo |= (1 << 4); 2065 } 2066 2067 /* Convert value to context PASID directory size field coding. 
*/ 2068 #define context_pdts(pds) (((pds) & 0x7) << 9) 2069 2070 static int domain_context_mapping_one(struct dmar_domain *domain, 2071 struct intel_iommu *iommu, 2072 struct pasid_table *table, 2073 u8 bus, u8 devfn) 2074 { 2075 u16 did = domain->iommu_did[iommu->seq_id]; 2076 int translation = CONTEXT_TT_MULTI_LEVEL; 2077 struct device_domain_info *info = NULL; 2078 struct context_entry *context; 2079 unsigned long flags; 2080 int ret; 2081 2082 WARN_ON(did == 0); 2083 2084 if (hw_pass_through && domain_type_is_si(domain)) 2085 translation = CONTEXT_TT_PASS_THROUGH; 2086 2087 pr_debug("Set context mapping for %02x:%02x.%d\n", 2088 bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 2089 2090 BUG_ON(!domain->pgd); 2091 2092 spin_lock_irqsave(&device_domain_lock, flags); 2093 spin_lock(&iommu->lock); 2094 2095 ret = -ENOMEM; 2096 context = iommu_context_addr(iommu, bus, devfn, 1); 2097 if (!context) 2098 goto out_unlock; 2099 2100 ret = 0; 2101 if (context_present(context)) 2102 goto out_unlock; 2103 2104 /* 2105 * For kdump cases, old valid entries may be cached due to the 2106 * in-flight DMA and copied pgtable, but there is no unmapping 2107 * behaviour for them, thus we need an explicit cache flush for 2108 * the newly-mapped device. For kdump, at this point, the device 2109 * is supposed to finish reset at its driver probe stage, so no 2110 * in-flight DMA will exist, and we don't need to worry anymore 2111 * hereafter. 2112 */ 2113 if (context_copied(context)) { 2114 u16 did_old = context_domain_id(context); 2115 2116 if (did_old < cap_ndoms(iommu->cap)) { 2117 iommu->flush.flush_context(iommu, did_old, 2118 (((u16)bus) << 8) | devfn, 2119 DMA_CCMD_MASK_NOBIT, 2120 DMA_CCMD_DEVICE_INVL); 2121 iommu->flush.flush_iotlb(iommu, did_old, 0, 0, 2122 DMA_TLB_DSI_FLUSH); 2123 } 2124 } 2125 2126 context_clear_entry(context); 2127 2128 if (sm_supported(iommu)) { 2129 unsigned long pds; 2130 2131 WARN_ON(!table); 2132 2133 /* Setup the PASID DIR pointer: */ 2134 pds = context_get_sm_pds(table); 2135 context->lo = (u64)virt_to_phys(table->table) | 2136 context_pdts(pds); 2137 2138 /* Setup the RID_PASID field: */ 2139 context_set_sm_rid2pasid(context, PASID_RID2PASID); 2140 2141 /* 2142 * Setup the Device-TLB enable bit and Page request 2143 * Enable bit: 2144 */ 2145 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2146 if (info && info->ats_supported) 2147 context_set_sm_dte(context); 2148 if (info && info->pri_supported) 2149 context_set_sm_pre(context); 2150 } else { 2151 struct dma_pte *pgd = domain->pgd; 2152 int agaw; 2153 2154 context_set_domain_id(context, did); 2155 2156 if (translation != CONTEXT_TT_PASS_THROUGH) { 2157 /* 2158 * Skip top levels of page tables for iommu which has 2159 * less agaw than default. Unnecessary for PT mode. 2160 */ 2161 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2162 ret = -ENOMEM; 2163 pgd = phys_to_virt(dma_pte_addr(pgd)); 2164 if (!dma_pte_present(pgd)) 2165 goto out_unlock; 2166 } 2167 2168 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); 2169 if (info && info->ats_supported) 2170 translation = CONTEXT_TT_DEV_IOTLB; 2171 else 2172 translation = CONTEXT_TT_MULTI_LEVEL; 2173 2174 context_set_address_root(context, virt_to_phys(pgd)); 2175 context_set_address_width(context, agaw); 2176 } else { 2177 /* 2178 * In pass through mode, AW must be programmed to 2179 * indicate the largest AGAW value supported by 2180 * hardware. And ASR is ignored by hardware. 
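 * iommu->msagaw caches that largest supported AGAW (it is derived
 * from the SAGAW capability field when the IOMMU is probed), which is
 * why it is programmed below rather than the domain's own agaw.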
2181 */ 2182 context_set_address_width(context, iommu->msagaw); 2183 } 2184 2185 context_set_translation_type(context, translation); 2186 } 2187 2188 context_set_fault_enable(context); 2189 context_set_present(context); 2190 if (!ecap_coherent(iommu->ecap)) 2191 clflush_cache_range(context, sizeof(*context)); 2192 2193 /* 2194 * It's a non-present to present mapping. If hardware doesn't cache 2195 * non-present entry we only need to flush the write-buffer. If the 2196 * _does_ cache non-present entries, then it does so in the special 2197 * domain #0, which we have to flush: 2198 */ 2199 if (cap_caching_mode(iommu->cap)) { 2200 iommu->flush.flush_context(iommu, 0, 2201 (((u16)bus) << 8) | devfn, 2202 DMA_CCMD_MASK_NOBIT, 2203 DMA_CCMD_DEVICE_INVL); 2204 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 2205 } else { 2206 iommu_flush_write_buffer(iommu); 2207 } 2208 iommu_enable_dev_iotlb(info); 2209 2210 ret = 0; 2211 2212 out_unlock: 2213 spin_unlock(&iommu->lock); 2214 spin_unlock_irqrestore(&device_domain_lock, flags); 2215 2216 return ret; 2217 } 2218 2219 struct domain_context_mapping_data { 2220 struct dmar_domain *domain; 2221 struct intel_iommu *iommu; 2222 struct pasid_table *table; 2223 }; 2224 2225 static int domain_context_mapping_cb(struct pci_dev *pdev, 2226 u16 alias, void *opaque) 2227 { 2228 struct domain_context_mapping_data *data = opaque; 2229 2230 return domain_context_mapping_one(data->domain, data->iommu, 2231 data->table, PCI_BUS_NUM(alias), 2232 alias & 0xff); 2233 } 2234 2235 static int 2236 domain_context_mapping(struct dmar_domain *domain, struct device *dev) 2237 { 2238 struct domain_context_mapping_data data; 2239 struct pasid_table *table; 2240 struct intel_iommu *iommu; 2241 u8 bus, devfn; 2242 2243 iommu = device_to_iommu(dev, &bus, &devfn); 2244 if (!iommu) 2245 return -ENODEV; 2246 2247 table = intel_pasid_get_table(dev); 2248 2249 if (!dev_is_pci(dev)) 2250 return domain_context_mapping_one(domain, iommu, table, 2251 bus, devfn); 2252 2253 data.domain = domain; 2254 data.iommu = iommu; 2255 data.table = table; 2256 2257 return pci_for_each_dma_alias(to_pci_dev(dev), 2258 &domain_context_mapping_cb, &data); 2259 } 2260 2261 static int domain_context_mapped_cb(struct pci_dev *pdev, 2262 u16 alias, void *opaque) 2263 { 2264 struct intel_iommu *iommu = opaque; 2265 2266 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); 2267 } 2268 2269 static int domain_context_mapped(struct device *dev) 2270 { 2271 struct intel_iommu *iommu; 2272 u8 bus, devfn; 2273 2274 iommu = device_to_iommu(dev, &bus, &devfn); 2275 if (!iommu) 2276 return -ENODEV; 2277 2278 if (!dev_is_pci(dev)) 2279 return device_context_mapped(iommu, bus, devfn); 2280 2281 return !pci_for_each_dma_alias(to_pci_dev(dev), 2282 domain_context_mapped_cb, iommu); 2283 } 2284 2285 /* Returns a number of VTD pages, but aligned to MM page size */ 2286 static inline unsigned long aligned_nrpages(unsigned long host_addr, 2287 size_t size) 2288 { 2289 host_addr &= ~PAGE_MASK; 2290 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; 2291 } 2292 2293 /* Return largest possible superpage level for a given mapping */ 2294 static inline int hardware_largepage_caps(struct dmar_domain *domain, 2295 unsigned long iov_pfn, 2296 unsigned long phy_pfn, 2297 unsigned long pages) 2298 { 2299 int support, level = 1; 2300 unsigned long pfnmerge; 2301 2302 support = domain->iommu_superpage; 2303 2304 /* To use a large page, the virtual *and* physical addresses 2305 must be aligned to 
2MiB/1GiB/etc. Lower bits set in either 2306 of them will mean we have to use smaller pages. So just 2307 merge them and check both at once. */ 2308 pfnmerge = iov_pfn | phy_pfn; 2309 2310 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { 2311 pages >>= VTD_STRIDE_SHIFT; 2312 if (!pages) 2313 break; 2314 pfnmerge >>= VTD_STRIDE_SHIFT; 2315 level++; 2316 support--; 2317 } 2318 return level; 2319 } 2320 2321 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2322 struct scatterlist *sg, unsigned long phys_pfn, 2323 unsigned long nr_pages, int prot) 2324 { 2325 struct dma_pte *first_pte = NULL, *pte = NULL; 2326 phys_addr_t pteval; 2327 unsigned long sg_res = 0; 2328 unsigned int largepage_lvl = 0; 2329 unsigned long lvl_pages = 0; 2330 u64 attr; 2331 2332 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); 2333 2334 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) 2335 return -EINVAL; 2336 2337 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); 2338 if (domain_use_first_level(domain)) 2339 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; 2340 2341 if (!sg) { 2342 sg_res = nr_pages; 2343 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; 2344 } 2345 2346 while (nr_pages > 0) { 2347 uint64_t tmp; 2348 2349 if (!sg_res) { 2350 unsigned int pgoff = sg->offset & ~PAGE_MASK; 2351 2352 sg_res = aligned_nrpages(sg->offset, sg->length); 2353 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; 2354 sg->dma_length = sg->length; 2355 pteval = (sg_phys(sg) - pgoff) | attr; 2356 phys_pfn = pteval >> VTD_PAGE_SHIFT; 2357 } 2358 2359 if (!pte) { 2360 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); 2361 2362 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); 2363 if (!pte) 2364 return -ENOMEM; 2365 /* It is large page*/ 2366 if (largepage_lvl > 1) { 2367 unsigned long nr_superpages, end_pfn; 2368 2369 pteval |= DMA_PTE_LARGE_PAGE; 2370 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2371 2372 nr_superpages = sg_res / lvl_pages; 2373 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; 2374 2375 /* 2376 * Ensure that old small page tables are 2377 * removed to make room for superpage(s). 2378 * We're adding new large pages, so make sure 2379 * we don't remove their parent tables. 2380 */ 2381 dma_pte_free_pagetable(domain, iov_pfn, end_pfn, 2382 largepage_lvl + 1); 2383 } else { 2384 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 2385 } 2386 2387 } 2388 /* We don't need lock here, nobody else 2389 * touches the iova range 2390 */ 2391 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval); 2392 if (tmp) { 2393 static int dumps = 5; 2394 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", 2395 iov_pfn, tmp, (unsigned long long)pteval); 2396 if (dumps) { 2397 dumps--; 2398 debug_dma_dump_mappings(NULL); 2399 } 2400 WARN_ON(1); 2401 } 2402 2403 lvl_pages = lvl_to_nr_pages(largepage_lvl); 2404 2405 BUG_ON(nr_pages < lvl_pages); 2406 BUG_ON(sg_res < lvl_pages); 2407 2408 nr_pages -= lvl_pages; 2409 iov_pfn += lvl_pages; 2410 phys_pfn += lvl_pages; 2411 pteval += lvl_pages * VTD_PAGE_SIZE; 2412 sg_res -= lvl_pages; 2413 2414 /* If the next PTE would be the first in a new page, then we 2415 need to flush the cache on the entries we've just written. 2416 And then we'll need to recalculate 'pte', so clear it and 2417 let it get set again in the if (!pte) block above. 2418 2419 If we're done (!nr_pages) we need to flush the cache too. 
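	   (domain_flush_cache() below is a no-op when every IOMMU attached
	   to the domain does coherent page-table walks; otherwise it
	   clflushes the just-written PTE range.)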
2420 2421 Also if we've been setting superpages, we may need to 2422 recalculate 'pte' and switch back to smaller pages for the 2423 end of the mapping, if the trailing size is not enough to 2424 use another superpage (i.e. sg_res < lvl_pages). */ 2425 pte++; 2426 if (!nr_pages || first_pte_in_page(pte) || 2427 (largepage_lvl > 1 && sg_res < lvl_pages)) { 2428 domain_flush_cache(domain, first_pte, 2429 (void *)pte - (void *)first_pte); 2430 pte = NULL; 2431 } 2432 2433 if (!sg_res && nr_pages) 2434 sg = sg_next(sg); 2435 } 2436 return 0; 2437 } 2438 2439 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2440 struct scatterlist *sg, unsigned long phys_pfn, 2441 unsigned long nr_pages, int prot) 2442 { 2443 int iommu_id, ret; 2444 struct intel_iommu *iommu; 2445 2446 /* Do the real mapping first */ 2447 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot); 2448 if (ret) 2449 return ret; 2450 2451 for_each_domain_iommu(iommu_id, domain) { 2452 iommu = g_iommus[iommu_id]; 2453 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); 2454 } 2455 2456 return 0; 2457 } 2458 2459 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2460 struct scatterlist *sg, unsigned long nr_pages, 2461 int prot) 2462 { 2463 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot); 2464 } 2465 2466 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn, 2467 unsigned long phys_pfn, unsigned long nr_pages, 2468 int prot) 2469 { 2470 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot); 2471 } 2472 2473 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) 2474 { 2475 unsigned long flags; 2476 struct context_entry *context; 2477 u16 did_old; 2478 2479 if (!iommu) 2480 return; 2481 2482 spin_lock_irqsave(&iommu->lock, flags); 2483 context = iommu_context_addr(iommu, bus, devfn, 0); 2484 if (!context) { 2485 spin_unlock_irqrestore(&iommu->lock, flags); 2486 return; 2487 } 2488 did_old = context_domain_id(context); 2489 context_clear_entry(context); 2490 __iommu_flush_cache(iommu, context, sizeof(*context)); 2491 spin_unlock_irqrestore(&iommu->lock, flags); 2492 iommu->flush.flush_context(iommu, 2493 did_old, 2494 (((u16)bus) << 8) | devfn, 2495 DMA_CCMD_MASK_NOBIT, 2496 DMA_CCMD_DEVICE_INVL); 2497 iommu->flush.flush_iotlb(iommu, 2498 did_old, 2499 0, 2500 0, 2501 DMA_TLB_DSI_FLUSH); 2502 } 2503 2504 static inline void unlink_domain_info(struct device_domain_info *info) 2505 { 2506 assert_spin_locked(&device_domain_lock); 2507 list_del(&info->link); 2508 list_del(&info->global); 2509 if (info->dev) 2510 dev_iommu_priv_set(info->dev, NULL); 2511 } 2512 2513 static void domain_remove_dev_info(struct dmar_domain *domain) 2514 { 2515 struct device_domain_info *info, *tmp; 2516 unsigned long flags; 2517 2518 spin_lock_irqsave(&device_domain_lock, flags); 2519 list_for_each_entry_safe(info, tmp, &domain->devices, link) 2520 __dmar_remove_one_dev_info(info); 2521 spin_unlock_irqrestore(&device_domain_lock, flags); 2522 } 2523 2524 struct dmar_domain *find_domain(struct device *dev) 2525 { 2526 struct device_domain_info *info; 2527 2528 if (unlikely(attach_deferred(dev))) 2529 return NULL; 2530 2531 /* No lock here, assumes no domain exit in normal case */ 2532 info = get_domain_info(dev); 2533 if (likely(info)) 2534 return info->domain; 2535 2536 return NULL; 2537 } 2538 2539 static void do_deferred_attach(struct device *dev) 2540 { 2541 struct iommu_domain *domain; 
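	/*
	 * Deferred attachment: clear the marker kept in the device's IOMMU
	 * private data (see attach_deferred()), then attach the device to
	 * the domain the IOMMU core has already chosen for it.
	 */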
2542 2543 dev_iommu_priv_set(dev, NULL); 2544 domain = iommu_get_domain_for_dev(dev); 2545 if (domain) 2546 intel_iommu_attach_device(domain, dev); 2547 } 2548 2549 static inline struct device_domain_info * 2550 dmar_search_domain_by_dev_info(int segment, int bus, int devfn) 2551 { 2552 struct device_domain_info *info; 2553 2554 list_for_each_entry(info, &device_domain_list, global) 2555 if (info->segment == segment && info->bus == bus && 2556 info->devfn == devfn) 2557 return info; 2558 2559 return NULL; 2560 } 2561 2562 static int domain_setup_first_level(struct intel_iommu *iommu, 2563 struct dmar_domain *domain, 2564 struct device *dev, 2565 u32 pasid) 2566 { 2567 int flags = PASID_FLAG_SUPERVISOR_MODE; 2568 struct dma_pte *pgd = domain->pgd; 2569 int agaw, level; 2570 2571 /* 2572 * Skip top levels of page tables for iommu which has 2573 * less agaw than default. Unnecessary for PT mode. 2574 */ 2575 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { 2576 pgd = phys_to_virt(dma_pte_addr(pgd)); 2577 if (!dma_pte_present(pgd)) 2578 return -ENOMEM; 2579 } 2580 2581 level = agaw_to_level(agaw); 2582 if (level != 4 && level != 5) 2583 return -EINVAL; 2584 2585 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; 2586 2587 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, 2588 domain->iommu_did[iommu->seq_id], 2589 flags); 2590 } 2591 2592 static bool dev_is_real_dma_subdevice(struct device *dev) 2593 { 2594 return dev && dev_is_pci(dev) && 2595 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); 2596 } 2597 2598 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, 2599 int bus, int devfn, 2600 struct device *dev, 2601 struct dmar_domain *domain) 2602 { 2603 struct dmar_domain *found = NULL; 2604 struct device_domain_info *info; 2605 unsigned long flags; 2606 int ret; 2607 2608 info = alloc_devinfo_mem(); 2609 if (!info) 2610 return NULL; 2611 2612 if (!dev_is_real_dma_subdevice(dev)) { 2613 info->bus = bus; 2614 info->devfn = devfn; 2615 info->segment = iommu->segment; 2616 } else { 2617 struct pci_dev *pdev = to_pci_dev(dev); 2618 2619 info->bus = pdev->bus->number; 2620 info->devfn = pdev->devfn; 2621 info->segment = pci_domain_nr(pdev->bus); 2622 } 2623 2624 info->ats_supported = info->pasid_supported = info->pri_supported = 0; 2625 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; 2626 info->ats_qdep = 0; 2627 info->dev = dev; 2628 info->domain = domain; 2629 info->iommu = iommu; 2630 info->pasid_table = NULL; 2631 info->auxd_enabled = 0; 2632 INIT_LIST_HEAD(&info->auxiliary_domains); 2633 2634 if (dev && dev_is_pci(dev)) { 2635 struct pci_dev *pdev = to_pci_dev(info->dev); 2636 2637 if (ecap_dev_iotlb_support(iommu->ecap) && 2638 pci_ats_supported(pdev) && 2639 dmar_find_matched_atsr_unit(pdev)) 2640 info->ats_supported = 1; 2641 2642 if (sm_supported(iommu)) { 2643 if (pasid_supported(iommu)) { 2644 int features = pci_pasid_features(pdev); 2645 if (features >= 0) 2646 info->pasid_supported = features | 1; 2647 } 2648 2649 if (info->ats_supported && ecap_prs(iommu->ecap) && 2650 pci_pri_supported(pdev)) 2651 info->pri_supported = 1; 2652 } 2653 } 2654 2655 spin_lock_irqsave(&device_domain_lock, flags); 2656 if (dev) 2657 found = find_domain(dev); 2658 2659 if (!found) { 2660 struct device_domain_info *info2; 2661 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, 2662 info->devfn); 2663 if (info2) { 2664 found = info2->domain; 2665 info2->dev = dev; 2666 } 2667 } 2668 2669 if (found) { 2670 
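		/*
		 * A device_domain_info already exists for this device (set up
		 * directly or through a DMA alias), so return the domain it
		 * refers to and free the info we just allocated.
		 */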
spin_unlock_irqrestore(&device_domain_lock, flags); 2671 free_devinfo_mem(info); 2672 /* Caller must free the original domain */ 2673 return found; 2674 } 2675 2676 spin_lock(&iommu->lock); 2677 ret = domain_attach_iommu(domain, iommu); 2678 spin_unlock(&iommu->lock); 2679 2680 if (ret) { 2681 spin_unlock_irqrestore(&device_domain_lock, flags); 2682 free_devinfo_mem(info); 2683 return NULL; 2684 } 2685 2686 list_add(&info->link, &domain->devices); 2687 list_add(&info->global, &device_domain_list); 2688 if (dev) 2689 dev_iommu_priv_set(dev, info); 2690 spin_unlock_irqrestore(&device_domain_lock, flags); 2691 2692 /* PASID table is mandatory for a PCI device in scalable mode. */ 2693 if (dev && dev_is_pci(dev) && sm_supported(iommu)) { 2694 ret = intel_pasid_alloc_table(dev); 2695 if (ret) { 2696 dev_err(dev, "PASID table allocation failed\n"); 2697 dmar_remove_one_dev_info(dev); 2698 return NULL; 2699 } 2700 2701 /* Setup the PASID entry for requests without PASID: */ 2702 spin_lock_irqsave(&iommu->lock, flags); 2703 if (hw_pass_through && domain_type_is_si(domain)) 2704 ret = intel_pasid_setup_pass_through(iommu, domain, 2705 dev, PASID_RID2PASID); 2706 else if (domain_use_first_level(domain)) 2707 ret = domain_setup_first_level(iommu, domain, dev, 2708 PASID_RID2PASID); 2709 else 2710 ret = intel_pasid_setup_second_level(iommu, domain, 2711 dev, PASID_RID2PASID); 2712 spin_unlock_irqrestore(&iommu->lock, flags); 2713 if (ret) { 2714 dev_err(dev, "Setup RID2PASID failed\n"); 2715 dmar_remove_one_dev_info(dev); 2716 return NULL; 2717 } 2718 } 2719 2720 if (dev && domain_context_mapping(domain, dev)) { 2721 dev_err(dev, "Domain context map failed\n"); 2722 dmar_remove_one_dev_info(dev); 2723 return NULL; 2724 } 2725 2726 return domain; 2727 } 2728 2729 static int iommu_domain_identity_map(struct dmar_domain *domain, 2730 unsigned long first_vpfn, 2731 unsigned long last_vpfn) 2732 { 2733 /* 2734 * RMRR range might have overlap with physical memory range, 2735 * clear it first 2736 */ 2737 dma_pte_clear_range(domain, first_vpfn, last_vpfn); 2738 2739 return __domain_mapping(domain, first_vpfn, NULL, 2740 first_vpfn, last_vpfn - first_vpfn + 1, 2741 DMA_PTE_READ|DMA_PTE_WRITE); 2742 } 2743 2744 static int md_domain_init(struct dmar_domain *domain, int guest_width); 2745 2746 static int __init si_domain_init(int hw) 2747 { 2748 struct dmar_rmrr_unit *rmrr; 2749 struct device *dev; 2750 int i, nid, ret; 2751 2752 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY); 2753 if (!si_domain) 2754 return -EFAULT; 2755 2756 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 2757 domain_exit(si_domain); 2758 return -EFAULT; 2759 } 2760 2761 if (hw) 2762 return 0; 2763 2764 for_each_online_node(nid) { 2765 unsigned long start_pfn, end_pfn; 2766 int i; 2767 2768 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 2769 ret = iommu_domain_identity_map(si_domain, 2770 mm_to_dma_pfn(start_pfn), 2771 mm_to_dma_pfn(end_pfn)); 2772 if (ret) 2773 return ret; 2774 } 2775 } 2776 2777 /* 2778 * Identity map the RMRRs so that devices with RMRRs could also use 2779 * the si_domain. 
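 * Each RMRR is given as a physical address range, so it is converted
 * to DMA PFNs and mapped 1:1 just like the memory ranges above;
 * ranges that do not fit the si_domain address width are skipped with
 * a warning.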
2780 */ 2781 for_each_rmrr_units(rmrr) { 2782 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 2783 i, dev) { 2784 unsigned long long start = rmrr->base_address; 2785 unsigned long long end = rmrr->end_address; 2786 2787 if (WARN_ON(end < start || 2788 end >> agaw_to_width(si_domain->agaw))) 2789 continue; 2790 2791 ret = iommu_domain_identity_map(si_domain, 2792 mm_to_dma_pfn(start >> PAGE_SHIFT), 2793 mm_to_dma_pfn(end >> PAGE_SHIFT)); 2794 if (ret) 2795 return ret; 2796 } 2797 } 2798 2799 return 0; 2800 } 2801 2802 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) 2803 { 2804 struct dmar_domain *ndomain; 2805 struct intel_iommu *iommu; 2806 u8 bus, devfn; 2807 2808 iommu = device_to_iommu(dev, &bus, &devfn); 2809 if (!iommu) 2810 return -ENODEV; 2811 2812 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); 2813 if (ndomain != domain) 2814 return -EBUSY; 2815 2816 return 0; 2817 } 2818 2819 static bool device_has_rmrr(struct device *dev) 2820 { 2821 struct dmar_rmrr_unit *rmrr; 2822 struct device *tmp; 2823 int i; 2824 2825 rcu_read_lock(); 2826 for_each_rmrr_units(rmrr) { 2827 /* 2828 * Return TRUE if this RMRR contains the device that 2829 * is passed in. 2830 */ 2831 for_each_active_dev_scope(rmrr->devices, 2832 rmrr->devices_cnt, i, tmp) 2833 if (tmp == dev || 2834 is_downstream_to_pci_bridge(dev, tmp)) { 2835 rcu_read_unlock(); 2836 return true; 2837 } 2838 } 2839 rcu_read_unlock(); 2840 return false; 2841 } 2842 2843 /** 2844 * device_rmrr_is_relaxable - Test whether the RMRR of this device 2845 * is relaxable (ie. is allowed to be not enforced under some conditions) 2846 * @dev: device handle 2847 * 2848 * We assume that PCI USB devices with RMRRs have them largely 2849 * for historical reasons and that the RMRR space is not actively used post 2850 * boot. This exclusion may change if vendors begin to abuse it. 2851 * 2852 * The same exception is made for graphics devices, with the requirement that 2853 * any use of the RMRR regions will be torn down before assigning the device 2854 * to a guest. 2855 * 2856 * Return: true if the RMRR is relaxable, false otherwise 2857 */ 2858 static bool device_rmrr_is_relaxable(struct device *dev) 2859 { 2860 struct pci_dev *pdev; 2861 2862 if (!dev_is_pci(dev)) 2863 return false; 2864 2865 pdev = to_pci_dev(dev); 2866 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) 2867 return true; 2868 else 2869 return false; 2870 } 2871 2872 /* 2873 * There are a couple cases where we need to restrict the functionality of 2874 * devices associated with RMRRs. The first is when evaluating a device for 2875 * identity mapping because problems exist when devices are moved in and out 2876 * of domains and their respective RMRR information is lost. This means that 2877 * a device with associated RMRRs will never be in a "passthrough" domain. 2878 * The second is use of the device through the IOMMU API. This interface 2879 * expects to have full control of the IOVA space for the device. We cannot 2880 * satisfy both the requirement that RMRR access is maintained and have an 2881 * unencumbered IOVA space. We also have no ability to quiesce the device's 2882 * use of the RMRR space or even inform the IOMMU API user of the restriction. 2883 * We therefore prevent devices associated with an RMRR from participating in 2884 * the IOMMU API, which eliminates them from device assignment. 2885 * 2886 * In both cases, devices which have relaxable RMRRs are not concerned by this 2887 * restriction. 
See device_rmrr_is_relaxable comment. 2888 */ 2889 static bool device_is_rmrr_locked(struct device *dev) 2890 { 2891 if (!device_has_rmrr(dev)) 2892 return false; 2893 2894 if (device_rmrr_is_relaxable(dev)) 2895 return false; 2896 2897 return true; 2898 } 2899 2900 /* 2901 * Return the required default domain type for a specific device. 2902 * 2903 * @dev: the device in query 2904 * @startup: true if this is during early boot 2905 * 2906 * Returns: 2907 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain 2908 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain 2909 * - 0: both identity and dynamic domains work for this device 2910 */ 2911 static int device_def_domain_type(struct device *dev) 2912 { 2913 if (dev_is_pci(dev)) { 2914 struct pci_dev *pdev = to_pci_dev(dev); 2915 2916 /* 2917 * Prevent any device marked as untrusted from getting 2918 * placed into the statically identity mapping domain. 2919 */ 2920 if (pdev->untrusted) 2921 return IOMMU_DOMAIN_DMA; 2922 2923 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2924 return IOMMU_DOMAIN_IDENTITY; 2925 2926 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) 2927 return IOMMU_DOMAIN_IDENTITY; 2928 } 2929 2930 return 0; 2931 } 2932 2933 static void intel_iommu_init_qi(struct intel_iommu *iommu) 2934 { 2935 /* 2936 * Start from the sane iommu hardware state. 2937 * If the queued invalidation is already initialized by us 2938 * (for example, while enabling interrupt-remapping) then 2939 * we got the things already rolling from a sane state. 2940 */ 2941 if (!iommu->qi) { 2942 /* 2943 * Clear any previous faults. 2944 */ 2945 dmar_fault(-1, iommu); 2946 /* 2947 * Disable queued invalidation if supported and already enabled 2948 * before OS handover. 2949 */ 2950 dmar_disable_qi(iommu); 2951 } 2952 2953 if (dmar_enable_qi(iommu)) { 2954 /* 2955 * Queued Invalidate not enabled, use Register Based Invalidate 2956 */ 2957 iommu->flush.flush_context = __iommu_flush_context; 2958 iommu->flush.flush_iotlb = __iommu_flush_iotlb; 2959 pr_info("%s: Using Register based invalidation\n", 2960 iommu->name); 2961 } else { 2962 iommu->flush.flush_context = qi_flush_context; 2963 iommu->flush.flush_iotlb = qi_flush_iotlb; 2964 pr_info("%s: Using Queued invalidation\n", iommu->name); 2965 } 2966 } 2967 2968 static int copy_context_table(struct intel_iommu *iommu, 2969 struct root_entry *old_re, 2970 struct context_entry **tbl, 2971 int bus, bool ext) 2972 { 2973 int tbl_idx, pos = 0, idx, devfn, ret = 0, did; 2974 struct context_entry *new_ce = NULL, ce; 2975 struct context_entry *old_ce = NULL; 2976 struct root_entry re; 2977 phys_addr_t old_ce_phys; 2978 2979 tbl_idx = ext ? bus * 2 : bus; 2980 memcpy(&re, old_re, sizeof(re)); 2981 2982 for (devfn = 0; devfn < 256; devfn++) { 2983 /* First calculate the correct index */ 2984 idx = (ext ? 
devfn * 2 : devfn) % 256; 2985 2986 if (idx == 0) { 2987 /* First save what we may have and clean up */ 2988 if (new_ce) { 2989 tbl[tbl_idx] = new_ce; 2990 __iommu_flush_cache(iommu, new_ce, 2991 VTD_PAGE_SIZE); 2992 pos = 1; 2993 } 2994 2995 if (old_ce) 2996 memunmap(old_ce); 2997 2998 ret = 0; 2999 if (devfn < 0x80) 3000 old_ce_phys = root_entry_lctp(&re); 3001 else 3002 old_ce_phys = root_entry_uctp(&re); 3003 3004 if (!old_ce_phys) { 3005 if (ext && devfn == 0) { 3006 /* No LCTP, try UCTP */ 3007 devfn = 0x7f; 3008 continue; 3009 } else { 3010 goto out; 3011 } 3012 } 3013 3014 ret = -ENOMEM; 3015 old_ce = memremap(old_ce_phys, PAGE_SIZE, 3016 MEMREMAP_WB); 3017 if (!old_ce) 3018 goto out; 3019 3020 new_ce = alloc_pgtable_page(iommu->node); 3021 if (!new_ce) 3022 goto out_unmap; 3023 3024 ret = 0; 3025 } 3026 3027 /* Now copy the context entry */ 3028 memcpy(&ce, old_ce + idx, sizeof(ce)); 3029 3030 if (!__context_present(&ce)) 3031 continue; 3032 3033 did = context_domain_id(&ce); 3034 if (did >= 0 && did < cap_ndoms(iommu->cap)) 3035 set_bit(did, iommu->domain_ids); 3036 3037 /* 3038 * We need a marker for copied context entries. This 3039 * marker needs to work for the old format as well as 3040 * for extended context entries. 3041 * 3042 * Bit 67 of the context entry is used. In the old 3043 * format this bit is available to software, in the 3044 * extended format it is the PGE bit, but PGE is ignored 3045 * by HW if PASIDs are disabled (and thus still 3046 * available). 3047 * 3048 * So disable PASIDs first and then mark the entry 3049 * copied. This means that we don't copy PASID 3050 * translations from the old kernel, but this is fine as 3051 * faults there are not fatal. 3052 */ 3053 context_clear_pasid_enable(&ce); 3054 context_set_copied(&ce); 3055 3056 new_ce[idx] = ce; 3057 } 3058 3059 tbl[tbl_idx + pos] = new_ce; 3060 3061 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); 3062 3063 out_unmap: 3064 memunmap(old_ce); 3065 3066 out: 3067 return ret; 3068 } 3069 3070 static int copy_translation_tables(struct intel_iommu *iommu) 3071 { 3072 struct context_entry **ctxt_tbls; 3073 struct root_entry *old_rt; 3074 phys_addr_t old_rt_phys; 3075 int ctxt_table_entries; 3076 unsigned long flags; 3077 u64 rtaddr_reg; 3078 int bus, ret; 3079 bool new_ext, ext; 3080 3081 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 3082 ext = !!(rtaddr_reg & DMA_RTADDR_RTT); 3083 new_ext = !!ecap_ecs(iommu->ecap); 3084 3085 /* 3086 * The RTT bit can only be changed when translation is disabled, 3087 * but disabling translation means to open a window for data 3088 * corruption. So bail out and don't copy anything if we would 3089 * have to change the bit. 3090 */ 3091 if (new_ext != ext) 3092 return -EINVAL; 3093 3094 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; 3095 if (!old_rt_phys) 3096 return -EINVAL; 3097 3098 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); 3099 if (!old_rt) 3100 return -ENOMEM; 3101 3102 /* This is too big for the stack - allocate it from slab */ 3103 ctxt_table_entries = ext ? 
512 : 256; 3104 ret = -ENOMEM; 3105 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); 3106 if (!ctxt_tbls) 3107 goto out_unmap; 3108 3109 for (bus = 0; bus < 256; bus++) { 3110 ret = copy_context_table(iommu, &old_rt[bus], 3111 ctxt_tbls, bus, ext); 3112 if (ret) { 3113 pr_err("%s: Failed to copy context table for bus %d\n", 3114 iommu->name, bus); 3115 continue; 3116 } 3117 } 3118 3119 spin_lock_irqsave(&iommu->lock, flags); 3120 3121 /* Context tables are copied, now write them to the root_entry table */ 3122 for (bus = 0; bus < 256; bus++) { 3123 int idx = ext ? bus * 2 : bus; 3124 u64 val; 3125 3126 if (ctxt_tbls[idx]) { 3127 val = virt_to_phys(ctxt_tbls[idx]) | 1; 3128 iommu->root_entry[bus].lo = val; 3129 } 3130 3131 if (!ext || !ctxt_tbls[idx + 1]) 3132 continue; 3133 3134 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; 3135 iommu->root_entry[bus].hi = val; 3136 } 3137 3138 spin_unlock_irqrestore(&iommu->lock, flags); 3139 3140 kfree(ctxt_tbls); 3141 3142 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); 3143 3144 ret = 0; 3145 3146 out_unmap: 3147 memunmap(old_rt); 3148 3149 return ret; 3150 } 3151 3152 #ifdef CONFIG_INTEL_IOMMU_SVM 3153 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) 3154 { 3155 struct intel_iommu *iommu = data; 3156 ioasid_t ioasid; 3157 3158 if (!iommu) 3159 return INVALID_IOASID; 3160 /* 3161 * VT-d virtual command interface always uses the full 20 bit 3162 * PASID range. Host can partition guest PASID range based on 3163 * policies but it is out of guest's control. 3164 */ 3165 if (min < PASID_MIN || max > intel_pasid_max_id) 3166 return INVALID_IOASID; 3167 3168 if (vcmd_alloc_pasid(iommu, &ioasid)) 3169 return INVALID_IOASID; 3170 3171 return ioasid; 3172 } 3173 3174 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) 3175 { 3176 struct intel_iommu *iommu = data; 3177 3178 if (!iommu) 3179 return; 3180 /* 3181 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO 3182 * We can only free the PASID when all the devices are unbound. 3183 */ 3184 if (ioasid_find(NULL, ioasid, NULL)) { 3185 pr_alert("Cannot free active IOASID %d\n", ioasid); 3186 return; 3187 } 3188 vcmd_free_pasid(iommu, ioasid); 3189 } 3190 3191 static void register_pasid_allocator(struct intel_iommu *iommu) 3192 { 3193 /* 3194 * If we are running in the host, no need for custom allocator 3195 * in that PASIDs are allocated from the host system-wide. 3196 */ 3197 if (!cap_caching_mode(iommu->cap)) 3198 return; 3199 3200 if (!sm_supported(iommu)) { 3201 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); 3202 return; 3203 } 3204 3205 /* 3206 * Register a custom PASID allocator if we are running in a guest, 3207 * guest PASID must be obtained via virtual command interface. 3208 * There can be multiple vIOMMUs in each guest but only one allocator 3209 * is active. All vIOMMU allocators will eventually be calling the same 3210 * host allocator. 3211 */ 3212 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) 3213 return; 3214 3215 pr_info("Register custom PASID allocator\n"); 3216 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; 3217 iommu->pasid_allocator.free = intel_vcmd_ioasid_free; 3218 iommu->pasid_allocator.pdata = (void *)iommu; 3219 if (ioasid_register_allocator(&iommu->pasid_allocator)) { 3220 pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); 3221 /* 3222 * Disable scalable mode on this IOMMU if there 3223 * is no custom allocator. 
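 * Note that clearing intel_iommu_sm below disables scalable mode
 * globally, since sm_supported() tests that flag for every IOMMU.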
Mixing SM capable vIOMMU 3224 * and non-SM vIOMMU are not supported. 3225 */ 3226 intel_iommu_sm = 0; 3227 } 3228 } 3229 #endif 3230 3231 static int __init init_dmars(void) 3232 { 3233 struct dmar_drhd_unit *drhd; 3234 struct intel_iommu *iommu; 3235 int ret; 3236 3237 /* 3238 * for each drhd 3239 * allocate root 3240 * initialize and program root entry to not present 3241 * endfor 3242 */ 3243 for_each_drhd_unit(drhd) { 3244 /* 3245 * lock not needed as this is only incremented in the single 3246 * threaded kernel __init code path all other access are read 3247 * only 3248 */ 3249 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) { 3250 g_num_of_iommus++; 3251 continue; 3252 } 3253 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED); 3254 } 3255 3256 /* Preallocate enough resources for IOMMU hot-addition */ 3257 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) 3258 g_num_of_iommus = DMAR_UNITS_SUPPORTED; 3259 3260 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), 3261 GFP_KERNEL); 3262 if (!g_iommus) { 3263 pr_err("Allocating global iommu array failed\n"); 3264 ret = -ENOMEM; 3265 goto error; 3266 } 3267 3268 for_each_iommu(iommu, drhd) { 3269 if (drhd->ignored) { 3270 iommu_disable_translation(iommu); 3271 continue; 3272 } 3273 3274 /* 3275 * Find the max pasid size of all IOMMU's in the system. 3276 * We need to ensure the system pasid table is no bigger 3277 * than the smallest supported. 3278 */ 3279 if (pasid_supported(iommu)) { 3280 u32 temp = 2 << ecap_pss(iommu->ecap); 3281 3282 intel_pasid_max_id = min_t(u32, temp, 3283 intel_pasid_max_id); 3284 } 3285 3286 g_iommus[iommu->seq_id] = iommu; 3287 3288 intel_iommu_init_qi(iommu); 3289 3290 ret = iommu_init_domains(iommu); 3291 if (ret) 3292 goto free_iommu; 3293 3294 init_translation_status(iommu); 3295 3296 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { 3297 iommu_disable_translation(iommu); 3298 clear_translation_pre_enabled(iommu); 3299 pr_warn("Translation was enabled for %s but we are not in kdump mode\n", 3300 iommu->name); 3301 } 3302 3303 /* 3304 * TBD: 3305 * we could share the same root & context tables 3306 * among all IOMMU's. Need to Split it later. 3307 */ 3308 ret = iommu_alloc_root_entry(iommu); 3309 if (ret) 3310 goto free_iommu; 3311 3312 if (translation_pre_enabled(iommu)) { 3313 pr_info("Translation already enabled - trying to copy translation structures\n"); 3314 3315 ret = copy_translation_tables(iommu); 3316 if (ret) { 3317 /* 3318 * We found the IOMMU with translation 3319 * enabled - but failed to copy over the 3320 * old root-entry table. Try to proceed 3321 * by disabling translation now and 3322 * allocating a clean root-entry table. 3323 * This might cause DMAR faults, but 3324 * probably the dump will still succeed. 3325 */ 3326 pr_err("Failed to copy translation tables from previous kernel for %s\n", 3327 iommu->name); 3328 iommu_disable_translation(iommu); 3329 clear_translation_pre_enabled(iommu); 3330 } else { 3331 pr_info("Copied translation tables from previous kernel for %s\n", 3332 iommu->name); 3333 } 3334 } 3335 3336 if (!ecap_pass_through(iommu->ecap)) 3337 hw_pass_through = 0; 3338 intel_svm_check(iommu); 3339 } 3340 3341 /* 3342 * Now that qi is enabled on all iommus, set the root entry and flush 3343 * caches. This is required on some Intel X58 chipsets, otherwise the 3344 * flush_context function will loop forever and the boot hangs. 
3345 */ 3346 for_each_active_iommu(iommu, drhd) { 3347 iommu_flush_write_buffer(iommu); 3348 #ifdef CONFIG_INTEL_IOMMU_SVM 3349 register_pasid_allocator(iommu); 3350 #endif 3351 iommu_set_root_entry(iommu); 3352 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 3353 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 3354 } 3355 3356 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA 3357 dmar_map_gfx = 0; 3358 #endif 3359 3360 if (!dmar_map_gfx) 3361 iommu_identity_mapping |= IDENTMAP_GFX; 3362 3363 check_tylersburg_isoch(); 3364 3365 ret = si_domain_init(hw_pass_through); 3366 if (ret) 3367 goto free_iommu; 3368 3369 /* 3370 * for each drhd 3371 * enable fault log 3372 * global invalidate context cache 3373 * global invalidate iotlb 3374 * enable translation 3375 */ 3376 for_each_iommu(iommu, drhd) { 3377 if (drhd->ignored) { 3378 /* 3379 * we always have to disable PMRs or DMA may fail on 3380 * this device 3381 */ 3382 if (force_on) 3383 iommu_disable_protect_mem_regions(iommu); 3384 continue; 3385 } 3386 3387 iommu_flush_write_buffer(iommu); 3388 3389 #ifdef CONFIG_INTEL_IOMMU_SVM 3390 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 3391 /* 3392 * Call dmar_alloc_hwirq() with dmar_global_lock held, 3393 * could cause possible lock race condition. 3394 */ 3395 up_write(&dmar_global_lock); 3396 ret = intel_svm_enable_prq(iommu); 3397 down_write(&dmar_global_lock); 3398 if (ret) 3399 goto free_iommu; 3400 } 3401 #endif 3402 ret = dmar_set_interrupt(iommu); 3403 if (ret) 3404 goto free_iommu; 3405 } 3406 3407 return 0; 3408 3409 free_iommu: 3410 for_each_active_iommu(iommu, drhd) { 3411 disable_dmar_iommu(iommu); 3412 free_dmar_iommu(iommu); 3413 } 3414 3415 kfree(g_iommus); 3416 3417 error: 3418 return ret; 3419 } 3420 3421 /* This takes a number of _MM_ pages, not VTD pages */ 3422 static unsigned long intel_alloc_iova(struct device *dev, 3423 struct dmar_domain *domain, 3424 unsigned long nrpages, uint64_t dma_mask) 3425 { 3426 unsigned long iova_pfn; 3427 3428 /* 3429 * Restrict dma_mask to the width that the iommu can handle. 3430 * First-level translation restricts the input-address to a 3431 * canonical address (i.e., address bits 63:N have the same 3432 * value as address bit [N-1], where N is 48-bits with 4-level 3433 * paging and 57-bits with 5-level paging). Hence, skip bit 3434 * [N-1]. 
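 * For example, with 4-level first-level paging (gaw == 48) the mask
 * is clamped to DOMAIN_MAX_ADDR(47), i.e. just below 1ULL << 47,
 * which keeps bit 47 clear.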
3435 */ 3436 if (domain_use_first_level(domain)) 3437 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), 3438 dma_mask); 3439 else 3440 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), 3441 dma_mask); 3442 3443 /* Ensure we reserve the whole size-aligned region */ 3444 nrpages = __roundup_pow_of_two(nrpages); 3445 3446 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) { 3447 /* 3448 * First try to allocate an io virtual address in 3449 * DMA_BIT_MASK(32) and if that fails then try allocating 3450 * from higher range 3451 */ 3452 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3453 IOVA_PFN(DMA_BIT_MASK(32)), false); 3454 if (iova_pfn) 3455 return iova_pfn; 3456 } 3457 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, 3458 IOVA_PFN(dma_mask), true); 3459 if (unlikely(!iova_pfn)) { 3460 dev_err_once(dev, "Allocating %ld-page iova failed\n", 3461 nrpages); 3462 return 0; 3463 } 3464 3465 return iova_pfn; 3466 } 3467 3468 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, 3469 size_t size, int dir, u64 dma_mask) 3470 { 3471 struct dmar_domain *domain; 3472 phys_addr_t start_paddr; 3473 unsigned long iova_pfn; 3474 int prot = 0; 3475 int ret; 3476 struct intel_iommu *iommu; 3477 unsigned long paddr_pfn = paddr >> PAGE_SHIFT; 3478 3479 BUG_ON(dir == DMA_NONE); 3480 3481 if (unlikely(attach_deferred(dev))) 3482 do_deferred_attach(dev); 3483 3484 domain = find_domain(dev); 3485 if (!domain) 3486 return DMA_MAPPING_ERROR; 3487 3488 iommu = domain_get_iommu(domain); 3489 size = aligned_nrpages(paddr, size); 3490 3491 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask); 3492 if (!iova_pfn) 3493 goto error; 3494 3495 /* 3496 * Check if DMAR supports zero-length reads on write only 3497 * mappings.. 3498 */ 3499 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3500 !cap_zlr(iommu->cap)) 3501 prot |= DMA_PTE_READ; 3502 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3503 prot |= DMA_PTE_WRITE; 3504 /* 3505 * paddr - (paddr + size) might be partial page, we should map the whole 3506 * page. 
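 * (aligned_nrpages() above already turned 'size' into a whole number
 * of pages covering the buffer for exactly this reason.)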
Note: if two part of one page are separately mapped, we 3507 * might have two guest_addr mapping to the same host paddr, but this 3508 * is not a big problem 3509 */ 3510 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3511 mm_to_dma_pfn(paddr_pfn), size, prot); 3512 if (ret) 3513 goto error; 3514 3515 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT; 3516 start_paddr += paddr & ~PAGE_MASK; 3517 3518 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT); 3519 3520 return start_paddr; 3521 3522 error: 3523 if (iova_pfn) 3524 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3525 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n", 3526 size, (unsigned long long)paddr, dir); 3527 return DMA_MAPPING_ERROR; 3528 } 3529 3530 static dma_addr_t intel_map_page(struct device *dev, struct page *page, 3531 unsigned long offset, size_t size, 3532 enum dma_data_direction dir, 3533 unsigned long attrs) 3534 { 3535 return __intel_map_single(dev, page_to_phys(page) + offset, 3536 size, dir, *dev->dma_mask); 3537 } 3538 3539 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, 3540 size_t size, enum dma_data_direction dir, 3541 unsigned long attrs) 3542 { 3543 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); 3544 } 3545 3546 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) 3547 { 3548 struct dmar_domain *domain; 3549 unsigned long start_pfn, last_pfn; 3550 unsigned long nrpages; 3551 unsigned long iova_pfn; 3552 struct intel_iommu *iommu; 3553 struct page *freelist; 3554 struct pci_dev *pdev = NULL; 3555 3556 domain = find_domain(dev); 3557 BUG_ON(!domain); 3558 3559 iommu = domain_get_iommu(domain); 3560 3561 iova_pfn = IOVA_PFN(dev_addr); 3562 3563 nrpages = aligned_nrpages(dev_addr, size); 3564 start_pfn = mm_to_dma_pfn(iova_pfn); 3565 last_pfn = start_pfn + nrpages - 1; 3566 3567 if (dev_is_pci(dev)) 3568 pdev = to_pci_dev(dev); 3569 3570 freelist = domain_unmap(domain, start_pfn, last_pfn); 3571 if (intel_iommu_strict || (pdev && pdev->untrusted) || 3572 !has_iova_flush_queue(&domain->iovad)) { 3573 iommu_flush_iotlb_psi(iommu, domain, start_pfn, 3574 nrpages, !freelist, 0); 3575 /* free iova */ 3576 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3577 dma_free_pagelist(freelist); 3578 } else { 3579 queue_iova(&domain->iovad, iova_pfn, nrpages, 3580 (unsigned long)freelist); 3581 /* 3582 * queue up the release of the unmap to save the 1/6th of the 3583 * cpu used up by the iotlb flush operation... 
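 * The deferred IOTLB flush and the freeing of the page-table pages on
 * 'freelist' then happen in one batch when the IOVA flush queue is
 * drained, either by its timer or when the queue fills up.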
3584 */ 3585 } 3586 3587 trace_unmap_single(dev, dev_addr, size); 3588 } 3589 3590 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, 3591 size_t size, enum dma_data_direction dir, 3592 unsigned long attrs) 3593 { 3594 intel_unmap(dev, dev_addr, size); 3595 } 3596 3597 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, 3598 size_t size, enum dma_data_direction dir, unsigned long attrs) 3599 { 3600 intel_unmap(dev, dev_addr, size); 3601 } 3602 3603 static void *intel_alloc_coherent(struct device *dev, size_t size, 3604 dma_addr_t *dma_handle, gfp_t flags, 3605 unsigned long attrs) 3606 { 3607 struct page *page = NULL; 3608 int order; 3609 3610 if (unlikely(attach_deferred(dev))) 3611 do_deferred_attach(dev); 3612 3613 size = PAGE_ALIGN(size); 3614 order = get_order(size); 3615 3616 if (gfpflags_allow_blocking(flags)) { 3617 unsigned int count = size >> PAGE_SHIFT; 3618 3619 page = dma_alloc_from_contiguous(dev, count, order, 3620 flags & __GFP_NOWARN); 3621 } 3622 3623 if (!page) 3624 page = alloc_pages(flags, order); 3625 if (!page) 3626 return NULL; 3627 memset(page_address(page), 0, size); 3628 3629 *dma_handle = __intel_map_single(dev, page_to_phys(page), size, 3630 DMA_BIDIRECTIONAL, 3631 dev->coherent_dma_mask); 3632 if (*dma_handle != DMA_MAPPING_ERROR) 3633 return page_address(page); 3634 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3635 __free_pages(page, order); 3636 3637 return NULL; 3638 } 3639 3640 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, 3641 dma_addr_t dma_handle, unsigned long attrs) 3642 { 3643 int order; 3644 struct page *page = virt_to_page(vaddr); 3645 3646 size = PAGE_ALIGN(size); 3647 order = get_order(size); 3648 3649 intel_unmap(dev, dma_handle, size); 3650 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3651 __free_pages(page, order); 3652 } 3653 3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, 3655 int nelems, enum dma_data_direction dir, 3656 unsigned long attrs) 3657 { 3658 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK; 3659 unsigned long nrpages = 0; 3660 struct scatterlist *sg; 3661 int i; 3662 3663 for_each_sg(sglist, sg, nelems, i) { 3664 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); 3665 } 3666 3667 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3668 3669 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT); 3670 } 3671 3672 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3673 enum dma_data_direction dir, unsigned long attrs) 3674 { 3675 int i; 3676 struct dmar_domain *domain; 3677 size_t size = 0; 3678 int prot = 0; 3679 unsigned long iova_pfn; 3680 int ret; 3681 struct scatterlist *sg; 3682 unsigned long start_vpfn; 3683 struct intel_iommu *iommu; 3684 3685 BUG_ON(dir == DMA_NONE); 3686 3687 if (unlikely(attach_deferred(dev))) 3688 do_deferred_attach(dev); 3689 3690 domain = find_domain(dev); 3691 if (!domain) 3692 return 0; 3693 3694 iommu = domain_get_iommu(domain); 3695 3696 for_each_sg(sglist, sg, nelems, i) 3697 size += aligned_nrpages(sg->offset, sg->length); 3698 3699 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), 3700 *dev->dma_mask); 3701 if (!iova_pfn) { 3702 sglist->dma_length = 0; 3703 return 0; 3704 } 3705 3706 /* 3707 * Check if DMAR supports zero-length reads on write only 3708 * mappings.. 
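 * If it does not (cap_zlr() is clear), read permission is granted
 * even for write-only mappings so that such reads do not fault.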
3709 */ 3710 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \ 3711 !cap_zlr(iommu->cap)) 3712 prot |= DMA_PTE_READ; 3713 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3714 prot |= DMA_PTE_WRITE; 3715 3716 start_vpfn = mm_to_dma_pfn(iova_pfn); 3717 3718 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot); 3719 if (unlikely(ret)) { 3720 dma_pte_free_pagetable(domain, start_vpfn, 3721 start_vpfn + size - 1, 3722 agaw_to_level(domain->agaw) + 1); 3723 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size)); 3724 return 0; 3725 } 3726 3727 for_each_sg(sglist, sg, nelems, i) 3728 trace_map_sg(dev, i + 1, nelems, sg); 3729 3730 return nelems; 3731 } 3732 3733 static u64 intel_get_required_mask(struct device *dev) 3734 { 3735 return DMA_BIT_MASK(32); 3736 } 3737 3738 static const struct dma_map_ops intel_dma_ops = { 3739 .alloc = intel_alloc_coherent, 3740 .free = intel_free_coherent, 3741 .map_sg = intel_map_sg, 3742 .unmap_sg = intel_unmap_sg, 3743 .map_page = intel_map_page, 3744 .unmap_page = intel_unmap_page, 3745 .map_resource = intel_map_resource, 3746 .unmap_resource = intel_unmap_resource, 3747 .dma_supported = dma_direct_supported, 3748 .mmap = dma_common_mmap, 3749 .get_sgtable = dma_common_get_sgtable, 3750 .alloc_pages = dma_common_alloc_pages, 3751 .free_pages = dma_common_free_pages, 3752 .get_required_mask = intel_get_required_mask, 3753 }; 3754 3755 static void 3756 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size, 3757 enum dma_data_direction dir, enum dma_sync_target target) 3758 { 3759 struct dmar_domain *domain; 3760 phys_addr_t tlb_addr; 3761 3762 domain = find_domain(dev); 3763 if (WARN_ON(!domain)) 3764 return; 3765 3766 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr); 3767 if (is_swiotlb_buffer(tlb_addr)) 3768 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target); 3769 } 3770 3771 static dma_addr_t 3772 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, 3773 enum dma_data_direction dir, unsigned long attrs, 3774 u64 dma_mask) 3775 { 3776 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3777 struct dmar_domain *domain; 3778 struct intel_iommu *iommu; 3779 unsigned long iova_pfn; 3780 unsigned long nrpages; 3781 phys_addr_t tlb_addr; 3782 int prot = 0; 3783 int ret; 3784 3785 if (unlikely(attach_deferred(dev))) 3786 do_deferred_attach(dev); 3787 3788 domain = find_domain(dev); 3789 3790 if (WARN_ON(dir == DMA_NONE || !domain)) 3791 return DMA_MAPPING_ERROR; 3792 3793 iommu = domain_get_iommu(domain); 3794 if (WARN_ON(!iommu)) 3795 return DMA_MAPPING_ERROR; 3796 3797 nrpages = aligned_nrpages(0, size); 3798 iova_pfn = intel_alloc_iova(dev, domain, 3799 dma_to_mm_pfn(nrpages), dma_mask); 3800 if (!iova_pfn) 3801 return DMA_MAPPING_ERROR; 3802 3803 /* 3804 * Check if DMAR supports zero-length reads on write only 3805 * mappings.. 3806 */ 3807 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || 3808 !cap_zlr(iommu->cap)) 3809 prot |= DMA_PTE_READ; 3810 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 3811 prot |= DMA_PTE_WRITE; 3812 3813 /* 3814 * If both the physical buffer start address and size are 3815 * page aligned, we don't need to use a bounce page. 3816 */ 3817 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { 3818 tlb_addr = swiotlb_tbl_map_single(dev, 3819 phys_to_dma_unencrypted(dev, io_tlb_start), 3820 paddr, size, aligned_size, dir, attrs); 3821 if (tlb_addr == DMA_MAPPING_ERROR) { 3822 goto swiotlb_error; 3823 } else { 3824 /* Cleanup the padding area. 
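 * Zero whatever part of the page-aligned bounce slot will not be
 * covered by the caller's data, so the device never sees stale
 * swiotlb contents: with DMA_TO_DEVICE/DMA_BIDIRECTIONAL (and CPU
 * sync not skipped) only the tail beyond 'size' is cleared, otherwise
 * the whole slot is.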
*/ 3825 void *padding_start = phys_to_virt(tlb_addr); 3826 size_t padding_size = aligned_size; 3827 3828 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && 3829 (dir == DMA_TO_DEVICE || 3830 dir == DMA_BIDIRECTIONAL)) { 3831 padding_start += size; 3832 padding_size -= size; 3833 } 3834 3835 memset(padding_start, 0, padding_size); 3836 } 3837 } else { 3838 tlb_addr = paddr; 3839 } 3840 3841 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn), 3842 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot); 3843 if (ret) 3844 goto mapping_error; 3845 3846 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size); 3847 3848 return (phys_addr_t)iova_pfn << PAGE_SHIFT; 3849 3850 mapping_error: 3851 if (is_swiotlb_buffer(tlb_addr)) 3852 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3853 aligned_size, dir, attrs); 3854 swiotlb_error: 3855 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages)); 3856 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n", 3857 size, (unsigned long long)paddr, dir); 3858 3859 return DMA_MAPPING_ERROR; 3860 } 3861 3862 static void 3863 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, 3864 enum dma_data_direction dir, unsigned long attrs) 3865 { 3866 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE); 3867 struct dmar_domain *domain; 3868 phys_addr_t tlb_addr; 3869 3870 domain = find_domain(dev); 3871 if (WARN_ON(!domain)) 3872 return; 3873 3874 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr); 3875 if (WARN_ON(!tlb_addr)) 3876 return; 3877 3878 intel_unmap(dev, dev_addr, size); 3879 if (is_swiotlb_buffer(tlb_addr)) 3880 swiotlb_tbl_unmap_single(dev, tlb_addr, size, 3881 aligned_size, dir, attrs); 3882 3883 trace_bounce_unmap_single(dev, dev_addr, size); 3884 } 3885 3886 static dma_addr_t 3887 bounce_map_page(struct device *dev, struct page *page, unsigned long offset, 3888 size_t size, enum dma_data_direction dir, unsigned long attrs) 3889 { 3890 return bounce_map_single(dev, page_to_phys(page) + offset, 3891 size, dir, attrs, *dev->dma_mask); 3892 } 3893 3894 static dma_addr_t 3895 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, 3896 enum dma_data_direction dir, unsigned long attrs) 3897 { 3898 return bounce_map_single(dev, phys_addr, size, 3899 dir, attrs, *dev->dma_mask); 3900 } 3901 3902 static void 3903 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, 3904 enum dma_data_direction dir, unsigned long attrs) 3905 { 3906 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 3907 } 3908 3909 static void 3910 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, 3911 enum dma_data_direction dir, unsigned long attrs) 3912 { 3913 bounce_unmap_single(dev, dev_addr, size, dir, attrs); 3914 } 3915 3916 static void 3917 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3918 enum dma_data_direction dir, unsigned long attrs) 3919 { 3920 struct scatterlist *sg; 3921 int i; 3922 3923 for_each_sg(sglist, sg, nelems, i) 3924 bounce_unmap_page(dev, sg->dma_address, 3925 sg_dma_len(sg), dir, attrs); 3926 } 3927 3928 static int 3929 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, 3930 enum dma_data_direction dir, unsigned long attrs) 3931 { 3932 int i; 3933 struct scatterlist *sg; 3934 3935 for_each_sg(sglist, sg, nelems, i) { 3936 sg->dma_address = bounce_map_page(dev, sg_page(sg), 3937 sg->offset, sg->length, 3938 dir, attrs); 3939 if (sg->dma_address == DMA_MAPPING_ERROR) 3940 goto out_unmap; 3941 sg_dma_len(sg) = 
sg->length; 3942 } 3943 3944 for_each_sg(sglist, sg, nelems, i) 3945 trace_bounce_map_sg(dev, i + 1, nelems, sg); 3946 3947 return nelems; 3948 3949 out_unmap: 3950 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); 3951 return 0; 3952 } 3953 3954 static void 3955 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr, 3956 size_t size, enum dma_data_direction dir) 3957 { 3958 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU); 3959 } 3960 3961 static void 3962 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr, 3963 size_t size, enum dma_data_direction dir) 3964 { 3965 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE); 3966 } 3967 3968 static void 3969 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, 3970 int nelems, enum dma_data_direction dir) 3971 { 3972 struct scatterlist *sg; 3973 int i; 3974 3975 for_each_sg(sglist, sg, nelems, i) 3976 bounce_sync_single(dev, sg_dma_address(sg), 3977 sg_dma_len(sg), dir, SYNC_FOR_CPU); 3978 } 3979 3980 static void 3981 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, 3982 int nelems, enum dma_data_direction dir) 3983 { 3984 struct scatterlist *sg; 3985 int i; 3986 3987 for_each_sg(sglist, sg, nelems, i) 3988 bounce_sync_single(dev, sg_dma_address(sg), 3989 sg_dma_len(sg), dir, SYNC_FOR_DEVICE); 3990 } 3991 3992 static const struct dma_map_ops bounce_dma_ops = { 3993 .alloc = intel_alloc_coherent, 3994 .free = intel_free_coherent, 3995 .map_sg = bounce_map_sg, 3996 .unmap_sg = bounce_unmap_sg, 3997 .map_page = bounce_map_page, 3998 .unmap_page = bounce_unmap_page, 3999 .sync_single_for_cpu = bounce_sync_single_for_cpu, 4000 .sync_single_for_device = bounce_sync_single_for_device, 4001 .sync_sg_for_cpu = bounce_sync_sg_for_cpu, 4002 .sync_sg_for_device = bounce_sync_sg_for_device, 4003 .map_resource = bounce_map_resource, 4004 .unmap_resource = bounce_unmap_resource, 4005 .alloc_pages = dma_common_alloc_pages, 4006 .free_pages = dma_common_free_pages, 4007 .dma_supported = dma_direct_supported, 4008 }; 4009 4010 static inline int iommu_domain_cache_init(void) 4011 { 4012 int ret = 0; 4013 4014 iommu_domain_cache = kmem_cache_create("iommu_domain", 4015 sizeof(struct dmar_domain), 4016 0, 4017 SLAB_HWCACHE_ALIGN, 4018 4019 NULL); 4020 if (!iommu_domain_cache) { 4021 pr_err("Couldn't create iommu_domain cache\n"); 4022 ret = -ENOMEM; 4023 } 4024 4025 return ret; 4026 } 4027 4028 static inline int iommu_devinfo_cache_init(void) 4029 { 4030 int ret = 0; 4031 4032 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo", 4033 sizeof(struct device_domain_info), 4034 0, 4035 SLAB_HWCACHE_ALIGN, 4036 NULL); 4037 if (!iommu_devinfo_cache) { 4038 pr_err("Couldn't create devinfo cache\n"); 4039 ret = -ENOMEM; 4040 } 4041 4042 return ret; 4043 } 4044 4045 static int __init iommu_init_mempool(void) 4046 { 4047 int ret; 4048 ret = iova_cache_get(); 4049 if (ret) 4050 return ret; 4051 4052 ret = iommu_domain_cache_init(); 4053 if (ret) 4054 goto domain_error; 4055 4056 ret = iommu_devinfo_cache_init(); 4057 if (!ret) 4058 return ret; 4059 4060 kmem_cache_destroy(iommu_domain_cache); 4061 domain_error: 4062 iova_cache_put(); 4063 4064 return -ENOMEM; 4065 } 4066 4067 static void __init iommu_exit_mempool(void) 4068 { 4069 kmem_cache_destroy(iommu_devinfo_cache); 4070 kmem_cache_destroy(iommu_domain_cache); 4071 iova_cache_put(); 4072 } 4073 4074 static void __init init_no_remapping_devices(void) 4075 { 4076 struct dmar_drhd_unit *drhd; 4077 struct device *dev; 4078 int 
i; 4079 4080 for_each_drhd_unit(drhd) { 4081 if (!drhd->include_all) { 4082 for_each_active_dev_scope(drhd->devices, 4083 drhd->devices_cnt, i, dev) 4084 break; 4085 /* ignore DMAR unit if no devices exist */ 4086 if (i == drhd->devices_cnt) 4087 drhd->ignored = 1; 4088 } 4089 } 4090 4091 for_each_active_drhd_unit(drhd) { 4092 if (drhd->include_all) 4093 continue; 4094 4095 for_each_active_dev_scope(drhd->devices, 4096 drhd->devices_cnt, i, dev) 4097 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) 4098 break; 4099 if (i < drhd->devices_cnt) 4100 continue; 4101 4102 /* This IOMMU has *only* gfx devices. Either bypass it or 4103 set the gfx_mapped flag, as appropriate */ 4104 drhd->gfx_dedicated = 1; 4105 if (!dmar_map_gfx) 4106 drhd->ignored = 1; 4107 } 4108 } 4109 4110 #ifdef CONFIG_SUSPEND 4111 static int init_iommu_hw(void) 4112 { 4113 struct dmar_drhd_unit *drhd; 4114 struct intel_iommu *iommu = NULL; 4115 4116 for_each_active_iommu(iommu, drhd) 4117 if (iommu->qi) 4118 dmar_reenable_qi(iommu); 4119 4120 for_each_iommu(iommu, drhd) { 4121 if (drhd->ignored) { 4122 /* 4123 * we always have to disable PMRs or DMA may fail on 4124 * this device 4125 */ 4126 if (force_on) 4127 iommu_disable_protect_mem_regions(iommu); 4128 continue; 4129 } 4130 4131 iommu_flush_write_buffer(iommu); 4132 4133 iommu_set_root_entry(iommu); 4134 4135 iommu->flush.flush_context(iommu, 0, 0, 0, 4136 DMA_CCMD_GLOBAL_INVL); 4137 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 4138 iommu_enable_translation(iommu); 4139 iommu_disable_protect_mem_regions(iommu); 4140 } 4141 4142 return 0; 4143 } 4144 4145 static void iommu_flush_all(void) 4146 { 4147 struct dmar_drhd_unit *drhd; 4148 struct intel_iommu *iommu; 4149 4150 for_each_active_iommu(iommu, drhd) { 4151 iommu->flush.flush_context(iommu, 0, 0, 0, 4152 DMA_CCMD_GLOBAL_INVL); 4153 iommu->flush.flush_iotlb(iommu, 0, 0, 0, 4154 DMA_TLB_GLOBAL_FLUSH); 4155 } 4156 } 4157 4158 static int iommu_suspend(void) 4159 { 4160 struct dmar_drhd_unit *drhd; 4161 struct intel_iommu *iommu = NULL; 4162 unsigned long flag; 4163 4164 for_each_active_iommu(iommu, drhd) { 4165 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32), 4166 GFP_ATOMIC); 4167 if (!iommu->iommu_state) 4168 goto nomem; 4169 } 4170 4171 iommu_flush_all(); 4172 4173 for_each_active_iommu(iommu, drhd) { 4174 iommu_disable_translation(iommu); 4175 4176 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4177 4178 iommu->iommu_state[SR_DMAR_FECTL_REG] = 4179 readl(iommu->reg + DMAR_FECTL_REG); 4180 iommu->iommu_state[SR_DMAR_FEDATA_REG] = 4181 readl(iommu->reg + DMAR_FEDATA_REG); 4182 iommu->iommu_state[SR_DMAR_FEADDR_REG] = 4183 readl(iommu->reg + DMAR_FEADDR_REG); 4184 iommu->iommu_state[SR_DMAR_FEUADDR_REG] = 4185 readl(iommu->reg + DMAR_FEUADDR_REG); 4186 4187 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4188 } 4189 return 0; 4190 4191 nomem: 4192 for_each_active_iommu(iommu, drhd) 4193 kfree(iommu->iommu_state); 4194 4195 return -ENOMEM; 4196 } 4197 4198 static void iommu_resume(void) 4199 { 4200 struct dmar_drhd_unit *drhd; 4201 struct intel_iommu *iommu = NULL; 4202 unsigned long flag; 4203 4204 if (init_iommu_hw()) { 4205 if (force_on) 4206 panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); 4207 else 4208 WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); 4209 return; 4210 } 4211 4212 for_each_active_iommu(iommu, drhd) { 4213 4214 raw_spin_lock_irqsave(&iommu->register_lock, flag); 4215 4216 writel(iommu->iommu_state[SR_DMAR_FECTL_REG], 4217 
iommu->reg + DMAR_FECTL_REG); 4218 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], 4219 iommu->reg + DMAR_FEDATA_REG); 4220 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], 4221 iommu->reg + DMAR_FEADDR_REG); 4222 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], 4223 iommu->reg + DMAR_FEUADDR_REG); 4224 4225 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 4226 } 4227 4228 for_each_active_iommu(iommu, drhd) 4229 kfree(iommu->iommu_state); 4230 } 4231 4232 static struct syscore_ops iommu_syscore_ops = { 4233 .resume = iommu_resume, 4234 .suspend = iommu_suspend, 4235 }; 4236 4237 static void __init init_iommu_pm_ops(void) 4238 { 4239 register_syscore_ops(&iommu_syscore_ops); 4240 } 4241 4242 #else 4243 static inline void init_iommu_pm_ops(void) {} 4244 #endif /* CONFIG_PM */ 4245 4246 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) 4247 { 4248 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || 4249 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || 4250 rmrr->end_address <= rmrr->base_address || 4251 arch_rmrr_sanity_check(rmrr)) 4252 return -EINVAL; 4253 4254 return 0; 4255 } 4256 4257 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) 4258 { 4259 struct acpi_dmar_reserved_memory *rmrr; 4260 struct dmar_rmrr_unit *rmrru; 4261 4262 rmrr = (struct acpi_dmar_reserved_memory *)header; 4263 if (rmrr_sanity_check(rmrr)) { 4264 pr_warn(FW_BUG 4265 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" 4266 "BIOS vendor: %s; Ver: %s; Product Version: %s\n", 4267 rmrr->base_address, rmrr->end_address, 4268 dmi_get_system_info(DMI_BIOS_VENDOR), 4269 dmi_get_system_info(DMI_BIOS_VERSION), 4270 dmi_get_system_info(DMI_PRODUCT_VERSION)); 4271 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); 4272 } 4273 4274 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); 4275 if (!rmrru) 4276 goto out; 4277 4278 rmrru->hdr = header; 4279 4280 rmrru->base_address = rmrr->base_address; 4281 rmrru->end_address = rmrr->end_address; 4282 4283 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), 4284 ((void *)rmrr) + rmrr->header.length, 4285 &rmrru->devices_cnt); 4286 if (rmrru->devices_cnt && rmrru->devices == NULL) 4287 goto free_rmrru; 4288 4289 list_add(&rmrru->list, &dmar_rmrr_units); 4290 4291 return 0; 4292 free_rmrru: 4293 kfree(rmrru); 4294 out: 4295 return -ENOMEM; 4296 } 4297 4298 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) 4299 { 4300 struct dmar_atsr_unit *atsru; 4301 struct acpi_dmar_atsr *tmp; 4302 4303 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, 4304 dmar_rcu_check()) { 4305 tmp = (struct acpi_dmar_atsr *)atsru->hdr; 4306 if (atsr->segment != tmp->segment) 4307 continue; 4308 if (atsr->header.length != tmp->header.length) 4309 continue; 4310 if (memcmp(atsr, tmp, atsr->header.length) == 0) 4311 return atsru; 4312 } 4313 4314 return NULL; 4315 } 4316 4317 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4318 { 4319 struct acpi_dmar_atsr *atsr; 4320 struct dmar_atsr_unit *atsru; 4321 4322 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) 4323 return 0; 4324 4325 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4326 atsru = dmar_find_atsr(atsr); 4327 if (atsru) 4328 return 0; 4329 4330 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); 4331 if (!atsru) 4332 return -ENOMEM; 4333 4334 /* 4335 * If memory is allocated from slab by ACPI _DSM method, we need to 4336 * copy the memory content because the memory buffer will be freed 4337 * on return. 
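* The kzalloc() above reserved hdr->length extra bytes behind struct dmar_atsr_unit, so atsru->hdr can point at (atsru + 1) and the copied header lives as long as the unit itself.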
4338 */ 4339 atsru->hdr = (void *)(atsru + 1); 4340 memcpy(atsru->hdr, hdr, hdr->length); 4341 atsru->include_all = atsr->flags & 0x1; 4342 if (!atsru->include_all) { 4343 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), 4344 (void *)atsr + atsr->header.length, 4345 &atsru->devices_cnt); 4346 if (atsru->devices_cnt && atsru->devices == NULL) { 4347 kfree(atsru); 4348 return -ENOMEM; 4349 } 4350 } 4351 4352 list_add_rcu(&atsru->list, &dmar_atsr_units); 4353 4354 return 0; 4355 } 4356 4357 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) 4358 { 4359 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); 4360 kfree(atsru); 4361 } 4362 4363 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4364 { 4365 struct acpi_dmar_atsr *atsr; 4366 struct dmar_atsr_unit *atsru; 4367 4368 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4369 atsru = dmar_find_atsr(atsr); 4370 if (atsru) { 4371 list_del_rcu(&atsru->list); 4372 synchronize_rcu(); 4373 intel_iommu_free_atsr(atsru); 4374 } 4375 4376 return 0; 4377 } 4378 4379 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) 4380 { 4381 int i; 4382 struct device *dev; 4383 struct acpi_dmar_atsr *atsr; 4384 struct dmar_atsr_unit *atsru; 4385 4386 atsr = container_of(hdr, struct acpi_dmar_atsr, header); 4387 atsru = dmar_find_atsr(atsr); 4388 if (!atsru) 4389 return 0; 4390 4391 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { 4392 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, 4393 i, dev) 4394 return -EBUSY; 4395 } 4396 4397 return 0; 4398 } 4399 4400 static int intel_iommu_add(struct dmar_drhd_unit *dmaru) 4401 { 4402 int sp, ret; 4403 struct intel_iommu *iommu = dmaru->iommu; 4404 4405 if (g_iommus[iommu->seq_id]) 4406 return 0; 4407 4408 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { 4409 pr_warn("%s: Doesn't support hardware pass through.\n", 4410 iommu->name); 4411 return -ENXIO; 4412 } 4413 if (!ecap_sc_support(iommu->ecap) && 4414 domain_update_iommu_snooping(iommu)) { 4415 pr_warn("%s: Doesn't support snooping.\n", 4416 iommu->name); 4417 return -ENXIO; 4418 } 4419 sp = domain_update_iommu_superpage(NULL, iommu) - 1; 4420 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { 4421 pr_warn("%s: Doesn't support large page.\n", 4422 iommu->name); 4423 return -ENXIO; 4424 } 4425 4426 /* 4427 * Disable translation if already enabled prior to OS handover. 
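* A previous kernel or the firmware may have left DMA remapping enabled (DMA_GCMD_TE); it is turned off here before the root entry and domains for this unit are set up below.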
4428 */ 4429 if (iommu->gcmd & DMA_GCMD_TE) 4430 iommu_disable_translation(iommu); 4431 4432 g_iommus[iommu->seq_id] = iommu; 4433 ret = iommu_init_domains(iommu); 4434 if (ret == 0) 4435 ret = iommu_alloc_root_entry(iommu); 4436 if (ret) 4437 goto out; 4438 4439 intel_svm_check(iommu); 4440 4441 if (dmaru->ignored) { 4442 /* 4443 * we always have to disable PMRs or DMA may fail on this device 4444 */ 4445 if (force_on) 4446 iommu_disable_protect_mem_regions(iommu); 4447 return 0; 4448 } 4449 4450 intel_iommu_init_qi(iommu); 4451 iommu_flush_write_buffer(iommu); 4452 4453 #ifdef CONFIG_INTEL_IOMMU_SVM 4454 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { 4455 ret = intel_svm_enable_prq(iommu); 4456 if (ret) 4457 goto disable_iommu; 4458 } 4459 #endif 4460 ret = dmar_set_interrupt(iommu); 4461 if (ret) 4462 goto disable_iommu; 4463 4464 iommu_set_root_entry(iommu); 4465 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); 4466 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 4467 iommu_enable_translation(iommu); 4468 4469 iommu_disable_protect_mem_regions(iommu); 4470 return 0; 4471 4472 disable_iommu: 4473 disable_dmar_iommu(iommu); 4474 out: 4475 free_dmar_iommu(iommu); 4476 return ret; 4477 } 4478 4479 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) 4480 { 4481 int ret = 0; 4482 struct intel_iommu *iommu = dmaru->iommu; 4483 4484 if (!intel_iommu_enabled) 4485 return 0; 4486 if (iommu == NULL) 4487 return -EINVAL; 4488 4489 if (insert) { 4490 ret = intel_iommu_add(dmaru); 4491 } else { 4492 disable_dmar_iommu(iommu); 4493 free_dmar_iommu(iommu); 4494 } 4495 4496 return ret; 4497 } 4498 4499 static void intel_iommu_free_dmars(void) 4500 { 4501 struct dmar_rmrr_unit *rmrru, *rmrr_n; 4502 struct dmar_atsr_unit *atsru, *atsr_n; 4503 4504 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { 4505 list_del(&rmrru->list); 4506 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); 4507 kfree(rmrru); 4508 } 4509 4510 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { 4511 list_del(&atsru->list); 4512 intel_iommu_free_atsr(atsru); 4513 } 4514 } 4515 4516 int dmar_find_matched_atsr_unit(struct pci_dev *dev) 4517 { 4518 int i, ret = 1; 4519 struct pci_bus *bus; 4520 struct pci_dev *bridge = NULL; 4521 struct device *tmp; 4522 struct acpi_dmar_atsr *atsr; 4523 struct dmar_atsr_unit *atsru; 4524 4525 dev = pci_physfn(dev); 4526 for (bus = dev->bus; bus; bus = bus->parent) { 4527 bridge = bus->self; 4528 /* If it's an integrated device, allow ATS */ 4529 if (!bridge) 4530 return 1; 4531 /* Connected via non-PCIe: no ATS */ 4532 if (!pci_is_pcie(bridge) || 4533 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) 4534 return 0; 4535 /* If we found the root port, look it up in the ATSR */ 4536 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) 4537 break; 4538 } 4539 4540 rcu_read_lock(); 4541 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { 4542 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4543 if (atsr->segment != pci_domain_nr(dev->bus)) 4544 continue; 4545 4546 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) 4547 if (tmp == &bridge->dev) 4548 goto out; 4549 4550 if (atsru->include_all) 4551 goto out; 4552 } 4553 ret = 0; 4554 out: 4555 rcu_read_unlock(); 4556 4557 return ret; 4558 } 4559 4560 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) 4561 { 4562 int ret; 4563 struct dmar_rmrr_unit *rmrru; 4564 struct dmar_atsr_unit *atsru; 4565 struct acpi_dmar_atsr 
*atsr; 4566 struct acpi_dmar_reserved_memory *rmrr; 4567 4568 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) 4569 return 0; 4570 4571 list_for_each_entry(rmrru, &dmar_rmrr_units, list) { 4572 rmrr = container_of(rmrru->hdr, 4573 struct acpi_dmar_reserved_memory, header); 4574 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4575 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), 4576 ((void *)rmrr) + rmrr->header.length, 4577 rmrr->segment, rmrru->devices, 4578 rmrru->devices_cnt); 4579 if (ret < 0) 4580 return ret; 4581 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4582 dmar_remove_dev_scope(info, rmrr->segment, 4583 rmrru->devices, rmrru->devices_cnt); 4584 } 4585 } 4586 4587 list_for_each_entry(atsru, &dmar_atsr_units, list) { 4588 if (atsru->include_all) 4589 continue; 4590 4591 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); 4592 if (info->event == BUS_NOTIFY_ADD_DEVICE) { 4593 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), 4594 (void *)atsr + atsr->header.length, 4595 atsr->segment, atsru->devices, 4596 atsru->devices_cnt); 4597 if (ret > 0) 4598 break; 4599 else if (ret < 0) 4600 return ret; 4601 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { 4602 if (dmar_remove_dev_scope(info, atsr->segment, 4603 atsru->devices, atsru->devices_cnt)) 4604 break; 4605 } 4606 } 4607 4608 return 0; 4609 } 4610 4611 static int intel_iommu_memory_notifier(struct notifier_block *nb, 4612 unsigned long val, void *v) 4613 { 4614 struct memory_notify *mhp = v; 4615 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); 4616 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + 4617 mhp->nr_pages - 1); 4618 4619 switch (val) { 4620 case MEM_GOING_ONLINE: 4621 if (iommu_domain_identity_map(si_domain, 4622 start_vpfn, last_vpfn)) { 4623 pr_warn("Failed to build identity map for [%lx-%lx]\n", 4624 start_vpfn, last_vpfn); 4625 return NOTIFY_BAD; 4626 } 4627 break; 4628 4629 case MEM_OFFLINE: 4630 case MEM_CANCEL_ONLINE: 4631 { 4632 struct dmar_drhd_unit *drhd; 4633 struct intel_iommu *iommu; 4634 struct page *freelist; 4635 4636 freelist = domain_unmap(si_domain, 4637 start_vpfn, last_vpfn); 4638 4639 rcu_read_lock(); 4640 for_each_active_iommu(iommu, drhd) 4641 iommu_flush_iotlb_psi(iommu, si_domain, 4642 start_vpfn, mhp->nr_pages, 4643 !freelist, 0); 4644 rcu_read_unlock(); 4645 dma_free_pagelist(freelist); 4646 } 4647 break; 4648 } 4649 4650 return NOTIFY_OK; 4651 } 4652 4653 static struct notifier_block intel_iommu_memory_nb = { 4654 .notifier_call = intel_iommu_memory_notifier, 4655 .priority = 0 4656 }; 4657 4658 static void free_all_cpu_cached_iovas(unsigned int cpu) 4659 { 4660 int i; 4661 4662 for (i = 0; i < g_num_of_iommus; i++) { 4663 struct intel_iommu *iommu = g_iommus[i]; 4664 struct dmar_domain *domain; 4665 int did; 4666 4667 if (!iommu) 4668 continue; 4669 4670 for (did = 0; did < cap_ndoms(iommu->cap); did++) { 4671 domain = get_iommu_domain(iommu, (u16)did); 4672 4673 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) 4674 continue; 4675 4676 free_cpu_cached_iovas(cpu, &domain->iovad); 4677 } 4678 } 4679 } 4680 4681 static int intel_iommu_cpu_dead(unsigned int cpu) 4682 { 4683 free_all_cpu_cached_iovas(cpu); 4684 return 0; 4685 } 4686 4687 static void intel_disable_iommus(void) 4688 { 4689 struct intel_iommu *iommu = NULL; 4690 struct dmar_drhd_unit *drhd; 4691 4692 for_each_iommu(iommu, drhd) 4693 iommu_disable_translation(iommu); 4694 } 4695 4696 void intel_iommu_shutdown(void) 4697 { 4698 struct dmar_drhd_unit *drhd; 4699 struct 
intel_iommu *iommu = NULL; 4700 4701 if (no_iommu || dmar_disabled) 4702 return; 4703 4704 down_write(&dmar_global_lock); 4705 4706 /* Disable PMRs explicitly here. */ 4707 for_each_iommu(iommu, drhd) 4708 iommu_disable_protect_mem_regions(iommu); 4709 4710 /* Make sure the IOMMUs are switched off */ 4711 intel_disable_iommus(); 4712 4713 up_write(&dmar_global_lock); 4714 } 4715 4716 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) 4717 { 4718 struct iommu_device *iommu_dev = dev_to_iommu_device(dev); 4719 4720 return container_of(iommu_dev, struct intel_iommu, iommu); 4721 } 4722 4723 static ssize_t intel_iommu_show_version(struct device *dev, 4724 struct device_attribute *attr, 4725 char *buf) 4726 { 4727 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4728 u32 ver = readl(iommu->reg + DMAR_VER_REG); 4729 return sprintf(buf, "%d:%d\n", 4730 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); 4731 } 4732 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); 4733 4734 static ssize_t intel_iommu_show_address(struct device *dev, 4735 struct device_attribute *attr, 4736 char *buf) 4737 { 4738 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4739 return sprintf(buf, "%llx\n", iommu->reg_phys); 4740 } 4741 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); 4742 4743 static ssize_t intel_iommu_show_cap(struct device *dev, 4744 struct device_attribute *attr, 4745 char *buf) 4746 { 4747 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4748 return sprintf(buf, "%llx\n", iommu->cap); 4749 } 4750 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); 4751 4752 static ssize_t intel_iommu_show_ecap(struct device *dev, 4753 struct device_attribute *attr, 4754 char *buf) 4755 { 4756 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4757 return sprintf(buf, "%llx\n", iommu->ecap); 4758 } 4759 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); 4760 4761 static ssize_t intel_iommu_show_ndoms(struct device *dev, 4762 struct device_attribute *attr, 4763 char *buf) 4764 { 4765 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4766 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); 4767 } 4768 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); 4769 4770 static ssize_t intel_iommu_show_ndoms_used(struct device *dev, 4771 struct device_attribute *attr, 4772 char *buf) 4773 { 4774 struct intel_iommu *iommu = dev_to_intel_iommu(dev); 4775 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, 4776 cap_ndoms(iommu->cap))); 4777 } 4778 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); 4779 4780 static struct attribute *intel_iommu_attrs[] = { 4781 &dev_attr_version.attr, 4782 &dev_attr_address.attr, 4783 &dev_attr_cap.attr, 4784 &dev_attr_ecap.attr, 4785 &dev_attr_domains_supported.attr, 4786 &dev_attr_domains_used.attr, 4787 NULL, 4788 }; 4789 4790 static struct attribute_group intel_iommu_group = { 4791 .name = "intel-iommu", 4792 .attrs = intel_iommu_attrs, 4793 }; 4794 4795 const struct attribute_group *intel_iommu_groups[] = { 4796 &intel_iommu_group, 4797 NULL, 4798 }; 4799 4800 static inline bool has_external_pci(void) 4801 { 4802 struct pci_dev *pdev = NULL; 4803 4804 for_each_pci_dev(pdev) 4805 if (pdev->external_facing) 4806 return true; 4807 4808 return false; 4809 } 4810 4811 static int __init platform_optin_force_iommu(void) 4812 { 4813 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) 4814 return 0; 4815 4816 if (no_iommu || 
dmar_disabled) 4817 pr_info("Intel-IOMMU force enabled due to platform opt in\n"); 4818 4819 /* 4820 * If Intel-IOMMU is disabled by default, we will apply identity 4821 * map for all devices except those marked as being untrusted. 4822 */ 4823 if (dmar_disabled) 4824 iommu_set_default_passthrough(false); 4825 4826 dmar_disabled = 0; 4827 no_iommu = 0; 4828 4829 return 1; 4830 } 4831 4832 static int __init probe_acpi_namespace_devices(void) 4833 { 4834 struct dmar_drhd_unit *drhd; 4835 /* To avoid a -Wunused-but-set-variable warning. */ 4836 struct intel_iommu *iommu __maybe_unused; 4837 struct device *dev; 4838 int i, ret = 0; 4839 4840 for_each_active_iommu(iommu, drhd) { 4841 for_each_active_dev_scope(drhd->devices, 4842 drhd->devices_cnt, i, dev) { 4843 struct acpi_device_physical_node *pn; 4844 struct iommu_group *group; 4845 struct acpi_device *adev; 4846 4847 if (dev->bus != &acpi_bus_type) 4848 continue; 4849 4850 adev = to_acpi_device(dev); 4851 mutex_lock(&adev->physical_node_lock); 4852 list_for_each_entry(pn, 4853 &adev->physical_node_list, node) { 4854 group = iommu_group_get(pn->dev); 4855 if (group) { 4856 iommu_group_put(group); 4857 continue; 4858 } 4859 4860 pn->dev->bus->iommu_ops = &intel_iommu_ops; 4861 ret = iommu_probe_device(pn->dev); 4862 if (ret) 4863 break; 4864 } 4865 mutex_unlock(&adev->physical_node_lock); 4866 4867 if (ret) 4868 return ret; 4869 } 4870 } 4871 4872 return 0; 4873 } 4874 4875 int __init intel_iommu_init(void) 4876 { 4877 int ret = -ENODEV; 4878 struct dmar_drhd_unit *drhd; 4879 struct intel_iommu *iommu; 4880 4881 /* 4882 * Intel IOMMU is required for a TXT/tboot launch or platform 4883 * opt in, so enforce that. 4884 */ 4885 force_on = tboot_force_iommu() || platform_optin_force_iommu(); 4886 4887 if (iommu_init_mempool()) { 4888 if (force_on) 4889 panic("tboot: Failed to initialize iommu memory\n"); 4890 return -ENOMEM; 4891 } 4892 4893 down_write(&dmar_global_lock); 4894 if (dmar_table_init()) { 4895 if (force_on) 4896 panic("tboot: Failed to initialize DMAR table\n"); 4897 goto out_free_dmar; 4898 } 4899 4900 if (dmar_dev_scope_init() < 0) { 4901 if (force_on) 4902 panic("tboot: Failed to initialize DMAR device scope\n"); 4903 goto out_free_dmar; 4904 } 4905 4906 up_write(&dmar_global_lock); 4907 4908 /* 4909 * The bus notifier takes the dmar_global_lock, so lockdep will 4910 * complain later when we register it under the lock. 4911 */ 4912 dmar_register_bus_notifier(); 4913 4914 down_write(&dmar_global_lock); 4915 4916 if (!no_iommu) 4917 intel_iommu_debugfs_init(); 4918 4919 if (no_iommu || dmar_disabled) { 4920 /* 4921 * We exit the function here to ensure IOMMU's remapping and 4922 * mempool aren't setup, which means that the IOMMU's PMRs 4923 * won't be disabled via the call to init_dmars(). So disable 4924 * it explicitly here. The PMRs were setup by tboot prior to 4925 * calling SENTER, but the kernel is expected to reset/tear 4926 * down the PMRs. 
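* This explicit PMR teardown is only attempted when intel_iommu_tboot_noforce is set, matching the check below.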
4927 */ 4928 if (intel_iommu_tboot_noforce) { 4929 for_each_iommu(iommu, drhd) 4930 iommu_disable_protect_mem_regions(iommu); 4931 } 4932 4933 /* 4934 * Make sure the IOMMUs are switched off, even when we 4935 * boot into a kexec kernel and the previous kernel left 4936 * them enabled 4937 */ 4938 intel_disable_iommus(); 4939 goto out_free_dmar; 4940 } 4941 4942 if (list_empty(&dmar_rmrr_units)) 4943 pr_info("No RMRR found\n"); 4944 4945 if (list_empty(&dmar_atsr_units)) 4946 pr_info("No ATSR found\n"); 4947 4948 if (dmar_init_reserved_ranges()) { 4949 if (force_on) 4950 panic("tboot: Failed to reserve iommu ranges\n"); 4951 goto out_free_reserved_range; 4952 } 4953 4954 if (dmar_map_gfx) 4955 intel_iommu_gfx_mapped = 1; 4956 4957 init_no_remapping_devices(); 4958 4959 ret = init_dmars(); 4960 if (ret) { 4961 if (force_on) 4962 panic("tboot: Failed to initialize DMARs\n"); 4963 pr_err("Initialization failed\n"); 4964 goto out_free_reserved_range; 4965 } 4966 up_write(&dmar_global_lock); 4967 4968 init_iommu_pm_ops(); 4969 4970 down_read(&dmar_global_lock); 4971 for_each_active_iommu(iommu, drhd) { 4972 iommu_device_sysfs_add(&iommu->iommu, NULL, 4973 intel_iommu_groups, 4974 "%s", iommu->name); 4975 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops); 4976 iommu_device_register(&iommu->iommu); 4977 } 4978 up_read(&dmar_global_lock); 4979 4980 bus_set_iommu(&pci_bus_type, &intel_iommu_ops); 4981 if (si_domain && !hw_pass_through) 4982 register_memory_notifier(&intel_iommu_memory_nb); 4983 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, 4984 intel_iommu_cpu_dead); 4985 4986 down_read(&dmar_global_lock); 4987 if (probe_acpi_namespace_devices()) 4988 pr_warn("ACPI name space devices didn't probe correctly\n"); 4989 4990 /* Finally, we enable the DMA remapping hardware. */ 4991 for_each_iommu(iommu, drhd) { 4992 if (!drhd->ignored && !translation_pre_enabled(iommu)) 4993 iommu_enable_translation(iommu); 4994 4995 iommu_disable_protect_mem_regions(iommu); 4996 } 4997 up_read(&dmar_global_lock); 4998 4999 pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); 5000 5001 intel_iommu_enabled = 1; 5002 5003 return 0; 5004 5005 out_free_reserved_range: 5006 put_iova_domain(&reserved_iova_list); 5007 out_free_dmar: 5008 intel_iommu_free_dmars(); 5009 up_write(&dmar_global_lock); 5010 iommu_exit_mempool(); 5011 return ret; 5012 } 5013 5014 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) 5015 { 5016 struct intel_iommu *iommu = opaque; 5017 5018 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); 5019 return 0; 5020 } 5021 5022 /* 5023 * NB - intel-iommu lacks any sort of reference counting for the users of 5024 * dependent devices. If multiple endpoints have intersecting dependent 5025 * devices, unbinding the driver from any one of them will possibly leave 5026 * the others unable to operate. 
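* domain_context_clear() below clears the context entry for every DMA alias of the device, so endpoints that share an alias can lose their mappings, as noted above.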
5027 */ 5028 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) 5029 { 5030 if (!iommu || !dev || !dev_is_pci(dev)) 5031 return; 5032 5033 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); 5034 } 5035 5036 static void __dmar_remove_one_dev_info(struct device_domain_info *info) 5037 { 5038 struct dmar_domain *domain; 5039 struct intel_iommu *iommu; 5040 unsigned long flags; 5041 5042 assert_spin_locked(&device_domain_lock); 5043 5044 if (WARN_ON(!info)) 5045 return; 5046 5047 iommu = info->iommu; 5048 domain = info->domain; 5049 5050 if (info->dev) { 5051 if (dev_is_pci(info->dev) && sm_supported(iommu)) 5052 intel_pasid_tear_down_entry(iommu, info->dev, 5053 PASID_RID2PASID, false); 5054 5055 iommu_disable_dev_iotlb(info); 5056 if (!dev_is_real_dma_subdevice(info->dev)) 5057 domain_context_clear(iommu, info->dev); 5058 intel_pasid_free_table(info->dev); 5059 } 5060 5061 unlink_domain_info(info); 5062 5063 spin_lock_irqsave(&iommu->lock, flags); 5064 domain_detach_iommu(domain, iommu); 5065 spin_unlock_irqrestore(&iommu->lock, flags); 5066 5067 free_devinfo_mem(info); 5068 } 5069 5070 static void dmar_remove_one_dev_info(struct device *dev) 5071 { 5072 struct device_domain_info *info; 5073 unsigned long flags; 5074 5075 spin_lock_irqsave(&device_domain_lock, flags); 5076 info = get_domain_info(dev); 5077 if (info) 5078 __dmar_remove_one_dev_info(info); 5079 spin_unlock_irqrestore(&device_domain_lock, flags); 5080 } 5081 5082 static int md_domain_init(struct dmar_domain *domain, int guest_width) 5083 { 5084 int adjust_width; 5085 5086 /* calculate AGAW */ 5087 domain->gaw = guest_width; 5088 adjust_width = guestwidth_to_adjustwidth(guest_width); 5089 domain->agaw = width_to_agaw(adjust_width); 5090 5091 domain->iommu_coherency = 0; 5092 domain->iommu_snooping = 0; 5093 domain->iommu_superpage = 0; 5094 domain->max_addr = 0; 5095 5096 /* always allocate the top pgd */ 5097 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); 5098 if (!domain->pgd) 5099 return -ENOMEM; 5100 domain_flush_cache(domain, domain->pgd, PAGE_SIZE); 5101 return 0; 5102 } 5103 5104 static void intel_init_iova_domain(struct dmar_domain *dmar_domain) 5105 { 5106 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); 5107 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); 5108 5109 if (!intel_iommu_strict && 5110 init_iova_flush_queue(&dmar_domain->iovad, 5111 iommu_flush_iova, iova_entry_free)) 5112 pr_info("iova flush queue initialization failed\n"); 5113 } 5114 5115 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) 5116 { 5117 struct dmar_domain *dmar_domain; 5118 struct iommu_domain *domain; 5119 5120 switch (type) { 5121 case IOMMU_DOMAIN_DMA: 5122 case IOMMU_DOMAIN_UNMANAGED: 5123 dmar_domain = alloc_domain(0); 5124 if (!dmar_domain) { 5125 pr_err("Can't allocate dmar_domain\n"); 5126 return NULL; 5127 } 5128 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { 5129 pr_err("Domain initialization failed\n"); 5130 domain_exit(dmar_domain); 5131 return NULL; 5132 } 5133 5134 if (type == IOMMU_DOMAIN_DMA) 5135 intel_init_iova_domain(dmar_domain); 5136 5137 domain = &dmar_domain->domain; 5138 domain->geometry.aperture_start = 0; 5139 domain->geometry.aperture_end = 5140 __DOMAIN_MAX_ADDR(dmar_domain->gaw); 5141 domain->geometry.force_aperture = true; 5142 5143 return domain; 5144 case IOMMU_DOMAIN_IDENTITY: 5145 return &si_domain->domain; 5146 default: 5147 return NULL; 5148 } 5149 5150 return 
NULL; 5151 } 5152 5153 static void intel_iommu_domain_free(struct iommu_domain *domain) 5154 { 5155 if (domain != &si_domain->domain) 5156 domain_exit(to_dmar_domain(domain)); 5157 } 5158 5159 /* 5160 * Check whether a @domain could be attached to the @dev through the 5161 * aux-domain attach/detach APIs. 5162 */ 5163 static inline bool 5164 is_aux_domain(struct device *dev, struct iommu_domain *domain) 5165 { 5166 struct device_domain_info *info = get_domain_info(dev); 5167 5168 return info && info->auxd_enabled && 5169 domain->type == IOMMU_DOMAIN_UNMANAGED; 5170 } 5171 5172 static void auxiliary_link_device(struct dmar_domain *domain, 5173 struct device *dev) 5174 { 5175 struct device_domain_info *info = get_domain_info(dev); 5176 5177 assert_spin_locked(&device_domain_lock); 5178 if (WARN_ON(!info)) 5179 return; 5180 5181 domain->auxd_refcnt++; 5182 list_add(&domain->auxd, &info->auxiliary_domains); 5183 } 5184 5185 static void auxiliary_unlink_device(struct dmar_domain *domain, 5186 struct device *dev) 5187 { 5188 struct device_domain_info *info = get_domain_info(dev); 5189 5190 assert_spin_locked(&device_domain_lock); 5191 if (WARN_ON(!info)) 5192 return; 5193 5194 list_del(&domain->auxd); 5195 domain->auxd_refcnt--; 5196 5197 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5198 ioasid_free(domain->default_pasid); 5199 } 5200 5201 static int aux_domain_add_dev(struct dmar_domain *domain, 5202 struct device *dev) 5203 { 5204 int ret; 5205 unsigned long flags; 5206 struct intel_iommu *iommu; 5207 5208 iommu = device_to_iommu(dev, NULL, NULL); 5209 if (!iommu) 5210 return -ENODEV; 5211 5212 if (domain->default_pasid <= 0) { 5213 u32 pasid; 5214 5215 /* No private data needed for the default pasid */ 5216 pasid = ioasid_alloc(NULL, PASID_MIN, 5217 pci_max_pasids(to_pci_dev(dev)) - 1, 5218 NULL); 5219 if (pasid == INVALID_IOASID) { 5220 pr_err("Can't allocate default pasid\n"); 5221 return -ENODEV; 5222 } 5223 domain->default_pasid = pasid; 5224 } 5225 5226 spin_lock_irqsave(&device_domain_lock, flags); 5227 /* 5228 * iommu->lock must be held to attach domain to iommu and setup the 5229 * pasid entry for second level translation. 
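* Lock ordering: device_domain_lock (taken above) nests outside iommu->lock, matching the ordering used elsewhere in this file.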
5230 */ 5231 spin_lock(&iommu->lock); 5232 ret = domain_attach_iommu(domain, iommu); 5233 if (ret) 5234 goto attach_failed; 5235 5236 /* Setup the PASID entry for mediated devices: */ 5237 if (domain_use_first_level(domain)) 5238 ret = domain_setup_first_level(iommu, domain, dev, 5239 domain->default_pasid); 5240 else 5241 ret = intel_pasid_setup_second_level(iommu, domain, dev, 5242 domain->default_pasid); 5243 if (ret) 5244 goto table_failed; 5245 spin_unlock(&iommu->lock); 5246 5247 auxiliary_link_device(domain, dev); 5248 5249 spin_unlock_irqrestore(&device_domain_lock, flags); 5250 5251 return 0; 5252 5253 table_failed: 5254 domain_detach_iommu(domain, iommu); 5255 attach_failed: 5256 spin_unlock(&iommu->lock); 5257 spin_unlock_irqrestore(&device_domain_lock, flags); 5258 if (!domain->auxd_refcnt && domain->default_pasid > 0) 5259 ioasid_free(domain->default_pasid); 5260 5261 return ret; 5262 } 5263 5264 static void aux_domain_remove_dev(struct dmar_domain *domain, 5265 struct device *dev) 5266 { 5267 struct device_domain_info *info; 5268 struct intel_iommu *iommu; 5269 unsigned long flags; 5270 5271 if (!is_aux_domain(dev, &domain->domain)) 5272 return; 5273 5274 spin_lock_irqsave(&device_domain_lock, flags); 5275 info = get_domain_info(dev); 5276 iommu = info->iommu; 5277 5278 auxiliary_unlink_device(domain, dev); 5279 5280 spin_lock(&iommu->lock); 5281 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); 5282 domain_detach_iommu(domain, iommu); 5283 spin_unlock(&iommu->lock); 5284 5285 spin_unlock_irqrestore(&device_domain_lock, flags); 5286 } 5287 5288 static int prepare_domain_attach_device(struct iommu_domain *domain, 5289 struct device *dev) 5290 { 5291 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5292 struct intel_iommu *iommu; 5293 int addr_width; 5294 5295 iommu = device_to_iommu(dev, NULL, NULL); 5296 if (!iommu) 5297 return -ENODEV; 5298 5299 /* check if this iommu agaw is sufficient for max mapped address */ 5300 addr_width = agaw_to_width(iommu->agaw); 5301 if (addr_width > cap_mgaw(iommu->cap)) 5302 addr_width = cap_mgaw(iommu->cap); 5303 5304 if (dmar_domain->max_addr > (1LL << addr_width)) { 5305 dev_err(dev, "%s: iommu width (%d) is not " 5306 "sufficient for the mapped address (%llx)\n", 5307 __func__, addr_width, dmar_domain->max_addr); 5308 return -EFAULT; 5309 } 5310 dmar_domain->gaw = addr_width; 5311 5312 /* 5313 * Knock out extra levels of page tables if necessary 5314 */ 5315 while (iommu->agaw < dmar_domain->agaw) { 5316 struct dma_pte *pte; 5317 5318 pte = dmar_domain->pgd; 5319 if (dma_pte_present(pte)) { 5320 dmar_domain->pgd = (struct dma_pte *) 5321 phys_to_virt(dma_pte_addr(pte)); 5322 free_pgtable_page(pte); 5323 } 5324 dmar_domain->agaw--; 5325 } 5326 5327 return 0; 5328 } 5329 5330 static int intel_iommu_attach_device(struct iommu_domain *domain, 5331 struct device *dev) 5332 { 5333 int ret; 5334 5335 if (domain->type == IOMMU_DOMAIN_UNMANAGED && 5336 device_is_rmrr_locked(dev)) { 5337 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. 
Contact your platform vendor.\n"); 5338 return -EPERM; 5339 } 5340 5341 if (is_aux_domain(dev, domain)) 5342 return -EPERM; 5343 5344 /* normally dev is not mapped */ 5345 if (unlikely(domain_context_mapped(dev))) { 5346 struct dmar_domain *old_domain; 5347 5348 old_domain = find_domain(dev); 5349 if (old_domain) 5350 dmar_remove_one_dev_info(dev); 5351 } 5352 5353 ret = prepare_domain_attach_device(domain, dev); 5354 if (ret) 5355 return ret; 5356 5357 return domain_add_dev_info(to_dmar_domain(domain), dev); 5358 } 5359 5360 static int intel_iommu_aux_attach_device(struct iommu_domain *domain, 5361 struct device *dev) 5362 { 5363 int ret; 5364 5365 if (!is_aux_domain(dev, domain)) 5366 return -EPERM; 5367 5368 ret = prepare_domain_attach_device(domain, dev); 5369 if (ret) 5370 return ret; 5371 5372 return aux_domain_add_dev(to_dmar_domain(domain), dev); 5373 } 5374 5375 static void intel_iommu_detach_device(struct iommu_domain *domain, 5376 struct device *dev) 5377 { 5378 dmar_remove_one_dev_info(dev); 5379 } 5380 5381 static void intel_iommu_aux_detach_device(struct iommu_domain *domain, 5382 struct device *dev) 5383 { 5384 aux_domain_remove_dev(to_dmar_domain(domain), dev); 5385 } 5386 5387 /* 5388 * 2D array for converting and sanitizing IOMMU generic TLB granularity to 5389 * VT-d granularity. Invalidation is typically included in the unmap operation 5390 * as a result of DMA or VFIO unmap. However, for assigned devices guest 5391 * owns the first level page tables. Invalidations of translation caches in the 5392 * guest are trapped and passed down to the host. 5393 * 5394 * vIOMMU in the guest will only expose first level page tables, therefore 5395 * we do not support IOTLB granularity for request without PASID (second level). 5396 * 5397 * For example, to find the VT-d granularity encoding for IOTLB 5398 * type and page selective granularity within PASID: 5399 * X: indexed by iommu cache type 5400 * Y: indexed by enum iommu_inv_granularity 5401 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] 5402 */ 5403 5404 static const int 5405 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { 5406 /* 5407 * PASID based IOTLB invalidation: PASID selective (per PASID), 5408 * page selective (address granularity) 5409 */ 5410 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, 5411 /* PASID based dev TLBs */ 5412 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, 5413 /* PASID cache */ 5414 {-EINVAL, -EINVAL, -EINVAL} 5415 }; 5416 5417 static inline int to_vtd_granularity(int type, int granu) 5418 { 5419 return inv_type_granu_table[type][granu]; 5420 } 5421 5422 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) 5423 { 5424 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; 5425 5426 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. 5427 * IOMMU cache invalidate API passes granu_size in bytes, and number of 5428 * granu size in contiguous memory. 
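* Worked example: granu_size = 4KiB and nr_granules = 512 gives nr_pages = 512, so order_base_2(512) = 9, i.e. a 2MB invalidation.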
5429 */ 5430 return order_base_2(nr_pages); 5431 } 5432 5433 #ifdef CONFIG_INTEL_IOMMU_SVM 5434 static int 5435 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, 5436 struct iommu_cache_invalidate_info *inv_info) 5437 { 5438 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5439 struct device_domain_info *info; 5440 struct intel_iommu *iommu; 5441 unsigned long flags; 5442 int cache_type; 5443 u8 bus, devfn; 5444 u16 did, sid; 5445 int ret = 0; 5446 u64 size = 0; 5447 5448 if (!inv_info || !dmar_domain) 5449 return -EINVAL; 5450 5451 if (!dev || !dev_is_pci(dev)) 5452 return -ENODEV; 5453 5454 iommu = device_to_iommu(dev, &bus, &devfn); 5455 if (!iommu) 5456 return -ENODEV; 5457 5458 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) 5459 return -EINVAL; 5460 5461 spin_lock_irqsave(&device_domain_lock, flags); 5462 spin_lock(&iommu->lock); 5463 info = get_domain_info(dev); 5464 if (!info) { 5465 ret = -EINVAL; 5466 goto out_unlock; 5467 } 5468 did = dmar_domain->iommu_did[iommu->seq_id]; 5469 sid = PCI_DEVID(bus, devfn); 5470 5471 /* Size is only valid in address selective invalidation */ 5472 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) 5473 size = to_vtd_size(inv_info->granu.addr_info.granule_size, 5474 inv_info->granu.addr_info.nb_granules); 5475 5476 for_each_set_bit(cache_type, 5477 (unsigned long *)&inv_info->cache, 5478 IOMMU_CACHE_INV_TYPE_NR) { 5479 int granu = 0; 5480 u64 pasid = 0; 5481 u64 addr = 0; 5482 5483 granu = to_vtd_granularity(cache_type, inv_info->granularity); 5484 if (granu == -EINVAL) { 5485 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", 5486 cache_type, inv_info->granularity); 5487 break; 5488 } 5489 5490 /* 5491 * PASID is stored in different locations based on the 5492 * granularity. 5493 */ 5494 if (inv_info->granularity == IOMMU_INV_GRANU_PASID && 5495 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) 5496 pasid = inv_info->granu.pasid_info.pasid; 5497 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5498 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) 5499 pasid = inv_info->granu.addr_info.pasid; 5500 5501 switch (BIT(cache_type)) { 5502 case IOMMU_CACHE_INV_TYPE_IOTLB: 5503 /* HW will ignore LSB bits based on address mask */ 5504 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && 5505 size && 5506 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { 5507 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n", 5508 inv_info->granu.addr_info.addr, size); 5509 } 5510 5511 /* 5512 * If granu is PASID-selective, address is ignored. 5513 * We use npages = -1 to indicate that. 5514 */ 5515 qi_flush_piotlb(iommu, did, pasid, 5516 mm_to_dma_pfn(inv_info->granu.addr_info.addr), 5517 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, 5518 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); 5519 5520 if (!info->ats_enabled) 5521 break; 5522 /* 5523 * Always flush device IOTLB if ATS is enabled. vIOMMU 5524 * in the guest may assume IOTLB flush is inclusive, 5525 * which is more efficient. 5526 */ 5527 fallthrough; 5528 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: 5529 /* 5530 * PASID based device TLB invalidation does not support 5531 * IOMMU_INV_GRANU_PASID granularity but only supports 5532 * IOMMU_INV_GRANU_ADDR. 5533 * The equivalent of that is we set the size to be the 5534 * entire range of 64 bit. User only provides PASID info 5535 * without address info. So we set addr to 0. 
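* With addr == 0 and size == 64 - VTD_PAGE_SHIFT, the flush issued below covers the entire 64-bit address range for that PASID.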
5536 */ 5537 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) { 5538 size = 64 - VTD_PAGE_SHIFT; 5539 addr = 0; 5540 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) { 5541 addr = inv_info->granu.addr_info.addr; 5542 } 5543 5544 if (info->ats_enabled) 5545 qi_flush_dev_iotlb_pasid(iommu, sid, 5546 info->pfsid, pasid, 5547 info->ats_qdep, addr, 5548 size); 5549 else 5550 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); 5551 break; 5552 default: 5553 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", 5554 cache_type); 5555 ret = -EINVAL; 5556 } 5557 } 5558 out_unlock: 5559 spin_unlock(&iommu->lock); 5560 spin_unlock_irqrestore(&device_domain_lock, flags); 5561 5562 return ret; 5563 } 5564 #endif 5565 5566 static int intel_iommu_map(struct iommu_domain *domain, 5567 unsigned long iova, phys_addr_t hpa, 5568 size_t size, int iommu_prot, gfp_t gfp) 5569 { 5570 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5571 u64 max_addr; 5572 int prot = 0; 5573 int ret; 5574 5575 if (iommu_prot & IOMMU_READ) 5576 prot |= DMA_PTE_READ; 5577 if (iommu_prot & IOMMU_WRITE) 5578 prot |= DMA_PTE_WRITE; 5579 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) 5580 prot |= DMA_PTE_SNP; 5581 5582 max_addr = iova + size; 5583 if (dmar_domain->max_addr < max_addr) { 5584 u64 end; 5585 5586 /* check if minimum agaw is sufficient for mapped address */ 5587 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; 5588 if (end < max_addr) { 5589 pr_err("%s: iommu width (%d) is not " 5590 "sufficient for the mapped address (%llx)\n", 5591 __func__, dmar_domain->gaw, max_addr); 5592 return -EFAULT; 5593 } 5594 dmar_domain->max_addr = max_addr; 5595 } 5596 /* Round up size to next multiple of PAGE_SIZE, if it and 5597 the low bits of hpa would take us onto the next page */ 5598 size = aligned_nrpages(hpa, size); 5599 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, 5600 hpa >> VTD_PAGE_SHIFT, size, prot); 5601 return ret; 5602 } 5603 5604 static size_t intel_iommu_unmap(struct iommu_domain *domain, 5605 unsigned long iova, size_t size, 5606 struct iommu_iotlb_gather *gather) 5607 { 5608 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5609 struct page *freelist = NULL; 5610 unsigned long start_pfn, last_pfn; 5611 unsigned int npages; 5612 int iommu_id, level = 0; 5613 5614 /* Cope with horrid API which requires us to unmap more than the 5615 size argument if it happens to be a large-page mapping. 
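Below, size is rounded up to the page size of the level at which the IOVA is actually mapped, so a large-page mapping is always unmapped as a whole.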
*/ 5616 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); 5617 5618 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) 5619 size = VTD_PAGE_SIZE << level_to_offset_bits(level); 5620 5621 start_pfn = iova >> VTD_PAGE_SHIFT; 5622 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; 5623 5624 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn); 5625 5626 npages = last_pfn - start_pfn + 1; 5627 5628 for_each_domain_iommu(iommu_id, dmar_domain) 5629 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, 5630 start_pfn, npages, !freelist, 0); 5631 5632 dma_free_pagelist(freelist); 5633 5634 if (dmar_domain->max_addr == iova + size) 5635 dmar_domain->max_addr = iova; 5636 5637 return size; 5638 } 5639 5640 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, 5641 dma_addr_t iova) 5642 { 5643 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5644 struct dma_pte *pte; 5645 int level = 0; 5646 u64 phys = 0; 5647 5648 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); 5649 if (pte && dma_pte_present(pte)) 5650 phys = dma_pte_addr(pte) + 5651 (iova & (BIT_MASK(level_to_offset_bits(level) + 5652 VTD_PAGE_SHIFT) - 1)); 5653 5654 return phys; 5655 } 5656 5657 static inline bool scalable_mode_support(void) 5658 { 5659 struct dmar_drhd_unit *drhd; 5660 struct intel_iommu *iommu; 5661 bool ret = true; 5662 5663 rcu_read_lock(); 5664 for_each_active_iommu(iommu, drhd) { 5665 if (!sm_supported(iommu)) { 5666 ret = false; 5667 break; 5668 } 5669 } 5670 rcu_read_unlock(); 5671 5672 return ret; 5673 } 5674 5675 static inline bool iommu_pasid_support(void) 5676 { 5677 struct dmar_drhd_unit *drhd; 5678 struct intel_iommu *iommu; 5679 bool ret = true; 5680 5681 rcu_read_lock(); 5682 for_each_active_iommu(iommu, drhd) { 5683 if (!pasid_supported(iommu)) { 5684 ret = false; 5685 break; 5686 } 5687 } 5688 rcu_read_unlock(); 5689 5690 return ret; 5691 } 5692 5693 static inline bool nested_mode_support(void) 5694 { 5695 struct dmar_drhd_unit *drhd; 5696 struct intel_iommu *iommu; 5697 bool ret = true; 5698 5699 rcu_read_lock(); 5700 for_each_active_iommu(iommu, drhd) { 5701 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { 5702 ret = false; 5703 break; 5704 } 5705 } 5706 rcu_read_unlock(); 5707 5708 return ret; 5709 } 5710 5711 static bool intel_iommu_capable(enum iommu_cap cap) 5712 { 5713 if (cap == IOMMU_CAP_CACHE_COHERENCY) 5714 return domain_update_iommu_snooping(NULL) == 1; 5715 if (cap == IOMMU_CAP_INTR_REMAP) 5716 return irq_remapping_enabled == 1; 5717 5718 return false; 5719 } 5720 5721 static struct iommu_device *intel_iommu_probe_device(struct device *dev) 5722 { 5723 struct intel_iommu *iommu; 5724 5725 iommu = device_to_iommu(dev, NULL, NULL); 5726 if (!iommu) 5727 return ERR_PTR(-ENODEV); 5728 5729 if (translation_pre_enabled(iommu)) 5730 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO); 5731 5732 return &iommu->iommu; 5733 } 5734 5735 static void intel_iommu_release_device(struct device *dev) 5736 { 5737 struct intel_iommu *iommu; 5738 5739 iommu = device_to_iommu(dev, NULL, NULL); 5740 if (!iommu) 5741 return; 5742 5743 dmar_remove_one_dev_info(dev); 5744 5745 set_dma_ops(dev, NULL); 5746 } 5747 5748 static void intel_iommu_probe_finalize(struct device *dev) 5749 { 5750 struct iommu_domain *domain; 5751 5752 domain = iommu_get_domain_for_dev(dev); 5753 if (device_needs_bounce(dev)) 5754 set_dma_ops(dev, &bounce_dma_ops); 5755 else if (domain && domain->type == IOMMU_DOMAIN_DMA) 5756 set_dma_ops(dev, &intel_dma_ops); 5757 else 
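/* neither bounce buffering nor an IOMMU-managed DMA domain applies, so restore the default (direct) DMA ops */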
5758 set_dma_ops(dev, NULL); 5759 } 5760 5761 static void intel_iommu_get_resv_regions(struct device *device, 5762 struct list_head *head) 5763 { 5764 int prot = DMA_PTE_READ | DMA_PTE_WRITE; 5765 struct iommu_resv_region *reg; 5766 struct dmar_rmrr_unit *rmrr; 5767 struct device *i_dev; 5768 int i; 5769 5770 down_read(&dmar_global_lock); 5771 for_each_rmrr_units(rmrr) { 5772 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, 5773 i, i_dev) { 5774 struct iommu_resv_region *resv; 5775 enum iommu_resv_type type; 5776 size_t length; 5777 5778 if (i_dev != device && 5779 !is_downstream_to_pci_bridge(device, i_dev)) 5780 continue; 5781 5782 length = rmrr->end_address - rmrr->base_address + 1; 5783 5784 type = device_rmrr_is_relaxable(device) ? 5785 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; 5786 5787 resv = iommu_alloc_resv_region(rmrr->base_address, 5788 length, prot, type); 5789 if (!resv) 5790 break; 5791 5792 list_add_tail(&resv->list, head); 5793 } 5794 } 5795 up_read(&dmar_global_lock); 5796 5797 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA 5798 if (dev_is_pci(device)) { 5799 struct pci_dev *pdev = to_pci_dev(device); 5800 5801 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { 5802 reg = iommu_alloc_resv_region(0, 1UL << 24, prot, 5803 IOMMU_RESV_DIRECT_RELAXABLE); 5804 if (reg) 5805 list_add_tail(®->list, head); 5806 } 5807 } 5808 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ 5809 5810 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, 5811 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 5812 0, IOMMU_RESV_MSI); 5813 if (!reg) 5814 return; 5815 list_add_tail(®->list, head); 5816 } 5817 5818 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) 5819 { 5820 struct device_domain_info *info; 5821 struct context_entry *context; 5822 struct dmar_domain *domain; 5823 unsigned long flags; 5824 u64 ctx_lo; 5825 int ret; 5826 5827 domain = find_domain(dev); 5828 if (!domain) 5829 return -EINVAL; 5830 5831 spin_lock_irqsave(&device_domain_lock, flags); 5832 spin_lock(&iommu->lock); 5833 5834 ret = -EINVAL; 5835 info = get_domain_info(dev); 5836 if (!info || !info->pasid_supported) 5837 goto out; 5838 5839 context = iommu_context_addr(iommu, info->bus, info->devfn, 0); 5840 if (WARN_ON(!context)) 5841 goto out; 5842 5843 ctx_lo = context[0].lo; 5844 5845 if (!(ctx_lo & CONTEXT_PASIDE)) { 5846 ctx_lo |= CONTEXT_PASIDE; 5847 context[0].lo = ctx_lo; 5848 wmb(); 5849 iommu->flush.flush_context(iommu, 5850 domain->iommu_did[iommu->seq_id], 5851 PCI_DEVID(info->bus, info->devfn), 5852 DMA_CCMD_MASK_NOBIT, 5853 DMA_CCMD_DEVICE_INVL); 5854 } 5855 5856 /* Enable PASID support in the device, if it wasn't already */ 5857 if (!info->pasid_enabled) 5858 iommu_enable_dev_iotlb(info); 5859 5860 ret = 0; 5861 5862 out: 5863 spin_unlock(&iommu->lock); 5864 spin_unlock_irqrestore(&device_domain_lock, flags); 5865 5866 return ret; 5867 } 5868 5869 static void intel_iommu_apply_resv_region(struct device *dev, 5870 struct iommu_domain *domain, 5871 struct iommu_resv_region *region) 5872 { 5873 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 5874 unsigned long start, end; 5875 5876 start = IOVA_PFN(region->start); 5877 end = IOVA_PFN(region->start + region->length - 1); 5878 5879 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); 5880 } 5881 5882 static struct iommu_group *intel_iommu_device_group(struct device *dev) 5883 { 5884 if (dev_is_pci(dev)) 5885 return pci_device_group(dev); 5886 return generic_device_group(dev); 5887 } 5888 5889 static int intel_iommu_enable_auxd(struct 
device *dev) 5890 { 5891 struct device_domain_info *info; 5892 struct intel_iommu *iommu; 5893 unsigned long flags; 5894 int ret; 5895 5896 iommu = device_to_iommu(dev, NULL, NULL); 5897 if (!iommu || dmar_disabled) 5898 return -EINVAL; 5899 5900 if (!sm_supported(iommu) || !pasid_supported(iommu)) 5901 return -EINVAL; 5902 5903 ret = intel_iommu_enable_pasid(iommu, dev); 5904 if (ret) 5905 return -ENODEV; 5906 5907 spin_lock_irqsave(&device_domain_lock, flags); 5908 info = get_domain_info(dev); 5909 info->auxd_enabled = 1; 5910 spin_unlock_irqrestore(&device_domain_lock, flags); 5911 5912 return 0; 5913 } 5914 5915 static int intel_iommu_disable_auxd(struct device *dev) 5916 { 5917 struct device_domain_info *info; 5918 unsigned long flags; 5919 5920 spin_lock_irqsave(&device_domain_lock, flags); 5921 info = get_domain_info(dev); 5922 if (!WARN_ON(!info)) 5923 info->auxd_enabled = 0; 5924 spin_unlock_irqrestore(&device_domain_lock, flags); 5925 5926 return 0; 5927 } 5928 5929 /* 5930 * A PCI express designated vendor specific extended capability is defined 5931 * in the section 3.7 of Intel scalable I/O virtualization technical spec 5932 * for system software and tools to detect endpoint devices supporting the 5933 * Intel scalable IO virtualization without host driver dependency. 5934 * 5935 * Returns the address of the matching extended capability structure within 5936 * the device's PCI configuration space or 0 if the device does not support 5937 * it. 5938 */ 5939 static int siov_find_pci_dvsec(struct pci_dev *pdev) 5940 { 5941 int pos; 5942 u16 vendor, id; 5943 5944 pos = pci_find_next_ext_capability(pdev, 0, 0x23); 5945 while (pos) { 5946 pci_read_config_word(pdev, pos + 4, &vendor); 5947 pci_read_config_word(pdev, pos + 8, &id); 5948 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) 5949 return pos; 5950 5951 pos = pci_find_next_ext_capability(pdev, pos, 0x23); 5952 } 5953 5954 return 0; 5955 } 5956 5957 static bool 5958 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) 5959 { 5960 if (feat == IOMMU_DEV_FEAT_AUX) { 5961 int ret; 5962 5963 if (!dev_is_pci(dev) || dmar_disabled || 5964 !scalable_mode_support() || !iommu_pasid_support()) 5965 return false; 5966 5967 ret = pci_pasid_features(to_pci_dev(dev)); 5968 if (ret < 0) 5969 return false; 5970 5971 return !!siov_find_pci_dvsec(to_pci_dev(dev)); 5972 } 5973 5974 if (feat == IOMMU_DEV_FEAT_SVA) { 5975 struct device_domain_info *info = get_domain_info(dev); 5976 5977 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && 5978 info->pasid_supported && info->pri_supported && 5979 info->ats_supported; 5980 } 5981 5982 return false; 5983 } 5984 5985 static int 5986 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) 5987 { 5988 if (feat == IOMMU_DEV_FEAT_AUX) 5989 return intel_iommu_enable_auxd(dev); 5990 5991 if (feat == IOMMU_DEV_FEAT_SVA) { 5992 struct device_domain_info *info = get_domain_info(dev); 5993 5994 if (!info) 5995 return -EINVAL; 5996 5997 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) 5998 return 0; 5999 } 6000 6001 return -ENODEV; 6002 } 6003 6004 static int 6005 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) 6006 { 6007 if (feat == IOMMU_DEV_FEAT_AUX) 6008 return intel_iommu_disable_auxd(dev); 6009 6010 return -ENODEV; 6011 } 6012 6013 static bool 6014 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) 6015 { 6016 struct device_domain_info *info = get_domain_info(dev); 6017 6018 if (feat == 
IOMMU_DEV_FEAT_AUX) 6019 return scalable_mode_support() && info && info->auxd_enabled; 6020 6021 return false; 6022 } 6023 6024 static int 6025 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) 6026 { 6027 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 6028 6029 return dmar_domain->default_pasid > 0 ? 6030 dmar_domain->default_pasid : -EINVAL; 6031 } 6032 6033 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, 6034 struct device *dev) 6035 { 6036 return attach_deferred(dev); 6037 } 6038 6039 static int 6040 intel_iommu_domain_set_attr(struct iommu_domain *domain, 6041 enum iommu_attr attr, void *data) 6042 { 6043 struct dmar_domain *dmar_domain = to_dmar_domain(domain); 6044 unsigned long flags; 6045 int ret = 0; 6046 6047 if (domain->type != IOMMU_DOMAIN_UNMANAGED) 6048 return -EINVAL; 6049 6050 switch (attr) { 6051 case DOMAIN_ATTR_NESTING: 6052 spin_lock_irqsave(&device_domain_lock, flags); 6053 if (nested_mode_support() && 6054 list_empty(&dmar_domain->devices)) { 6055 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; 6056 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; 6057 } else { 6058 ret = -ENODEV; 6059 } 6060 spin_unlock_irqrestore(&device_domain_lock, flags); 6061 break; 6062 default: 6063 ret = -EINVAL; 6064 break; 6065 } 6066 6067 return ret; 6068 } 6069 6070 /* 6071 * Check that the device does not live on an external facing PCI port that is 6072 * marked as untrusted. Such devices should not be able to apply quirks and 6073 * thus not be able to bypass the IOMMU restrictions. 6074 */ 6075 static bool risky_device(struct pci_dev *pdev) 6076 { 6077 if (pdev->untrusted) { 6078 pci_info(pdev, 6079 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", 6080 pdev->vendor, pdev->device); 6081 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); 6082 return true; 6083 } 6084 return false; 6085 } 6086 6087 const struct iommu_ops intel_iommu_ops = { 6088 .capable = intel_iommu_capable, 6089 .domain_alloc = intel_iommu_domain_alloc, 6090 .domain_free = intel_iommu_domain_free, 6091 .domain_set_attr = intel_iommu_domain_set_attr, 6092 .attach_dev = intel_iommu_attach_device, 6093 .detach_dev = intel_iommu_detach_device, 6094 .aux_attach_dev = intel_iommu_aux_attach_device, 6095 .aux_detach_dev = intel_iommu_aux_detach_device, 6096 .aux_get_pasid = intel_iommu_aux_get_pasid, 6097 .map = intel_iommu_map, 6098 .unmap = intel_iommu_unmap, 6099 .iova_to_phys = intel_iommu_iova_to_phys, 6100 .probe_device = intel_iommu_probe_device, 6101 .probe_finalize = intel_iommu_probe_finalize, 6102 .release_device = intel_iommu_release_device, 6103 .get_resv_regions = intel_iommu_get_resv_regions, 6104 .put_resv_regions = generic_iommu_put_resv_regions, 6105 .apply_resv_region = intel_iommu_apply_resv_region, 6106 .device_group = intel_iommu_device_group, 6107 .dev_has_feat = intel_iommu_dev_has_feat, 6108 .dev_feat_enabled = intel_iommu_dev_feat_enabled, 6109 .dev_enable_feat = intel_iommu_dev_enable_feat, 6110 .dev_disable_feat = intel_iommu_dev_disable_feat, 6111 .is_attach_deferred = intel_iommu_is_attach_deferred, 6112 .def_domain_type = device_def_domain_type, 6113 .pgsize_bitmap = INTEL_IOMMU_PGSIZES, 6114 #ifdef CONFIG_INTEL_IOMMU_SVM 6115 .cache_invalidate = intel_iommu_sva_invalidate, 6116 .sva_bind_gpasid = intel_svm_bind_gpasid, 6117 .sva_unbind_gpasid = intel_svm_unbind_gpasid, 6118 .sva_bind = intel_svm_bind, 6119 .sva_unbind = intel_svm_unbind, 6120 .sva_get_pasid = intel_svm_get_pasid, 6121 
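/* respond to recoverable page faults raised through the page request queue */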

static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
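
/*
 * For context, a paraphrased sketch of how rwbf_quirk is consumed by the
 * write-buffer flush path elsewhere in this driver:
 *
 *	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 *		return;
 *	-- otherwise set the write-buffer-flush bit in the global command
 *	-- register and wait for the flush to complete
 *
 * so setting rwbf_quirk = 1 forces write-buffer flushing even on chipsets
 * that fail to advertise the RWBF capability.
 */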

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/*
 * On Tylersburg chipsets, some BIOSes have been known to enable the
 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 * TLB entries, which causes it to deadlock. Check for that. We do
 * this in a function called from init_dmars(), instead of in a PCI
 * quirk, because we don't want to print the obnoxious "BIOS broken"
 * message if VT-d is actually disabled.
 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/*
	 * System Management Registers. Might be hidden, in which case
	 * we can't do the sanity check. But that's OK, because the
	 * known-broken BIOSes _don't_ actually hide it, so far.
	 */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}
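
/*
 * The IDENTMAP_AZALIA flag set above is consulted when the default domain
 * type for a device is chosen elsewhere in this driver, paraphrased roughly
 * as
 *
 *	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
 *		return IOMMU_DOMAIN_IDENTITY;
 *
 * so the affected Azalia codec ends up with an identity (pass-through)
 * domain instead of a DMA remapping domain.
 */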