1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
52 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
58 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
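/*
 * Illustrative note: with the default 57-bit address width,
 * __DOMAIN_MAX_PFN(57) is 2^45 - 1; on 32-bit builds DOMAIN_MAX_PFN()
 * clamps that to ULONG_MAX so PFNs still fit in an unsigned long.
 */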
60
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN (1)
63
64 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65
66 /* page table handling */
67 #define LEVEL_STRIDE (9)
68 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69
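/*
 * Worked example for the helpers below: AGAW values 1, 2 and 3 select
 * 3-, 4- and 5-level page tables covering 39, 48 and 57 bits of IOVA
 * respectively (width = 30 + 9 * agaw).
 */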
70 static inline int agaw_to_level(int agaw)
71 {
72 return agaw + 2;
73 }
74
75 static inline int agaw_to_width(int agaw)
76 {
77 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97 return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102 return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116 are never going to work. */
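/*
 * Illustration: with 4KiB kernel pages PAGE_SHIFT == VTD_PAGE_SHIFT, so
 * these conversions are the identity; with larger kernel pages a single
 * MM pfn expands to a range of VT-d pfns.
 */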
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
126 {
127 return mm_to_dma_pfn_start(page_to_pfn(pg));
128 }
129 static inline unsigned long virt_to_dma_pfn(void *p)
130 {
131 return page_to_dma_pfn(virt_to_page(p));
132 }
133
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136
137 /*
138 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
139 * (used when the kernel is launched with TXT).
140 */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
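/*
 * With 4KiB pages and 16-byte root entries, ROOT_ENTRY_NR below is 256,
 * one root entry per PCI bus number.
 */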
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149 * if marked present.
150 */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 if (!(re->lo & 1))
154 return 0;
155
156 return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161 * if marked present.
162 */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 if (!(re->hi & 1))
166 return 0;
167
168 return re->hi & VTD_PAGE_MASK;
169 }
170
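/*
 * The helpers below poke the legacy-mode context entry fields: in the low
 * 64 bits, bit 0 is Present, bit 1 is Fault Processing Disable, bits 3:2
 * are the Translation Type and bits 63:12 hold the page-table root; in the
 * high 64 bits, bits 2:0 are the Address Width and bits 23:8 the Domain ID.
 */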
171 static inline void context_set_present(struct context_entry *context)
172 {
173 context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178 context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182 unsigned long value)
183 {
184 context->lo &= (((u64)-1) << 4) | 3;
185 context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189 unsigned long value)
190 {
191 context->lo &= ~VTD_PAGE_MASK;
192 context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196 unsigned long value)
197 {
198 context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202 unsigned long value)
203 {
204 context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209 context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214 return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219 context->lo = 0;
220 context->hi = 0;
221 }
222
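/*
 * copied_tables is a per-IOMMU bitmap indexed by the 16-bit source-id
 * ((bus << 8) | devfn). A set bit marks a context entry inherited from a
 * previous kernel (e.g. when translation was already enabled at boot, as
 * in a kdump kernel), which must not be handed out again until it is
 * explicitly re-allocated.
 */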
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 if (!iommu->copied_tables)
226 return false;
227
228 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242
243 /*
244 * This domain is a static identity mapping domain.
245 * 1. This domain creates a static 1:1 mapping to all usable memory.
246 * 2. It maps to each iommu if successful.
247 * 3. Each iommu maps to this domain if successful.
248 */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251
252 struct dmar_rmrr_unit {
253 struct list_head list; /* list of rmrr units */
254 struct acpi_dmar_header *hdr; /* ACPI header */
255 u64 base_address; /* reserved base address*/
256 u64 end_address; /* reserved end address */
257 struct dmar_dev_scope *devices; /* target devices */
258 int devices_cnt; /* target device count */
259 };
260
261 struct dmar_atsr_unit {
262 struct list_head list; /* list of ATSR units */
263 struct acpi_dmar_header *hdr; /* ACPI header */
264 struct dmar_dev_scope *devices; /* target devices */
265 int devices_cnt; /* target device count */
266 u8 include_all:1; /* include all ports */
267 };
268
269 struct dmar_satc_unit {
270 struct list_head list; /* list of SATC units */
271 struct acpi_dmar_header *hdr; /* ACPI header */
272 struct dmar_dev_scope *devices; /* target devices */
273 struct intel_iommu *iommu; /* the corresponding iommu */
274 int devices_cnt; /* target device count */
275 u8 atc_required:1; /* ATS is required */
276 };
277
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281
282 #define for_each_rmrr_units(rmrr) \
283 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298
299 #define IDENTMAP_GFX 2
300 #define IDENTMAP_AZALIA 4
301
302 const struct iommu_ops intel_iommu_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316 u32 gsts;
317
318 gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 if (gsts & DMA_GSTS_TES)
320 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325 if (!str)
326 return -EINVAL;
327
328 while (*str) {
329 if (!strncmp(str, "on", 2)) {
330 dmar_disabled = 0;
331 pr_info("IOMMU enabled\n");
332 } else if (!strncmp(str, "off", 3)) {
333 dmar_disabled = 1;
334 no_platform_optin = 1;
335 pr_info("IOMMU disabled\n");
336 } else if (!strncmp(str, "igfx_off", 8)) {
337 dmar_map_gfx = 0;
338 pr_info("Disable GFX device mapping\n");
339 } else if (!strncmp(str, "forcedac", 8)) {
340 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 iommu_dma_forcedac = true;
342 } else if (!strncmp(str, "strict", 6)) {
343 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 iommu_set_dma_strict();
345 } else if (!strncmp(str, "sp_off", 6)) {
346 pr_info("Disable supported super page\n");
347 intel_iommu_superpage = 0;
348 } else if (!strncmp(str, "sm_on", 5)) {
349 pr_info("Enable scalable mode if hardware supports\n");
350 intel_iommu_sm = 1;
351 } else if (!strncmp(str, "sm_off", 6)) {
352 pr_info("Scalable mode is disallowed\n");
353 intel_iommu_sm = 0;
354 } else if (!strncmp(str, "tboot_noforce", 13)) {
355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 intel_iommu_tboot_noforce = 1;
357 } else {
358 pr_notice("Unknown option - '%s'\n", str);
359 }
360
361 str += strcspn(str, ",");
362 while (*str == ',')
363 str++;
364 }
365
366 return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
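/*
 * Usage example (illustrative): boot with "intel_iommu=on,sm_on" to force
 * the IOMMU on and enable scalable mode, or with "intel_iommu=off" to
 * disable it. Options are comma-separated and parsed by the loop above.
 */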
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 struct page *page;
373 void *vaddr = NULL;
374
375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 if (page)
377 vaddr = page_address(page);
378 return vaddr;
379 }
380
381 void free_pgtable_page(void *vaddr)
382 {
383 free_page((unsigned long)vaddr);
384 }
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 unsigned long pfn)
393 {
394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402 * the returned SAGAW.
403 */
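/*
 * Note: in the SAGAW encoding used below, BIT(2) means 4-level (48-bit)
 * and BIT(3) means 5-level (57-bit) paging, matching the AGAW helpers above.
 */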
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406 unsigned long fl_sagaw, sl_sagaw;
407
408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 sl_sagaw = cap_sagaw(iommu->cap);
410
411 /* Second level only. */
412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 return sl_sagaw;
414
415 /* First level only. */
416 if (!ecap_slts(iommu->ecap))
417 return fl_sagaw;
418
419 return fl_sagaw & sl_sagaw;
420 }
421
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423 {
424 unsigned long sagaw;
425 int agaw;
426
427 sagaw = __iommu_calculate_sagaw(iommu);
428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429 if (test_bit(agaw, &sagaw))
430 break;
431 }
432
433 return agaw;
434 }
435
436 /*
437 * Calculate max SAGAW for each iommu.
438 */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445 * Calculate the agaw for each iommu.
446 * "SAGAW" may differ across iommus, so use a default agaw and fall back
447 * to a smaller supported agaw for iommus that don't support the default.
448 */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 return sm_supported(iommu) ?
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462 struct iommu_domain_info *info;
463 struct dmar_drhd_unit *drhd;
464 struct intel_iommu *iommu;
465 bool found = false;
466 unsigned long i;
467
468 domain->iommu_coherency = true;
469 xa_for_each(&domain->iommu_array, i, info) {
470 found = true;
471 if (!iommu_paging_structure_coherency(info->iommu)) {
472 domain->iommu_coherency = false;
473 break;
474 }
475 }
476 if (found)
477 return;
478
479 /* No hardware attached; use lowest common denominator */
480 rcu_read_lock();
481 for_each_active_iommu(iommu, drhd) {
482 if (!iommu_paging_structure_coherency(iommu)) {
483 domain->iommu_coherency = false;
484 break;
485 }
486 }
487 rcu_read_unlock();
488 }
489
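/*
 * Returns the common super-page level: 0 means no super pages, 1 means
 * 2MiB pages, 2 means 2MiB and 1GiB pages (see domain_super_pgsize_bitmap()).
 * The mask below starts with both bits set and is narrowed per IOMMU.
 */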
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 struct intel_iommu *skip)
492 {
493 struct dmar_drhd_unit *drhd;
494 struct intel_iommu *iommu;
495 int mask = 0x3;
496
497 if (!intel_iommu_superpage)
498 return 0;
499
500 /* set iommu_superpage to the smallest common denominator */
501 rcu_read_lock();
502 for_each_active_iommu(iommu, drhd) {
503 if (iommu != skip) {
504 if (domain && domain->use_first_level) {
505 if (!cap_fl1gp_support(iommu->cap))
506 mask = 0x1;
507 } else {
508 mask &= cap_super_page_val(iommu->cap);
509 }
510
511 if (!mask)
512 break;
513 }
514 }
515 rcu_read_unlock();
516
517 return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522 struct device_domain_info *info;
523 int nid = NUMA_NO_NODE;
524 unsigned long flags;
525
526 spin_lock_irqsave(&domain->lock, flags);
527 list_for_each_entry(info, &domain->devices, link) {
528 /*
529 * There could be multiple device NUMA nodes, as devices within
530 * the same domain may sit behind different IOMMUs. There is no
531 * perfect answer in such a situation, so pick the first device
532 * with a valid node (first come, first served).
533 */
534 nid = dev_to_node(info->dev);
535 if (nid != NUMA_NO_NODE)
536 break;
537 }
538 spin_unlock_irqrestore(&domain->lock, flags);
539
540 return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 unsigned long bitmap = 0;
549
550 /*
551 * 1-level super page supports page size of 2MiB, 2-level super page
552 * supports page size of both 2MiB and 1GiB.
553 */
554 if (domain->iommu_superpage == 1)
555 bitmap |= SZ_2M;
556 else if (domain->iommu_superpage == 2)
557 bitmap |= SZ_2M | SZ_1G;
558
559 return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565 domain_update_iommu_coherency(domain);
566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568 /*
569 * If RHSA is missing, we should default to the device numa domain
570 * as fall back.
571 */
572 if (domain->nid == NUMA_NO_NODE)
573 domain->nid = domain_update_device_node(domain);
574
575 /*
576 * First-level translation restricts the input-address to a
577 * canonical address (i.e., address bits 63:N have the same
578 * value as address bit [N-1], where N is 48-bits with 4-level
579 * paging and 57-bits with 5-level paging). Hence, skip bit
580 * [N-1].
581 */
582 if (domain->use_first_level)
583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 else
585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 domain_update_iotlb(domain);
589 }
590
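/*
 * In scalable mode each half of a root entry (lo for devfn 0x00-0x7f, hi
 * for 0x80-0xff) points at a context table whose entries are twice the
 * legacy size, hence the devfn adjustment and doubling below.
 */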
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 u8 devfn, int alloc)
593 {
594 struct root_entry *root = &iommu->root_entry[bus];
595 struct context_entry *context;
596 u64 *entry;
597
598 /*
599 * Unless the caller requested to allocate a new entry,
600 * returning a copied context entry makes no sense.
601 */
602 if (!alloc && context_copied(iommu, bus, devfn))
603 return NULL;
604
605 entry = &root->lo;
606 if (sm_supported(iommu)) {
607 if (devfn >= 0x80) {
608 devfn -= 0x80;
609 entry = &root->hi;
610 }
611 devfn *= 2;
612 }
613 if (*entry & 1)
614 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 else {
616 unsigned long phy_addr;
617 if (!alloc)
618 return NULL;
619
620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621 if (!context)
622 return NULL;
623
624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 phy_addr = virt_to_phys((void *)context);
626 *entry = phy_addr | 1;
627 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628 }
629 return &context[devfn];
630 }
631
632 /**
633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634 * sub-hierarchy of a candidate PCI-PCI bridge
635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636 * @bridge: the candidate PCI-PCI bridge
637 *
638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639 */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 struct pci_dev *pdev, *pbridge;
644
645 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 return false;
647
648 pdev = to_pci_dev(dev);
649 pbridge = to_pci_dev(bridge);
650
651 if (pbridge->subordinate &&
652 pbridge->subordinate->number <= pdev->bus->number &&
653 pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 return true;
655
656 return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661 struct dmar_drhd_unit *drhd;
662 u32 vtbar;
663 int rc;
664
665 /* We know that this device on this chipset has its own IOMMU.
666 * If we find it under a different IOMMU, then the BIOS is lying
667 * to us. Hope that the IOMMU for this device is actually
668 * disabled, and it needs no translation...
669 */
670 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 if (rc) {
672 /* "can't" happen */
673 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 return false;
675 }
676 vtbar &= 0xffff0000;
677
678 /* we know that this iommu should be at offset 0xa000 from vtbar */
679 drhd = dmar_find_matched_drhd_unit(pdev);
680 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 return true;
684 }
685
686 return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 if (!iommu || iommu->drhd->ignored)
692 return true;
693
694 if (dev_is_pci(dev)) {
695 struct pci_dev *pdev = to_pci_dev(dev);
696
697 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 quirk_ioat_snb_local_iommu(pdev))
700 return true;
701 }
702
703 return false;
704 }
705
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708 struct dmar_drhd_unit *drhd = NULL;
709 struct pci_dev *pdev = NULL;
710 struct intel_iommu *iommu;
711 struct device *tmp;
712 u16 segment = 0;
713 int i;
714
715 if (!dev)
716 return NULL;
717
718 if (dev_is_pci(dev)) {
719 struct pci_dev *pf_pdev;
720
721 pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723 /* VFs aren't listed in scope tables; we need to look up
724 * the PF instead to find the IOMMU. */
725 pf_pdev = pci_physfn(pdev);
726 dev = &pf_pdev->dev;
727 segment = pci_domain_nr(pdev->bus);
728 } else if (has_acpi_companion(dev))
729 dev = &ACPI_COMPANION(dev)->dev;
730
731 rcu_read_lock();
732 for_each_iommu(iommu, drhd) {
733 if (pdev && segment != drhd->segment)
734 continue;
735
736 for_each_active_dev_scope(drhd->devices,
737 drhd->devices_cnt, i, tmp) {
738 if (tmp == dev) {
739 /* For a VF use its original BDF# not that of the PF
740 * which we used for the IOMMU lookup. Strictly speaking
741 * we could do this for all PCI devices; we only need to
742 * get the BDF# from the scope table for ACPI matches. */
743 if (pdev && pdev->is_virtfn)
744 goto got_pdev;
745
746 if (bus && devfn) {
747 *bus = drhd->devices[i].bus;
748 *devfn = drhd->devices[i].devfn;
749 }
750 goto out;
751 }
752
753 if (is_downstream_to_pci_bridge(dev, tmp))
754 goto got_pdev;
755 }
756
757 if (pdev && drhd->include_all) {
758 got_pdev:
759 if (bus && devfn) {
760 *bus = pdev->bus->number;
761 *devfn = pdev->devfn;
762 }
763 goto out;
764 }
765 }
766 iommu = NULL;
767 out:
768 if (iommu_is_dummy(iommu, dev))
769 iommu = NULL;
770
771 rcu_read_unlock();
772
773 return iommu;
774 }
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777 void *addr, int size)
778 {
779 if (!domain->iommu_coherency)
780 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785 struct context_entry *context;
786 int i;
787
788 if (!iommu->root_entry)
789 return;
790
791 for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 context = iommu_context_addr(iommu, i, 0, 0);
793 if (context)
794 free_pgtable_page(context);
795
796 if (!sm_supported(iommu))
797 continue;
798
799 context = iommu_context_addr(iommu, i, 0x80, 0);
800 if (context)
801 free_pgtable_page(context);
802 }
803
804 free_pgtable_page(iommu->root_entry);
805 iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812 struct dma_pte *pte;
813 int offset;
814
815 while (1) {
816 offset = pfn_level_offset(pfn, level);
817 pte = &parent[offset];
818 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819 pr_info("PTE not present at level %d\n", level);
820 break;
821 }
822
823 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825 if (level == 1)
826 break;
827
828 parent = phys_to_virt(dma_pte_addr(pte));
829 level--;
830 }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 unsigned long long addr, u32 pasid)
835 {
836 struct pasid_dir_entry *dir, *pde;
837 struct pasid_entry *entries, *pte;
838 struct context_entry *ctx_entry;
839 struct root_entry *rt_entry;
840 int i, dir_index, index, level;
841 u8 devfn = source_id & 0xff;
842 u8 bus = source_id >> 8;
843 struct dma_pte *pgtable;
844
845 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847 /* root entry dump */
848 rt_entry = &iommu->root_entry[bus];
849 if (!rt_entry) {
850 pr_info("root table entry is not present\n");
851 return;
852 }
853
854 if (sm_supported(iommu))
855 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 rt_entry->hi, rt_entry->lo);
857 else
858 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860 /* context entry dump */
861 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862 if (!ctx_entry) {
863 pr_info("context table entry is not present\n");
864 return;
865 }
866
867 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 ctx_entry->hi, ctx_entry->lo);
869
870 /* legacy mode does not require PASID entries */
871 if (!sm_supported(iommu)) {
872 level = agaw_to_level(ctx_entry->hi & 7);
873 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 goto pgtable_walk;
875 }
876
877 /* get the pointer to pasid directory entry */
878 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 if (!dir) {
880 pr_info("pasid directory entry is not present\n");
881 return;
882 }
883 /* For request-without-pasid, get the pasid from context entry */
884 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885 pasid = IOMMU_NO_PASID;
886
887 dir_index = pasid >> PASID_PDE_SHIFT;
888 pde = &dir[dir_index];
889 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891 /* get the pointer to the pasid table entry */
892 entries = get_pasid_table_from_pde(pde);
893 if (!entries) {
894 pr_info("pasid table entry is not present\n");
895 return;
896 }
897 index = pasid & PASID_PTE_MASK;
898 pte = &entries[index];
899 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905 } else {
906 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908 }
909
910 pgtable_walk:
911 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
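/*
 * Walk the page table to the PTE covering @pfn at *target_level, allocating
 * missing intermediate tables when a positive level is requested. If
 * *target_level is 0, stop at the first non-present or super-page entry
 * and report the level reached back through *target_level.
 */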
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916 unsigned long pfn, int *target_level,
917 gfp_t gfp)
918 {
919 struct dma_pte *parent, *pte;
920 int level = agaw_to_level(domain->agaw);
921 int offset;
922
923 if (!domain_pfn_supported(domain, pfn))
924 /* Address beyond IOMMU's addressing capabilities. */
925 return NULL;
926
927 parent = domain->pgd;
928
929 while (1) {
930 void *tmp_page;
931
932 offset = pfn_level_offset(pfn, level);
933 pte = &parent[offset];
934 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935 break;
936 if (level == *target_level)
937 break;
938
939 if (!dma_pte_present(pte)) {
940 uint64_t pteval;
941
942 tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944 if (!tmp_page)
945 return NULL;
946
947 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949 if (domain->use_first_level)
950 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952 if (cmpxchg64(&pte->val, 0ULL, pteval))
953 /* Someone else set it while we were thinking; use theirs. */
954 free_pgtable_page(tmp_page);
955 else
956 domain_flush_cache(domain, pte, sizeof(*pte));
957 }
958 if (level == 1)
959 break;
960
961 parent = phys_to_virt(dma_pte_addr(pte));
962 level--;
963 }
964
965 if (!*target_level)
966 *target_level = level;
967
968 return pte;
969 }
970
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973 unsigned long pfn,
974 int level, int *large_page)
975 {
976 struct dma_pte *parent, *pte;
977 int total = agaw_to_level(domain->agaw);
978 int offset;
979
980 parent = domain->pgd;
981 while (level <= total) {
982 offset = pfn_level_offset(pfn, total);
983 pte = &parent[offset];
984 if (level == total)
985 return pte;
986
987 if (!dma_pte_present(pte)) {
988 *large_page = total;
989 break;
990 }
991
992 if (dma_pte_superpage(pte)) {
993 *large_page = total;
994 return pte;
995 }
996
997 parent = phys_to_virt(dma_pte_addr(pte));
998 total--;
999 }
1000 return NULL;
1001 }
1002
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005 unsigned long start_pfn,
1006 unsigned long last_pfn)
1007 {
1008 unsigned int large_page;
1009 struct dma_pte *first_pte, *pte;
1010
1011 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012 WARN_ON(start_pfn > last_pfn))
1013 return;
1014
1015 /* we don't need lock here; nobody else touches the iova range */
1016 do {
1017 large_page = 1;
1018 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019 if (!pte) {
1020 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021 continue;
1022 }
1023 do {
1024 dma_clear_pte(pte);
1025 start_pfn += lvl_to_nr_pages(large_page);
1026 pte++;
1027 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029 domain_flush_cache(domain, first_pte,
1030 (void *)pte - (void *)first_pte);
1031
1032 } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036 int retain_level, struct dma_pte *pte,
1037 unsigned long pfn, unsigned long start_pfn,
1038 unsigned long last_pfn)
1039 {
1040 pfn = max(start_pfn, pfn);
1041 pte = &pte[pfn_level_offset(pfn, level)];
1042
1043 do {
1044 unsigned long level_pfn;
1045 struct dma_pte *level_pte;
1046
1047 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048 goto next;
1049
1050 level_pfn = pfn & level_mask(level);
1051 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053 if (level > 2) {
1054 dma_pte_free_level(domain, level - 1, retain_level,
1055 level_pte, level_pfn, start_pfn,
1056 last_pfn);
1057 }
1058
1059 /*
1060 * Free the page table if we're below the level we want to
1061 * retain and the range covers the entire table.
1062 */
1063 if (level < retain_level && !(start_pfn > level_pfn ||
1064 last_pfn < level_pfn + level_size(level) - 1)) {
1065 dma_clear_pte(pte);
1066 domain_flush_cache(domain, pte, sizeof(*pte));
1067 free_pgtable_page(level_pte);
1068 }
1069 next:
1070 pfn += level_size(level);
1071 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075 * clear last level (leaf) ptes and free page table pages below the
1076 * level we wish to keep intact.
1077 */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079 unsigned long start_pfn,
1080 unsigned long last_pfn,
1081 int retain_level)
1082 {
1083 dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085 /* We don't need lock here; nobody else touches the iova range */
1086 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 domain->pgd, 0, start_pfn, last_pfn);
1088
1089 /* free pgd */
1090 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 free_pgtable_page(domain->pgd);
1092 domain->pgd = NULL;
1093 }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097 need to *modify* it at all. All we need to do is make a list of all the
1098 pages which can be freed just as soon as we've flushed the IOTLB and we
1099 know the hardware page-walk will no longer touch them.
1100 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101 be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103 int level, struct dma_pte *pte,
1104 struct list_head *freelist)
1105 {
1106 struct page *pg;
1107
1108 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109 list_add_tail(&pg->lru, freelist);
1110
1111 if (level == 1)
1112 return;
1113
1114 pte = page_address(pg);
1115 do {
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118 pte++;
1119 } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123 struct dma_pte *pte, unsigned long pfn,
1124 unsigned long start_pfn, unsigned long last_pfn,
1125 struct list_head *freelist)
1126 {
1127 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129 pfn = max(start_pfn, pfn);
1130 pte = &pte[pfn_level_offset(pfn, level)];
1131
1132 do {
1133 unsigned long level_pfn = pfn & level_mask(level);
1134
1135 if (!dma_pte_present(pte))
1136 goto next;
1137
1138 /* If range covers entire pagetable, free it */
1139 if (start_pfn <= level_pfn &&
1140 last_pfn >= level_pfn + level_size(level) - 1) {
1141 /* These subordinate page tables are going away entirely. Don't
1142 bother to clear them; we're just going to *free* them. */
1143 if (level > 1 && !dma_pte_superpage(pte))
1144 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146 dma_clear_pte(pte);
1147 if (!first_pte)
1148 first_pte = pte;
1149 last_pte = pte;
1150 } else if (level > 1) {
1151 /* Recurse down into a level that isn't *entirely* obsolete */
1152 dma_pte_clear_level(domain, level - 1,
1153 phys_to_virt(dma_pte_addr(pte)),
1154 level_pfn, start_pfn, last_pfn,
1155 freelist);
1156 }
1157 next:
1158 pfn = level_pfn + level_size(level);
1159 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161 if (first_pte)
1162 domain_flush_cache(domain, first_pte,
1163 (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167 the page tables, and may have cached the intermediate levels. The
1168 pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170 unsigned long last_pfn, struct list_head *freelist)
1171 {
1172 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173 WARN_ON(start_pfn > last_pfn))
1174 return;
1175
1176 /* we don't need lock here; nobody else touches the iova range */
1177 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180 /* free pgd */
1181 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 struct page *pgd_page = virt_to_page(domain->pgd);
1183 list_add_tail(&pgd_page->lru, freelist);
1184 domain->pgd = NULL;
1185 }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191 struct root_entry *root;
1192
1193 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194 if (!root) {
1195 pr_err("Allocating root entry for %s failed\n",
1196 iommu->name);
1197 return -ENOMEM;
1198 }
1199
1200 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201 iommu->root_entry = root;
1202
1203 return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208 u64 addr;
1209 u32 sts;
1210 unsigned long flag;
1211
1212 addr = virt_to_phys(iommu->root_entry);
1213 if (sm_supported(iommu))
1214 addr |= DMA_RTADDR_SMT;
1215
1216 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221 /* Make sure hardware complete it */
1222 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223 readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227 /*
1228 * Hardware invalidates all DMA remapping hardware translation
1229 * caches as part of SRTP flow.
1230 */
1231 if (cap_esrtps(iommu->cap))
1232 return;
1233
1234 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235 if (sm_supported(iommu))
1236 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242 u32 val;
1243 unsigned long flag;
1244
1245 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246 return;
1247
1248 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251 /* Make sure hardware complete it */
1252 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253 readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determines if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260 u16 did, u16 source_id, u8 function_mask,
1261 u64 type)
1262 {
1263 u64 val = 0;
1264 unsigned long flag;
1265
1266 switch (type) {
1267 case DMA_CCMD_GLOBAL_INVL:
1268 val = DMA_CCMD_GLOBAL_INVL;
1269 break;
1270 case DMA_CCMD_DOMAIN_INVL:
1271 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272 break;
1273 case DMA_CCMD_DEVICE_INVL:
1274 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276 break;
1277 default:
1278 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279 iommu->name, type);
1280 return;
1281 }
1282 val |= DMA_CCMD_ICC;
1283
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287 /* Make sure hardware complete it */
1288 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determines if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 u64 addr, unsigned int size_order, u64 type)
1297 {
1298 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 u64 val = 0, val_iva = 0;
1300 unsigned long flag;
1301
1302 switch (type) {
1303 case DMA_TLB_GLOBAL_FLUSH:
1304 /* global flush doesn't need to set IVA_REG */
1305 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306 break;
1307 case DMA_TLB_DSI_FLUSH:
1308 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 break;
1310 case DMA_TLB_PSI_FLUSH:
1311 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 /* IH bit is passed in as part of address */
1313 val_iva = size_order | addr;
1314 break;
1315 default:
1316 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317 iommu->name, type);
1318 return;
1319 }
1320
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330 /* Make sure hardware complete it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349 struct device_domain_info *info;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1357 return info;
1358 }
1359 }
1360 spin_unlock_irqrestore(&domain->lock, flags);
1361
1362 return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 struct dev_pasid_info *dev_pasid;
1368 struct device_domain_info *info;
1369 bool has_iotlb_device = false;
1370 unsigned long flags;
1371
1372 spin_lock_irqsave(&domain->lock, flags);
1373 list_for_each_entry(info, &domain->devices, link) {
1374 if (info->ats_enabled) {
1375 has_iotlb_device = true;
1376 break;
1377 }
1378 }
1379
1380 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 info = dev_iommu_priv_get(dev_pasid->dev);
1382 if (info->ats_enabled) {
1383 has_iotlb_device = true;
1384 break;
1385 }
1386 }
1387 domain->has_iotlb_device = has_iotlb_device;
1388 spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394 * check because it applies only to the built-in QAT devices and it doesn't
1395 * grant additional privileges.
1396 */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401 return false;
1402
1403 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404 return false;
1405
1406 return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411 struct pci_dev *pdev;
1412
1413 if (!dev_is_pci(info->dev))
1414 return;
1415
1416 pdev = to_pci_dev(info->dev);
1417
1418 /* The PCIe spec, in its wisdom, declares that the behaviour of
1419 the device if you enable PASID support after ATS support is
1420 undefined. So always enable PASID support on devices which
1421 have it, even if we can't yet know if we're ever going to
1422 use it. */
1423 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 info->pasid_enabled = 1;
1425
1426 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428 info->ats_enabled = 1;
1429 domain_update_iotlb(info->domain);
1430 }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435 struct pci_dev *pdev;
1436
1437 if (!dev_is_pci(info->dev))
1438 return;
1439
1440 pdev = to_pci_dev(info->dev);
1441
1442 if (info->ats_enabled) {
1443 pci_disable_ats(pdev);
1444 info->ats_enabled = 0;
1445 domain_update_iotlb(info->domain);
1446 }
1447
1448 if (info->pasid_enabled) {
1449 pci_disable_pasid(pdev);
1450 info->pasid_enabled = 0;
1451 }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455 u64 addr, unsigned int mask)
1456 {
1457 u16 sid, qdep;
1458
1459 if (!info || !info->ats_enabled)
1460 return;
1461
1462 sid = info->bus << 8 | info->devfn;
1463 qdep = info->ats_qdep;
1464 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 qdep, addr, mask);
1466 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1471 {
1472 struct dev_pasid_info *dev_pasid;
1473 struct device_domain_info *info;
1474 unsigned long flags;
1475
1476 if (!domain->has_iotlb_device)
1477 return;
1478
1479 spin_lock_irqsave(&domain->lock, flags);
1480 list_for_each_entry(info, &domain->devices, link)
1481 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486 if (!info->ats_enabled)
1487 continue;
1488
1489 qi_flush_dev_iotlb_pasid(info->iommu,
1490 PCI_DEVID(info->bus, info->devfn),
1491 info->pfsid, dev_pasid->pasid,
1492 info->ats_qdep, addr,
1493 mask);
1494 }
1495 spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499 struct dmar_domain *domain, u64 addr,
1500 unsigned long npages, bool ih)
1501 {
1502 u16 did = domain_id_iommu(domain, iommu);
1503 struct dev_pasid_info *dev_pasid;
1504 unsigned long flags;
1505
1506 spin_lock_irqsave(&domain->lock, flags);
1507 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510 if (!list_empty(&domain->devices))
1511 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512 spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
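/*
 * Worked example for the mask computation below: pages = 9 rounds up to
 * aligned_pages = 16, so mask = 4 and a 16-page (64KiB) aligned region is
 * invalidated; if pfn is not 16-page aligned the mask is widened further.
 */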
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516 struct dmar_domain *domain,
1517 unsigned long pfn, unsigned int pages,
1518 int ih, int map)
1519 {
1520 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521 unsigned int mask = ilog2(aligned_pages);
1522 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523 u16 did = domain_id_iommu(domain, iommu);
1524
1525 if (WARN_ON(!pages))
1526 return;
1527
1528 if (ih)
1529 ih = 1 << 6;
1530
1531 if (domain->use_first_level) {
1532 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533 } else {
1534 unsigned long bitmask = aligned_pages - 1;
1535
1536 /*
1537 * PSI masks the low order bits of the base address. If the
1538 * address isn't aligned to the mask, then compute a mask value
1539 * needed to ensure the target range is flushed.
1540 */
1541 if (unlikely(bitmask & pfn)) {
1542 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544 /*
1545 * Since end_pfn <= pfn + bitmask, the only way bits
1546 * higher than bitmask can differ in pfn and end_pfn is
1547 * by carrying. This means after masking out bitmask,
1548 * high bits starting with the first set bit in
1549 * shared_bits are all equal in both pfn and end_pfn.
1550 */
1551 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553 }
1554
1555 /*
1556 * Fallback to domain selective flush if no PSI support or
1557 * the size is too big.
1558 */
1559 if (!cap_pgsel_inv(iommu->cap) ||
1560 mask > cap_max_amask_val(iommu->cap))
1561 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 DMA_TLB_DSI_FLUSH);
1563 else
1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 DMA_TLB_PSI_FLUSH);
1566 }
1567
1568 /*
1569 * In caching mode, changes of pages from non-present to present require
1570 * flush. However, device IOTLB doesn't need to be flushed in this case.
1571 */
1572 if (!cap_caching_mode(iommu->cap) || !map)
1573 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 struct dmar_domain *domain,
1579 unsigned long pfn, unsigned int pages)
1580 {
1581 /*
1582 * It's a non-present to present mapping. Only flush if caching mode
1583 * and second level.
1584 */
1585 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 else
1588 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594 struct iommu_domain_info *info;
1595 unsigned long idx;
1596
1597 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598 struct intel_iommu *iommu = info->iommu;
1599 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601 if (dmar_domain->use_first_level)
1602 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603 else
1604 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605 DMA_TLB_DSI_FLUSH);
1606
1607 if (!cap_caching_mode(iommu->cap))
1608 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609 }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614 u32 pmen;
1615 unsigned long flags;
1616
1617 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618 return;
1619
1620 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622 pmen &= ~DMA_PMEN_EPM;
1623 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625 /* wait for the protected region status bit to clear */
1626 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634 u32 sts;
1635 unsigned long flags;
1636
1637 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 iommu->gcmd |= DMA_GCMD_TE;
1639 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641 /* Make sure hardware complete it */
1642 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643 readl, (sts & DMA_GSTS_TES), sts);
1644
1645 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650 u32 sts;
1651 unsigned long flag;
1652
1653 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655 return;
1656
1657 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658 iommu->gcmd &= ~DMA_GCMD_TE;
1659 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661 /* Make sure hardware complete it */
1662 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663 readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670 u32 ndomains;
1671
1672 ndomains = cap_ndoms(iommu->cap);
1673 pr_debug("%s: Number of Domains supported <%d>\n",
1674 iommu->name, ndomains);
1675
1676 spin_lock_init(&iommu->lock);
1677
1678 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679 if (!iommu->domain_ids)
1680 return -ENOMEM;
1681
1682 /*
1683 * If Caching mode is set, then invalid translations are tagged
1684 * with domain-id 0, hence we need to pre-allocate it. We also
1685 * use domain-id 0 as a marker for non-allocated domain-id, so
1686 * make sure it is not used for a real domain.
1687 */
1688 set_bit(0, iommu->domain_ids);
1689
1690 /*
1691 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692 * entry for first-level or pass-through translation modes should
1693 * be programmed with a domain id different from those used for
1694 * second-level or nested translation. We reserve a domain id for
1695 * this purpose. This domain id is also used for identity domain
1696 * in legacy mode.
1697 */
1698 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700 return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705 if (!iommu->domain_ids)
1706 return;
1707
1708 /*
1709 * All iommu domains must have been detached from the devices,
1710 * hence there should be no domain IDs in use.
1711 */
1712 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713 > NUM_RESERVED_DID))
1714 return;
1715
1716 if (iommu->gcmd & DMA_GCMD_TE)
1717 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722 if (iommu->domain_ids) {
1723 bitmap_free(iommu->domain_ids);
1724 iommu->domain_ids = NULL;
1725 }
1726
1727 if (iommu->copied_tables) {
1728 bitmap_free(iommu->copied_tables);
1729 iommu->copied_tables = NULL;
1730 }
1731
1732 /* free context mapping */
1733 free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736 if (pasid_supported(iommu)) {
1737 if (ecap_prs(iommu->ecap))
1738 intel_svm_finish_prq(iommu);
1739 }
1740 #endif
1741 }
1742
1743 /*
1744 * Check and return whether first level is used by default for
1745 * DMA translation.
1746 */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749 /* Only SL is available in legacy mode */
1750 if (!scalable_mode_support())
1751 return false;
1752
1753 /* Only one level (either FL or SL) is available, just use it */
1754 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 return intel_cap_flts_sanity();
1756
1757 /* Both levels are available, decide it based on domain type */
1758 return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763 struct dmar_domain *domain;
1764
1765 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766 if (!domain)
1767 return NULL;
1768
1769 domain->nid = NUMA_NO_NODE;
1770 if (first_level_by_default(type))
1771 domain->use_first_level = true;
1772 domain->has_iotlb_device = false;
1773 INIT_LIST_HEAD(&domain->devices);
1774 INIT_LIST_HEAD(&domain->dev_pasids);
1775 spin_lock_init(&domain->lock);
1776 xa_init(&domain->iommu_array);
1777
1778 return domain;
1779 }
1780
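/*
 * Bookkeeping note for domain_attach_iommu() below: per-IOMMU attachment is
 * tracked in domain->iommu_array, keyed by iommu->seq_id. The first attach
 * allocates a free domain id from iommu->domain_ids and stores it with a
 * refcount of 1; later attaches of the same domain to the same IOMMU only
 * bump the refcount.
 */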
1781 static int domain_attach_iommu(struct dmar_domain *domain,
1782 struct intel_iommu *iommu)
1783 {
1784 struct iommu_domain_info *info, *curr;
1785 unsigned long ndomains;
1786 int num, ret = -ENOSPC;
1787
1788 info = kzalloc(sizeof(*info), GFP_KERNEL);
1789 if (!info)
1790 return -ENOMEM;
1791
1792 spin_lock(&iommu->lock);
1793 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794 if (curr) {
1795 curr->refcnt++;
1796 spin_unlock(&iommu->lock);
1797 kfree(info);
1798 return 0;
1799 }
1800
1801 ndomains = cap_ndoms(iommu->cap);
1802 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803 if (num >= ndomains) {
1804 pr_err("%s: No free domain ids\n", iommu->name);
1805 goto err_unlock;
1806 }
1807
1808 set_bit(num, iommu->domain_ids);
1809 info->refcnt = 1;
1810 info->did = num;
1811 info->iommu = iommu;
1812 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813 NULL, info, GFP_ATOMIC);
1814 if (curr) {
1815 ret = xa_err(curr) ? : -EBUSY;
1816 goto err_clear;
1817 }
1818 domain_update_iommu_cap(domain);
1819
1820 spin_unlock(&iommu->lock);
1821 return 0;
1822
1823 err_clear:
1824 clear_bit(info->did, iommu->domain_ids);
1825 err_unlock:
1826 spin_unlock(&iommu->lock);
1827 kfree(info);
1828 return ret;
1829 }
1830
1831 static void domain_detach_iommu(struct dmar_domain *domain,
1832 struct intel_iommu *iommu)
1833 {
1834 struct iommu_domain_info *info;
1835
1836 spin_lock(&iommu->lock);
1837 info = xa_load(&domain->iommu_array, iommu->seq_id);
1838 if (--info->refcnt == 0) {
1839 clear_bit(info->did, iommu->domain_ids);
1840 xa_erase(&domain->iommu_array, iommu->seq_id);
1841 domain->nid = NUMA_NO_NODE;
1842 domain_update_iommu_cap(domain);
1843 kfree(info);
1844 }
1845 spin_unlock(&iommu->lock);
1846 }
1847
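/*
 * Round a guest address width up to an adjusted width that the page tables
 * can express: 12 bits of page offset plus a multiple of 9-bit table levels.
 * For example, gaw = 48 stays 48 (r == 0), while gaw = 36 is rounded up to
 * 39 (r == 6, so 36 + 9 - 6). The result is capped at 64 bits.
 */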
1848 static inline int guestwidth_to_adjustwidth(int gaw)
1849 {
1850 int agaw;
1851 int r = (gaw - 12) % 9;
1852
1853 if (r == 0)
1854 agaw = gaw;
1855 else
1856 agaw = gaw + 9 - r;
1857 if (agaw > 64)
1858 agaw = 64;
1859 return agaw;
1860 }
1861
1862 static void domain_exit(struct dmar_domain *domain)
1863 {
1864 if (domain->pgd) {
1865 LIST_HEAD(freelist);
1866
1867 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868 put_pages_list(&freelist);
1869 }
1870
1871 if (WARN_ON(!list_empty(&domain->devices)))
1872 return;
1873
1874 kfree(domain);
1875 }
1876
1877 /*
1878 * Get the PASID directory size for scalable mode context entry.
1879 * Value of X in the PDTS field of a scalable mode context entry
1880 * indicates PASID directory with 2^(X + 7) entries.
1881 */
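/*
 * For illustration (assuming PASID_PDE_SHIFT is 6 and max_pasid is 2^20):
 * max_pde = 2^14, find_first_bit() returns 14 and pds = 7, i.e. a PDTS
 * value of 7, which encodes a PASID directory with 2^(7 + 7) = 2^14 entries.
 */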
1882 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883 {
1884 unsigned long pds, max_pde;
1885
1886 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888 if (pds < 7)
1889 return 0;
1890
1891 return pds - 7;
1892 }
1893
1894 /*
1895 * Set the RID_PASID field of a scalable mode context entry. The
1896 * IOMMU hardware will use the PASID value set in this field for
1897 * DMA translations of DMA requests without PASID.
1898 */
1899 static inline void
1900 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901 {
1902 context->hi |= pasid & ((1 << 20) - 1);
1903 }
1904
1905 /*
1906 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907 * entry.
1908 */
1909 static inline void context_set_sm_dte(struct context_entry *context)
1910 {
1911 context->lo |= BIT_ULL(2);
1912 }
1913
1914 /*
1915 * Set the PRE(Page Request Enable) field of a scalable mode context
1916 * entry.
1917 */
1918 static inline void context_set_sm_pre(struct context_entry *context)
1919 {
1920 context->lo |= BIT_ULL(4);
1921 }
1922
1923 /* Convert value to context PASID directory size field coding. */
1924 #define context_pdts(pds) (((pds) & 0x7) << 9)
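/* i.e. PDTS occupies bits 11:9 of the low 64 bits of the context entry. */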
1925
1926 static int domain_context_mapping_one(struct dmar_domain *domain,
1927 struct intel_iommu *iommu,
1928 struct pasid_table *table,
1929 u8 bus, u8 devfn)
1930 {
1931 struct device_domain_info *info =
1932 domain_lookup_dev_info(domain, iommu, bus, devfn);
1933 u16 did = domain_id_iommu(domain, iommu);
1934 int translation = CONTEXT_TT_MULTI_LEVEL;
1935 struct context_entry *context;
1936 int ret;
1937
1938 if (hw_pass_through && domain_type_is_si(domain))
1939 translation = CONTEXT_TT_PASS_THROUGH;
1940
1941 pr_debug("Set context mapping for %02x:%02x.%d\n",
1942 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1943
1944 spin_lock(&iommu->lock);
1945 ret = -ENOMEM;
1946 context = iommu_context_addr(iommu, bus, devfn, 1);
1947 if (!context)
1948 goto out_unlock;
1949
1950 ret = 0;
1951 if (context_present(context) && !context_copied(iommu, bus, devfn))
1952 goto out_unlock;
1953
1954 /*
1955 * For kdump cases, old valid entries may be cached due to the
1956 * in-flight DMA and copied pgtable, but there is no unmapping
1957 * behaviour for them, thus we need an explicit cache flush for
1958 * the newly-mapped device. For kdump, at this point, the device
1959 * is supposed to finish reset at its driver probe stage, so no
1960 * in-flight DMA will exist, and we don't need to worry anymore
1961 * hereafter.
1962 */
1963 if (context_copied(iommu, bus, devfn)) {
1964 u16 did_old = context_domain_id(context);
1965
1966 if (did_old < cap_ndoms(iommu->cap)) {
1967 iommu->flush.flush_context(iommu, did_old,
1968 (((u16)bus) << 8) | devfn,
1969 DMA_CCMD_MASK_NOBIT,
1970 DMA_CCMD_DEVICE_INVL);
1971 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972 DMA_TLB_DSI_FLUSH);
1973 }
1974
1975 clear_context_copied(iommu, bus, devfn);
1976 }
1977
1978 context_clear_entry(context);
1979
1980 if (sm_supported(iommu)) {
1981 unsigned long pds;
1982
1983 /* Setup the PASID DIR pointer: */
1984 pds = context_get_sm_pds(table);
1985 context->lo = (u64)virt_to_phys(table->table) |
1986 context_pdts(pds);
1987
1988 /* Setup the RID_PASID field: */
1989 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990
1991 /*
1992 * Setup the Device-TLB enable bit and Page request
1993 * Enable bit:
1994 */
1995 if (info && info->ats_supported)
1996 context_set_sm_dte(context);
1997 if (info && info->pri_supported)
1998 context_set_sm_pre(context);
1999 if (info && info->pasid_supported)
2000 context_set_pasid(context);
2001 } else {
2002 struct dma_pte *pgd = domain->pgd;
2003 int agaw;
2004
2005 context_set_domain_id(context, did);
2006
2007 if (translation != CONTEXT_TT_PASS_THROUGH) {
2008 /*
2009 * Skip top levels of page tables for iommu which has
2010 * less agaw than default. Unnecessary for PT mode.
2011 */
2012 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013 ret = -ENOMEM;
2014 pgd = phys_to_virt(dma_pte_addr(pgd));
2015 if (!dma_pte_present(pgd))
2016 goto out_unlock;
2017 }
2018
2019 if (info && info->ats_supported)
2020 translation = CONTEXT_TT_DEV_IOTLB;
2021 else
2022 translation = CONTEXT_TT_MULTI_LEVEL;
2023
2024 context_set_address_root(context, virt_to_phys(pgd));
2025 context_set_address_width(context, agaw);
2026 } else {
2027 /*
2028 * In pass through mode, AW must be programmed to
2029 * indicate the largest AGAW value supported by
2030 * hardware. And ASR is ignored by hardware.
2031 */
2032 context_set_address_width(context, iommu->msagaw);
2033 }
2034
2035 context_set_translation_type(context, translation);
2036 }
2037
2038 context_set_fault_enable(context);
2039 context_set_present(context);
2040 if (!ecap_coherent(iommu->ecap))
2041 clflush_cache_range(context, sizeof(*context));
2042
2043 /*
2044 * It's a non-present to present mapping. If hardware doesn't cache
2045 * non-present entries we only need to flush the write-buffer. If it
2046 * _does_ cache non-present entries, then it does so in the special
2047 * domain #0, which we have to flush:
2048 */
2049 if (cap_caching_mode(iommu->cap)) {
2050 iommu->flush.flush_context(iommu, 0,
2051 (((u16)bus) << 8) | devfn,
2052 DMA_CCMD_MASK_NOBIT,
2053 DMA_CCMD_DEVICE_INVL);
2054 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2055 } else {
2056 iommu_flush_write_buffer(iommu);
2057 }
2058
2059 ret = 0;
2060
2061 out_unlock:
2062 spin_unlock(&iommu->lock);
2063
2064 return ret;
2065 }
2066
2067 struct domain_context_mapping_data {
2068 struct dmar_domain *domain;
2069 struct intel_iommu *iommu;
2070 struct pasid_table *table;
2071 };
2072
2073 static int domain_context_mapping_cb(struct pci_dev *pdev,
2074 u16 alias, void *opaque)
2075 {
2076 struct domain_context_mapping_data *data = opaque;
2077
2078 return domain_context_mapping_one(data->domain, data->iommu,
2079 data->table, PCI_BUS_NUM(alias),
2080 alias & 0xff);
2081 }
2082
2083 static int
2084 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085 {
2086 struct domain_context_mapping_data data;
2087 struct pasid_table *table;
2088 struct intel_iommu *iommu;
2089 u8 bus, devfn;
2090
2091 iommu = device_to_iommu(dev, &bus, &devfn);
2092 if (!iommu)
2093 return -ENODEV;
2094
2095 table = intel_pasid_get_table(dev);
2096
2097 if (!dev_is_pci(dev))
2098 return domain_context_mapping_one(domain, iommu, table,
2099 bus, devfn);
2100
2101 data.domain = domain;
2102 data.iommu = iommu;
2103 data.table = table;
2104
2105 return pci_for_each_dma_alias(to_pci_dev(dev),
2106 &domain_context_mapping_cb, &data);
2107 }
2108
2109 /* Returns a number of VTD pages, but aligned to MM page size */
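/*
 * For example, with 4KiB MM and VT-d pages, host_addr = 0x1234 and
 * size = 0x2000 give an in-page offset of 0x234, PAGE_ALIGN(0x2234) =
 * 0x3000, and therefore 3 VT-d pages.
 */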
2110 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111 size_t size)
2112 {
2113 host_addr &= ~PAGE_MASK;
2114 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115 }
2116
2117 /* Return largest possible superpage level for a given mapping */
2118 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119 unsigned long iov_pfn,
2120 unsigned long phy_pfn,
2121 unsigned long pages)
2122 {
2123 int support, level = 1;
2124 unsigned long pfnmerge;
2125
2126 support = domain->iommu_superpage;
2127
2128 /* To use a large page, the virtual *and* physical addresses
2129 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130 of them will mean we have to use smaller pages. So just
2131 merge them and check both at once. */
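/*
 * For example (assuming VTD_STRIDE_SHIFT is 9 and at least one superpage
 * level is supported): iov_pfn = phy_pfn = 0x200 and pages = 0x200 pass one
 * iteration of the loop below, giving level 2, i.e. a 2MiB superpage with
 * 4KiB base pages.
 */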
2132 pfnmerge = iov_pfn | phy_pfn;
2133
2134 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135 pages >>= VTD_STRIDE_SHIFT;
2136 if (!pages)
2137 break;
2138 pfnmerge >>= VTD_STRIDE_SHIFT;
2139 level++;
2140 support--;
2141 }
2142 return level;
2143 }
2144
2145 /*
2146 * Ensure that old small page tables are removed to make room for superpage(s).
2147 * We're going to add new large pages, so make sure we don't remove their parent
2148 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149 */
2150 static void switch_to_super_page(struct dmar_domain *domain,
2151 unsigned long start_pfn,
2152 unsigned long end_pfn, int level)
2153 {
2154 unsigned long lvl_pages = lvl_to_nr_pages(level);
2155 struct iommu_domain_info *info;
2156 struct dma_pte *pte = NULL;
2157 unsigned long i;
2158
2159 while (start_pfn <= end_pfn) {
2160 if (!pte)
2161 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162 GFP_ATOMIC);
2163
2164 if (dma_pte_present(pte)) {
2165 dma_pte_free_pagetable(domain, start_pfn,
2166 start_pfn + lvl_pages - 1,
2167 level + 1);
2168
2169 xa_for_each(&domain->iommu_array, i, info)
2170 iommu_flush_iotlb_psi(info->iommu, domain,
2171 start_pfn, lvl_pages,
2172 0, 0);
2173 }
2174
2175 pte++;
2176 start_pfn += lvl_pages;
2177 if (first_pte_in_page(pte))
2178 pte = NULL;
2179 }
2180 }
2181
2182 static int
2183 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185 gfp_t gfp)
2186 {
2187 struct dma_pte *first_pte = NULL, *pte = NULL;
2188 unsigned int largepage_lvl = 0;
2189 unsigned long lvl_pages = 0;
2190 phys_addr_t pteval;
2191 u64 attr;
2192
2193 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194 return -EINVAL;
2195
2196 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197 return -EINVAL;
2198
2199 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2200 attr |= DMA_FL_PTE_PRESENT;
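/*
 * First-level page tables use the CPU paging format, so when they are in
 * use the PTEs also carry the user, accessed and execute-disable bits,
 * plus the dirty bit for writable mappings.
 */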
2201 if (domain->use_first_level) {
2202 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203 if (prot & DMA_PTE_WRITE)
2204 attr |= DMA_FL_PTE_DIRTY;
2205 }
2206
2207 domain->has_mappings = true;
2208
2209 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2210
2211 while (nr_pages > 0) {
2212 uint64_t tmp;
2213
2214 if (!pte) {
2215 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2216 phys_pfn, nr_pages);
2217
2218 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2219 gfp);
2220 if (!pte)
2221 return -ENOMEM;
2222 first_pte = pte;
2223
2224 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2225
2226 /* It is a large page */
2227 if (largepage_lvl > 1) {
2228 unsigned long end_pfn;
2229 unsigned long pages_to_remove;
2230
2231 pteval |= DMA_PTE_LARGE_PAGE;
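/*
 * Limit this superpage run to the PTEs remaining in the current
 * page-table page, then tear down any smaller-level tables that
 * previously covered the range.
 */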
2232 pages_to_remove = min_t(unsigned long, nr_pages,
2233 nr_pte_to_next_page(pte) * lvl_pages);
2234 end_pfn = iov_pfn + pages_to_remove - 1;
2235 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2236 } else {
2237 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2238 }
2239
2240 }
2241 /* We don't need a lock here, nobody else
2242 * touches the iova range
2243 */
2244 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2245 if (tmp) {
2246 static int dumps = 5;
2247 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2248 iov_pfn, tmp, (unsigned long long)pteval);
2249 if (dumps) {
2250 dumps--;
2251 debug_dma_dump_mappings(NULL);
2252 }
2253 WARN_ON(1);
2254 }
2255
2256 nr_pages -= lvl_pages;
2257 iov_pfn += lvl_pages;
2258 phys_pfn += lvl_pages;
2259 pteval += lvl_pages * VTD_PAGE_SIZE;
2260
2261 /* If the next PTE would be the first in a new page, then we
2262 * need to flush the cache on the entries we've just written.
2263 * And then we'll need to recalculate 'pte', so clear it and
2264 * let it get set again in the if (!pte) block above.
2265 *
2266 * If we're done (!nr_pages) we need to flush the cache too.
2267 *
2268 * Also if we've been setting superpages, we may need to
2269 * recalculate 'pte' and switch back to smaller pages for the
2270 * end of the mapping, if the trailing size is not enough to
2271 * use another superpage (i.e. nr_pages < lvl_pages).
2272 */
2273 pte++;
2274 if (!nr_pages || first_pte_in_page(pte) ||
2275 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2276 domain_flush_cache(domain, first_pte,
2277 (void *)pte - (void *)first_pte);
2278 pte = NULL;
2279 }
2280 }
2281
2282 return 0;
2283 }
2284
2285 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2286 {
2287 struct intel_iommu *iommu = info->iommu;
2288 struct context_entry *context;
2289 u16 did_old;
2290
2291 if (!iommu)
2292 return;
2293
2294 spin_lock(&iommu->lock);
2295 context = iommu_context_addr(iommu, bus, devfn, 0);
2296 if (!context) {
2297 spin_unlock(&iommu->lock);
2298 return;
2299 }
2300
2301 if (sm_supported(iommu)) {
2302 if (hw_pass_through && domain_type_is_si(info->domain))
2303 did_old = FLPT_DEFAULT_DID;
2304 else
2305 did_old = domain_id_iommu(info->domain, iommu);
2306 } else {
2307 did_old = context_domain_id(context);
2308 }
2309
2310 context_clear_entry(context);
2311 __iommu_flush_cache(iommu, context, sizeof(*context));
2312 spin_unlock(&iommu->lock);
2313 iommu->flush.flush_context(iommu,
2314 did_old,
2315 (((u16)bus) << 8) | devfn,
2316 DMA_CCMD_MASK_NOBIT,
2317 DMA_CCMD_DEVICE_INVL);
2318
2319 if (sm_supported(iommu))
2320 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2321
2322 iommu->flush.flush_iotlb(iommu,
2323 did_old,
2324 0,
2325 0,
2326 DMA_TLB_DSI_FLUSH);
2327
2328 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2329 }
2330
2331 static int domain_setup_first_level(struct intel_iommu *iommu,
2332 struct dmar_domain *domain,
2333 struct device *dev,
2334 u32 pasid)
2335 {
2336 struct dma_pte *pgd = domain->pgd;
2337 int agaw, level;
2338 int flags = 0;
2339
2340 /*
2341 * Skip top levels of page tables for iommu which has
2342 * less agaw than default. Unnecessary for PT mode.
2343 */
2344 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2345 pgd = phys_to_virt(dma_pte_addr(pgd));
2346 if (!dma_pte_present(pgd))
2347 return -ENOMEM;
2348 }
2349
2350 level = agaw_to_level(agaw);
2351 if (level != 4 && level != 5)
2352 return -EINVAL;
2353
2354 if (level == 5)
2355 flags |= PASID_FLAG_FL5LP;
2356
2357 if (domain->force_snooping)
2358 flags |= PASID_FLAG_PAGE_SNOOP;
2359
2360 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2361 domain_id_iommu(domain, iommu),
2362 flags);
2363 }
2364
2365 static bool dev_is_real_dma_subdevice(struct device *dev)
2366 {
2367 return dev && dev_is_pci(dev) &&
2368 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2369 }
2370
2371 static int iommu_domain_identity_map(struct dmar_domain *domain,
2372 unsigned long first_vpfn,
2373 unsigned long last_vpfn)
2374 {
2375 /*
2376 * RMRR range might have overlap with physical memory range,
2377 * clear it first
2378 */
2379 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2380
2381 return __domain_mapping(domain, first_vpfn,
2382 first_vpfn, last_vpfn - first_vpfn + 1,
2383 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2384 }
2385
2386 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2387
2388 static int __init si_domain_init(int hw)
2389 {
2390 struct dmar_rmrr_unit *rmrr;
2391 struct device *dev;
2392 int i, nid, ret;
2393
2394 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2395 if (!si_domain)
2396 return -EFAULT;
2397
2398 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2399 domain_exit(si_domain);
2400 si_domain = NULL;
2401 return -EFAULT;
2402 }
2403
2404 if (hw)
2405 return 0;
2406
2407 for_each_online_node(nid) {
2408 unsigned long start_pfn, end_pfn;
2409 int i;
2410
2411 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2412 ret = iommu_domain_identity_map(si_domain,
2413 mm_to_dma_pfn_start(start_pfn),
2414 mm_to_dma_pfn_end(end_pfn-1));
2415 if (ret)
2416 return ret;
2417 }
2418 }
2419
2420 /*
2421 * Identity map the RMRRs so that devices with RMRRs could also use
2422 * the si_domain.
2423 */
2424 for_each_rmrr_units(rmrr) {
2425 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2426 i, dev) {
2427 unsigned long long start = rmrr->base_address;
2428 unsigned long long end = rmrr->end_address;
2429
2430 if (WARN_ON(end < start ||
2431 end >> agaw_to_width(si_domain->agaw)))
2432 continue;
2433
2434 ret = iommu_domain_identity_map(si_domain,
2435 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2436 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2437 if (ret)
2438 return ret;
2439 }
2440 }
2441
2442 return 0;
2443 }
2444
2445 static int dmar_domain_attach_device(struct dmar_domain *domain,
2446 struct device *dev)
2447 {
2448 struct device_domain_info *info = dev_iommu_priv_get(dev);
2449 struct intel_iommu *iommu;
2450 unsigned long flags;
2451 u8 bus, devfn;
2452 int ret;
2453
2454 iommu = device_to_iommu(dev, &bus, &devfn);
2455 if (!iommu)
2456 return -ENODEV;
2457
2458 ret = domain_attach_iommu(domain, iommu);
2459 if (ret)
2460 return ret;
2461 info->domain = domain;
2462 spin_lock_irqsave(&domain->lock, flags);
2463 list_add(&info->link, &domain->devices);
2464 spin_unlock_irqrestore(&domain->lock, flags);
2465
2466 /* PASID table is mandatory for a PCI device in scalable mode. */
2467 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2468 /* Setup the PASID entry for requests without PASID: */
2469 if (hw_pass_through && domain_type_is_si(domain))
2470 ret = intel_pasid_setup_pass_through(iommu, domain,
2471 dev, IOMMU_NO_PASID);
2472 else if (domain->use_first_level)
2473 ret = domain_setup_first_level(iommu, domain, dev,
2474 IOMMU_NO_PASID);
2475 else
2476 ret = intel_pasid_setup_second_level(iommu, domain,
2477 dev, IOMMU_NO_PASID);
2478 if (ret) {
2479 dev_err(dev, "Setup RID2PASID failed\n");
2480 device_block_translation(dev);
2481 return ret;
2482 }
2483 }
2484
2485 ret = domain_context_mapping(domain, dev);
2486 if (ret) {
2487 dev_err(dev, "Domain context map failed\n");
2488 device_block_translation(dev);
2489 return ret;
2490 }
2491
2492 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2493 iommu_enable_pci_caps(info);
2494
2495 return 0;
2496 }
2497
2498 /**
2499 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500 * is relaxable (i.e. it is allowed to go unenforced under some conditions)
2501 * @dev: device handle
2502 *
2503 * We assume that PCI USB devices with RMRRs have them largely
2504 * for historical reasons and that the RMRR space is not actively used post
2505 * boot. This exclusion may change if vendors begin to abuse it.
2506 *
2507 * The same exception is made for graphics devices, with the requirement that
2508 * any use of the RMRR regions will be torn down before assigning the device
2509 * to a guest.
2510 *
2511 * Return: true if the RMRR is relaxable, false otherwise
2512 */
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2514 {
2515 struct pci_dev *pdev;
2516
2517 if (!dev_is_pci(dev))
2518 return false;
2519
2520 pdev = to_pci_dev(dev);
2521 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522 return true;
2523 else
2524 return false;
2525 }
2526
2527 /*
2528 * Return the required default domain type for a specific device.
2529 *
2530 * @dev: the device being queried
2532 *
2533 * Returns:
2534 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2536 * - 0: both identity and dynamic domains work for this device
2537 */
2538 static int device_def_domain_type(struct device *dev)
2539 {
2540 if (dev_is_pci(dev)) {
2541 struct pci_dev *pdev = to_pci_dev(dev);
2542
2543 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544 return IOMMU_DOMAIN_IDENTITY;
2545
2546 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547 return IOMMU_DOMAIN_IDENTITY;
2548 }
2549
2550 return 0;
2551 }
2552
2553 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2554 {
2555 /*
2556 * Start from the sane iommu hardware state.
2557 * If the queued invalidation is already initialized by us
2558 * (for example, while enabling interrupt-remapping) then
2559 * things are already rolling from a sane state.
2560 */
2561 if (!iommu->qi) {
2562 /*
2563 * Clear any previous faults.
2564 */
2565 dmar_fault(-1, iommu);
2566 /*
2567 * Disable queued invalidation if supported and already enabled
2568 * before OS handover.
2569 */
2570 dmar_disable_qi(iommu);
2571 }
2572
2573 if (dmar_enable_qi(iommu)) {
2574 /*
2575 * Queued Invalidate not enabled, use Register Based Invalidate
2576 */
2577 iommu->flush.flush_context = __iommu_flush_context;
2578 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579 pr_info("%s: Using Register based invalidation\n",
2580 iommu->name);
2581 } else {
2582 iommu->flush.flush_context = qi_flush_context;
2583 iommu->flush.flush_iotlb = qi_flush_iotlb;
2584 pr_info("%s: Using Queued invalidation\n", iommu->name);
2585 }
2586 }
2587
2588 static int copy_context_table(struct intel_iommu *iommu,
2589 struct root_entry *old_re,
2590 struct context_entry **tbl,
2591 int bus, bool ext)
2592 {
2593 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594 struct context_entry *new_ce = NULL, ce;
2595 struct context_entry *old_ce = NULL;
2596 struct root_entry re;
2597 phys_addr_t old_ce_phys;
2598
2599 tbl_idx = ext ? bus * 2 : bus;
2600 memcpy(&re, old_re, sizeof(re));
2601
2602 for (devfn = 0; devfn < 256; devfn++) {
2603 /* First calculate the correct index */
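/*
 * In extended/scalable mode the old context entries are twice as wide, so a
 * 4KiB table holds only 128 of them: devfn is doubled, wraps at 256, and
 * devfn 0x80 onwards switches from the lower to the upper context-table
 * pointer of the root entry.
 */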
2604 idx = (ext ? devfn * 2 : devfn) % 256;
2605
2606 if (idx == 0) {
2607 /* First save what we may have and clean up */
2608 if (new_ce) {
2609 tbl[tbl_idx] = new_ce;
2610 __iommu_flush_cache(iommu, new_ce,
2611 VTD_PAGE_SIZE);
2612 pos = 1;
2613 }
2614
2615 if (old_ce)
2616 memunmap(old_ce);
2617
2618 ret = 0;
2619 if (devfn < 0x80)
2620 old_ce_phys = root_entry_lctp(&re);
2621 else
2622 old_ce_phys = root_entry_uctp(&re);
2623
2624 if (!old_ce_phys) {
2625 if (ext && devfn == 0) {
2626 /* No LCTP, try UCTP */
2627 devfn = 0x7f;
2628 continue;
2629 } else {
2630 goto out;
2631 }
2632 }
2633
2634 ret = -ENOMEM;
2635 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2636 MEMREMAP_WB);
2637 if (!old_ce)
2638 goto out;
2639
2640 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2641 if (!new_ce)
2642 goto out_unmap;
2643
2644 ret = 0;
2645 }
2646
2647 /* Now copy the context entry */
2648 memcpy(&ce, old_ce + idx, sizeof(ce));
2649
2650 if (!context_present(&ce))
2651 continue;
2652
2653 did = context_domain_id(&ce);
2654 if (did >= 0 && did < cap_ndoms(iommu->cap))
2655 set_bit(did, iommu->domain_ids);
2656
2657 set_context_copied(iommu, bus, devfn);
2658 new_ce[idx] = ce;
2659 }
2660
2661 tbl[tbl_idx + pos] = new_ce;
2662
2663 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2664
2665 out_unmap:
2666 memunmap(old_ce);
2667
2668 out:
2669 return ret;
2670 }
2671
2672 static int copy_translation_tables(struct intel_iommu *iommu)
2673 {
2674 struct context_entry **ctxt_tbls;
2675 struct root_entry *old_rt;
2676 phys_addr_t old_rt_phys;
2677 int ctxt_table_entries;
2678 u64 rtaddr_reg;
2679 int bus, ret;
2680 bool new_ext, ext;
2681
2682 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684 new_ext = !!sm_supported(iommu);
2685
2686 /*
2687 * The RTT bit can only be changed when translation is disabled,
2688 * but disabling translation would open a window for data
2689 * corruption. So bail out and don't copy anything if we would
2690 * have to change the bit.
2691 */
2692 if (new_ext != ext)
2693 return -EINVAL;
2694
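/* One bit per possible source-id (bus << 8 | devfn), i.e. 2^16 bits. */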
2695 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696 if (!iommu->copied_tables)
2697 return -ENOMEM;
2698
2699 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2700 if (!old_rt_phys)
2701 return -EINVAL;
2702
2703 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2704 if (!old_rt)
2705 return -ENOMEM;
2706
2707 /* This is too big for the stack - allocate it from slab */
2708 ctxt_table_entries = ext ? 512 : 256;
2709 ret = -ENOMEM;
2710 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2711 if (!ctxt_tbls)
2712 goto out_unmap;
2713
2714 for (bus = 0; bus < 256; bus++) {
2715 ret = copy_context_table(iommu, &old_rt[bus],
2716 ctxt_tbls, bus, ext);
2717 if (ret) {
2718 pr_err("%s: Failed to copy context table for bus %d\n",
2719 iommu->name, bus);
2720 continue;
2721 }
2722 }
2723
2724 spin_lock(&iommu->lock);
2725
2726 /* Context tables are copied, now write them to the root_entry table */
2727 for (bus = 0; bus < 256; bus++) {
2728 int idx = ext ? bus * 2 : bus;
2729 u64 val;
2730
2731 if (ctxt_tbls[idx]) {
2732 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733 iommu->root_entry[bus].lo = val;
2734 }
2735
2736 if (!ext || !ctxt_tbls[idx + 1])
2737 continue;
2738
2739 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740 iommu->root_entry[bus].hi = val;
2741 }
2742
2743 spin_unlock(&iommu->lock);
2744
2745 kfree(ctxt_tbls);
2746
2747 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2748
2749 ret = 0;
2750
2751 out_unmap:
2752 memunmap(old_rt);
2753
2754 return ret;
2755 }
2756
2757 static int __init init_dmars(void)
2758 {
2759 struct dmar_drhd_unit *drhd;
2760 struct intel_iommu *iommu;
2761 int ret;
2762
2763 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2764 if (ret)
2765 goto free_iommu;
2766
2767 for_each_iommu(iommu, drhd) {
2768 if (drhd->ignored) {
2769 iommu_disable_translation(iommu);
2770 continue;
2771 }
2772
2773 /*
2774 * Find the max PASID size of all IOMMUs in the system.
2775 * We need to ensure the system pasid table is no bigger
2776 * than the smallest supported.
2777 */
2778 if (pasid_supported(iommu)) {
2779 u32 temp = 2 << ecap_pss(iommu->ecap);
2780
2781 intel_pasid_max_id = min_t(u32, temp,
2782 intel_pasid_max_id);
2783 }
2784
2785 intel_iommu_init_qi(iommu);
2786
2787 ret = iommu_init_domains(iommu);
2788 if (ret)
2789 goto free_iommu;
2790
2791 init_translation_status(iommu);
2792
2793 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794 iommu_disable_translation(iommu);
2795 clear_translation_pre_enabled(iommu);
2796 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2797 iommu->name);
2798 }
2799
2800 /*
2801 * TBD:
2802 * we could share the same root & context tables
2803 * among all IOMMUs. Need to split it later.
2804 */
2805 ret = iommu_alloc_root_entry(iommu);
2806 if (ret)
2807 goto free_iommu;
2808
2809 if (translation_pre_enabled(iommu)) {
2810 pr_info("Translation already enabled - trying to copy translation structures\n");
2811
2812 ret = copy_translation_tables(iommu);
2813 if (ret) {
2814 /*
2815 * We found the IOMMU with translation
2816 * enabled - but failed to copy over the
2817 * old root-entry table. Try to proceed
2818 * by disabling translation now and
2819 * allocating a clean root-entry table.
2820 * This might cause DMAR faults, but
2821 * probably the dump will still succeed.
2822 */
2823 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2824 iommu->name);
2825 iommu_disable_translation(iommu);
2826 clear_translation_pre_enabled(iommu);
2827 } else {
2828 pr_info("Copied translation tables from previous kernel for %s\n",
2829 iommu->name);
2830 }
2831 }
2832
2833 if (!ecap_pass_through(iommu->ecap))
2834 hw_pass_through = 0;
2835 intel_svm_check(iommu);
2836 }
2837
2838 /*
2839 * Now that qi is enabled on all iommus, set the root entry and flush
2840 * caches. This is required on some Intel X58 chipsets, otherwise the
2841 * flush_context function will loop forever and the boot hangs.
2842 */
2843 for_each_active_iommu(iommu, drhd) {
2844 iommu_flush_write_buffer(iommu);
2845 iommu_set_root_entry(iommu);
2846 }
2847
2848 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849 dmar_map_gfx = 0;
2850 #endif
2851
2852 if (!dmar_map_gfx)
2853 iommu_identity_mapping |= IDENTMAP_GFX;
2854
2855 check_tylersburg_isoch();
2856
2857 ret = si_domain_init(hw_pass_through);
2858 if (ret)
2859 goto free_iommu;
2860
2861 /*
2862 * for each drhd
2863 * enable fault log
2864 * global invalidate context cache
2865 * global invalidate iotlb
2866 * enable translation
2867 */
2868 for_each_iommu(iommu, drhd) {
2869 if (drhd->ignored) {
2870 /*
2871 * we always have to disable PMRs or DMA may fail on
2872 * this device
2873 */
2874 if (force_on)
2875 iommu_disable_protect_mem_regions(iommu);
2876 continue;
2877 }
2878
2879 iommu_flush_write_buffer(iommu);
2880
2881 #ifdef CONFIG_INTEL_IOMMU_SVM
2882 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2883 /*
2884 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2885 * could cause a lock race condition.
2886 */
2887 up_write(&dmar_global_lock);
2888 ret = intel_svm_enable_prq(iommu);
2889 down_write(&dmar_global_lock);
2890 if (ret)
2891 goto free_iommu;
2892 }
2893 #endif
2894 ret = dmar_set_interrupt(iommu);
2895 if (ret)
2896 goto free_iommu;
2897 }
2898
2899 return 0;
2900
2901 free_iommu:
2902 for_each_active_iommu(iommu, drhd) {
2903 disable_dmar_iommu(iommu);
2904 free_dmar_iommu(iommu);
2905 }
2906 if (si_domain) {
2907 domain_exit(si_domain);
2908 si_domain = NULL;
2909 }
2910
2911 return ret;
2912 }
2913
2914 static void __init init_no_remapping_devices(void)
2915 {
2916 struct dmar_drhd_unit *drhd;
2917 struct device *dev;
2918 int i;
2919
2920 for_each_drhd_unit(drhd) {
2921 if (!drhd->include_all) {
2922 for_each_active_dev_scope(drhd->devices,
2923 drhd->devices_cnt, i, dev)
2924 break;
2925 /* ignore DMAR unit if no devices exist */
2926 if (i == drhd->devices_cnt)
2927 drhd->ignored = 1;
2928 }
2929 }
2930
2931 for_each_active_drhd_unit(drhd) {
2932 if (drhd->include_all)
2933 continue;
2934
2935 for_each_active_dev_scope(drhd->devices,
2936 drhd->devices_cnt, i, dev)
2937 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2938 break;
2939 if (i < drhd->devices_cnt)
2940 continue;
2941
2942 /* This IOMMU has *only* gfx devices. Either bypass it or
2943 set the gfx_dedicated flag, as appropriate */
2944 drhd->gfx_dedicated = 1;
2945 if (!dmar_map_gfx)
2946 drhd->ignored = 1;
2947 }
2948 }
2949
2950 #ifdef CONFIG_SUSPEND
2951 static int init_iommu_hw(void)
2952 {
2953 struct dmar_drhd_unit *drhd;
2954 struct intel_iommu *iommu = NULL;
2955 int ret;
2956
2957 for_each_active_iommu(iommu, drhd) {
2958 if (iommu->qi) {
2959 ret = dmar_reenable_qi(iommu);
2960 if (ret)
2961 return ret;
2962 }
2963 }
2964
2965 for_each_iommu(iommu, drhd) {
2966 if (drhd->ignored) {
2967 /*
2968 * we always have to disable PMRs or DMA may fail on
2969 * this device
2970 */
2971 if (force_on)
2972 iommu_disable_protect_mem_regions(iommu);
2973 continue;
2974 }
2975
2976 iommu_flush_write_buffer(iommu);
2977 iommu_set_root_entry(iommu);
2978 iommu_enable_translation(iommu);
2979 iommu_disable_protect_mem_regions(iommu);
2980 }
2981
2982 return 0;
2983 }
2984
2985 static void iommu_flush_all(void)
2986 {
2987 struct dmar_drhd_unit *drhd;
2988 struct intel_iommu *iommu;
2989
2990 for_each_active_iommu(iommu, drhd) {
2991 iommu->flush.flush_context(iommu, 0, 0, 0,
2992 DMA_CCMD_GLOBAL_INVL);
2993 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994 DMA_TLB_GLOBAL_FLUSH);
2995 }
2996 }
2997
2998 static int iommu_suspend(void)
2999 {
3000 struct dmar_drhd_unit *drhd;
3001 struct intel_iommu *iommu = NULL;
3002 unsigned long flag;
3003
3004 iommu_flush_all();
3005
3006 for_each_active_iommu(iommu, drhd) {
3007 iommu_disable_translation(iommu);
3008
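/* Save the fault-event control, data and address registers for resume. */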
3009 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3010
3011 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012 readl(iommu->reg + DMAR_FECTL_REG);
3013 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014 readl(iommu->reg + DMAR_FEDATA_REG);
3015 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016 readl(iommu->reg + DMAR_FEADDR_REG);
3017 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018 readl(iommu->reg + DMAR_FEUADDR_REG);
3019
3020 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3021 }
3022 return 0;
3023 }
3024
3025 static void iommu_resume(void)
3026 {
3027 struct dmar_drhd_unit *drhd;
3028 struct intel_iommu *iommu = NULL;
3029 unsigned long flag;
3030
3031 if (init_iommu_hw()) {
3032 if (force_on)
3033 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3034 else
3035 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3036 return;
3037 }
3038
3039 for_each_active_iommu(iommu, drhd) {
3040
3041 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3042
3043 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044 iommu->reg + DMAR_FECTL_REG);
3045 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046 iommu->reg + DMAR_FEDATA_REG);
3047 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048 iommu->reg + DMAR_FEADDR_REG);
3049 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050 iommu->reg + DMAR_FEUADDR_REG);
3051
3052 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3053 }
3054 }
3055
3056 static struct syscore_ops iommu_syscore_ops = {
3057 .resume = iommu_resume,
3058 .suspend = iommu_suspend,
3059 };
3060
3061 static void __init init_iommu_pm_ops(void)
3062 {
3063 register_syscore_ops(&iommu_syscore_ops);
3064 }
3065
3066 #else
3067 static inline void init_iommu_pm_ops(void) {}
3068 #endif /* CONFIG_SUSPEND */
3069
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071 {
3072 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074 rmrr->end_address <= rmrr->base_address ||
3075 arch_rmrr_sanity_check(rmrr))
3076 return -EINVAL;
3077
3078 return 0;
3079 }
3080
3081 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3082 {
3083 struct acpi_dmar_reserved_memory *rmrr;
3084 struct dmar_rmrr_unit *rmrru;
3085
3086 rmrr = (struct acpi_dmar_reserved_memory *)header;
3087 if (rmrr_sanity_check(rmrr)) {
3088 pr_warn(FW_BUG
3089 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091 rmrr->base_address, rmrr->end_address,
3092 dmi_get_system_info(DMI_BIOS_VENDOR),
3093 dmi_get_system_info(DMI_BIOS_VERSION),
3094 dmi_get_system_info(DMI_PRODUCT_VERSION));
3095 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3096 }
3097
3098 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3099 if (!rmrru)
3100 goto out;
3101
3102 rmrru->hdr = header;
3103
3104 rmrru->base_address = rmrr->base_address;
3105 rmrru->end_address = rmrr->end_address;
3106
3107 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108 ((void *)rmrr) + rmrr->header.length,
3109 &rmrru->devices_cnt);
3110 if (rmrru->devices_cnt && rmrru->devices == NULL)
3111 goto free_rmrru;
3112
3113 list_add(&rmrru->list, &dmar_rmrr_units);
3114
3115 return 0;
3116 free_rmrru:
3117 kfree(rmrru);
3118 out:
3119 return -ENOMEM;
3120 }
3121
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123 {
3124 struct dmar_atsr_unit *atsru;
3125 struct acpi_dmar_atsr *tmp;
3126
3127 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128 dmar_rcu_check()) {
3129 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130 if (atsr->segment != tmp->segment)
3131 continue;
3132 if (atsr->header.length != tmp->header.length)
3133 continue;
3134 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135 return atsru;
3136 }
3137
3138 return NULL;
3139 }
3140
3141 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3142 {
3143 struct acpi_dmar_atsr *atsr;
3144 struct dmar_atsr_unit *atsru;
3145
3146 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3147 return 0;
3148
3149 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150 atsru = dmar_find_atsr(atsr);
3151 if (atsru)
3152 return 0;
3153
3154 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3155 if (!atsru)
3156 return -ENOMEM;
3157
3158 /*
3159 * If memory is allocated from slab by ACPI _DSM method, we need to
3160 * copy the memory content because the memory buffer will be freed
3161 * on return.
3162 */
3163 atsru->hdr = (void *)(atsru + 1);
3164 memcpy(atsru->hdr, hdr, hdr->length);
3165 atsru->include_all = atsr->flags & 0x1;
3166 if (!atsru->include_all) {
3167 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168 (void *)atsr + atsr->header.length,
3169 &atsru->devices_cnt);
3170 if (atsru->devices_cnt && atsru->devices == NULL) {
3171 kfree(atsru);
3172 return -ENOMEM;
3173 }
3174 }
3175
3176 list_add_rcu(&atsru->list, &dmar_atsr_units);
3177
3178 return 0;
3179 }
3180
3181 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3182 {
3183 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3184 kfree(atsru);
3185 }
3186
3187 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3188 {
3189 struct acpi_dmar_atsr *atsr;
3190 struct dmar_atsr_unit *atsru;
3191
3192 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193 atsru = dmar_find_atsr(atsr);
3194 if (atsru) {
3195 list_del_rcu(&atsru->list);
3196 synchronize_rcu();
3197 intel_iommu_free_atsr(atsru);
3198 }
3199
3200 return 0;
3201 }
3202
3203 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3204 {
3205 int i;
3206 struct device *dev;
3207 struct acpi_dmar_atsr *atsr;
3208 struct dmar_atsr_unit *atsru;
3209
3210 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211 atsru = dmar_find_atsr(atsr);
3212 if (!atsru)
3213 return 0;
3214
3215 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3217 i, dev)
3218 return -EBUSY;
3219 }
3220
3221 return 0;
3222 }
3223
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225 {
3226 struct dmar_satc_unit *satcu;
3227 struct acpi_dmar_satc *tmp;
3228
3229 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230 dmar_rcu_check()) {
3231 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232 if (satc->segment != tmp->segment)
3233 continue;
3234 if (satc->header.length != tmp->header.length)
3235 continue;
3236 if (memcmp(satc, tmp, satc->header.length) == 0)
3237 return satcu;
3238 }
3239
3240 return NULL;
3241 }
3242
3243 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3244 {
3245 struct acpi_dmar_satc *satc;
3246 struct dmar_satc_unit *satcu;
3247
3248 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3249 return 0;
3250
3251 satc = container_of(hdr, struct acpi_dmar_satc, header);
3252 satcu = dmar_find_satc(satc);
3253 if (satcu)
3254 return 0;
3255
3256 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3257 if (!satcu)
3258 return -ENOMEM;
3259
3260 satcu->hdr = (void *)(satcu + 1);
3261 memcpy(satcu->hdr, hdr, hdr->length);
3262 satcu->atc_required = satc->flags & 0x1;
3263 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264 (void *)satc + satc->header.length,
3265 &satcu->devices_cnt);
3266 if (satcu->devices_cnt && !satcu->devices) {
3267 kfree(satcu);
3268 return -ENOMEM;
3269 }
3270 list_add_rcu(&satcu->list, &dmar_satc_units);
3271
3272 return 0;
3273 }
3274
3275 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3276 {
3277 int sp, ret;
3278 struct intel_iommu *iommu = dmaru->iommu;
3279
3280 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3281 if (ret)
3282 goto out;
3283
3284 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285 pr_warn("%s: Doesn't support hardware pass through.\n",
3286 iommu->name);
3287 return -ENXIO;
3288 }
3289
3290 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292 pr_warn("%s: Doesn't support large page.\n",
3293 iommu->name);
3294 return -ENXIO;
3295 }
3296
3297 /*
3298 * Disable translation if already enabled prior to OS handover.
3299 */
3300 if (iommu->gcmd & DMA_GCMD_TE)
3301 iommu_disable_translation(iommu);
3302
3303 ret = iommu_init_domains(iommu);
3304 if (ret == 0)
3305 ret = iommu_alloc_root_entry(iommu);
3306 if (ret)
3307 goto out;
3308
3309 intel_svm_check(iommu);
3310
3311 if (dmaru->ignored) {
3312 /*
3313 * we always have to disable PMRs or DMA may fail on this device
3314 */
3315 if (force_on)
3316 iommu_disable_protect_mem_regions(iommu);
3317 return 0;
3318 }
3319
3320 intel_iommu_init_qi(iommu);
3321 iommu_flush_write_buffer(iommu);
3322
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325 ret = intel_svm_enable_prq(iommu);
3326 if (ret)
3327 goto disable_iommu;
3328 }
3329 #endif
3330 ret = dmar_set_interrupt(iommu);
3331 if (ret)
3332 goto disable_iommu;
3333
3334 iommu_set_root_entry(iommu);
3335 iommu_enable_translation(iommu);
3336
3337 iommu_disable_protect_mem_regions(iommu);
3338 return 0;
3339
3340 disable_iommu:
3341 disable_dmar_iommu(iommu);
3342 out:
3343 free_dmar_iommu(iommu);
3344 return ret;
3345 }
3346
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348 {
3349 int ret = 0;
3350 struct intel_iommu *iommu = dmaru->iommu;
3351
3352 if (!intel_iommu_enabled)
3353 return 0;
3354 if (iommu == NULL)
3355 return -EINVAL;
3356
3357 if (insert) {
3358 ret = intel_iommu_add(dmaru);
3359 } else {
3360 disable_dmar_iommu(iommu);
3361 free_dmar_iommu(iommu);
3362 }
3363
3364 return ret;
3365 }
3366
3367 static void intel_iommu_free_dmars(void)
3368 {
3369 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370 struct dmar_atsr_unit *atsru, *atsr_n;
3371 struct dmar_satc_unit *satcu, *satc_n;
3372
3373 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374 list_del(&rmrru->list);
3375 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3376 kfree(rmrru);
3377 }
3378
3379 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380 list_del(&atsru->list);
3381 intel_iommu_free_atsr(atsru);
3382 }
3383 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384 list_del(&satcu->list);
3385 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3386 kfree(satcu);
3387 }
3388 }
3389
3390 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3391 {
3392 struct dmar_satc_unit *satcu;
3393 struct acpi_dmar_satc *satc;
3394 struct device *tmp;
3395 int i;
3396
3397 dev = pci_physfn(dev);
3398 rcu_read_lock();
3399
3400 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402 if (satc->segment != pci_domain_nr(dev->bus))
3403 continue;
3404 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405 if (to_pci_dev(tmp) == dev)
3406 goto out;
3407 }
3408 satcu = NULL;
3409 out:
3410 rcu_read_unlock();
3411 return satcu;
3412 }
3413
3414 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3415 {
3416 int i, ret = 1;
3417 struct pci_bus *bus;
3418 struct pci_dev *bridge = NULL;
3419 struct device *tmp;
3420 struct acpi_dmar_atsr *atsr;
3421 struct dmar_atsr_unit *atsru;
3422 struct dmar_satc_unit *satcu;
3423
3424 dev = pci_physfn(dev);
3425 satcu = dmar_find_matched_satc_unit(dev);
3426 if (satcu)
3427 /*
3428 * This device supports ATS as it is in the SATC table.
3429 * When the IOMMU is in legacy mode, ATS is enabled
3430 * automatically by HW for devices that require it,
3431 * hence the OS should not enable ATS for this device,
3432 * to avoid duplicated TLB invalidations.
3433 */
3434 return !(satcu->atc_required && !sm_supported(iommu));
3435
3436 for (bus = dev->bus; bus; bus = bus->parent) {
3437 bridge = bus->self;
3438 /* If it's an integrated device, allow ATS */
3439 if (!bridge)
3440 return 1;
3441 /* Connected via non-PCIe: no ATS */
3442 if (!pci_is_pcie(bridge) ||
3443 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3444 return 0;
3445 /* If we found the root port, look it up in the ATSR */
3446 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3447 break;
3448 }
3449
3450 rcu_read_lock();
3451 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453 if (atsr->segment != pci_domain_nr(dev->bus))
3454 continue;
3455
3456 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457 if (tmp == &bridge->dev)
3458 goto out;
3459
3460 if (atsru->include_all)
3461 goto out;
3462 }
3463 ret = 0;
3464 out:
3465 rcu_read_unlock();
3466
3467 return ret;
3468 }
3469
3470 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3471 {
3472 int ret;
3473 struct dmar_rmrr_unit *rmrru;
3474 struct dmar_atsr_unit *atsru;
3475 struct dmar_satc_unit *satcu;
3476 struct acpi_dmar_atsr *atsr;
3477 struct acpi_dmar_reserved_memory *rmrr;
3478 struct acpi_dmar_satc *satc;
3479
3480 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3481 return 0;
3482
3483 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484 rmrr = container_of(rmrru->hdr,
3485 struct acpi_dmar_reserved_memory, header);
3486 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488 ((void *)rmrr) + rmrr->header.length,
3489 rmrr->segment, rmrru->devices,
3490 rmrru->devices_cnt);
3491 if (ret < 0)
3492 return ret;
3493 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494 dmar_remove_dev_scope(info, rmrr->segment,
3495 rmrru->devices, rmrru->devices_cnt);
3496 }
3497 }
3498
3499 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500 if (atsru->include_all)
3501 continue;
3502
3503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506 (void *)atsr + atsr->header.length,
3507 atsr->segment, atsru->devices,
3508 atsru->devices_cnt);
3509 if (ret > 0)
3510 break;
3511 else if (ret < 0)
3512 return ret;
3513 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514 if (dmar_remove_dev_scope(info, atsr->segment,
3515 atsru->devices, atsru->devices_cnt))
3516 break;
3517 }
3518 }
3519 list_for_each_entry(satcu, &dmar_satc_units, list) {
3520 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523 (void *)satc + satc->header.length,
3524 satc->segment, satcu->devices,
3525 satcu->devices_cnt);
3526 if (ret > 0)
3527 break;
3528 else if (ret < 0)
3529 return ret;
3530 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531 if (dmar_remove_dev_scope(info, satc->segment,
3532 satcu->devices, satcu->devices_cnt))
3533 break;
3534 }
3535 }
3536
3537 return 0;
3538 }
3539
3540 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541 unsigned long val, void *v)
3542 {
3543 struct memory_notify *mhp = v;
3544 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3546 mhp->nr_pages - 1);
3547
3548 switch (val) {
3549 case MEM_GOING_ONLINE:
3550 if (iommu_domain_identity_map(si_domain,
3551 start_vpfn, last_vpfn)) {
3552 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553 start_vpfn, last_vpfn);
3554 return NOTIFY_BAD;
3555 }
3556 break;
3557
3558 case MEM_OFFLINE:
3559 case MEM_CANCEL_ONLINE:
3560 {
3561 struct dmar_drhd_unit *drhd;
3562 struct intel_iommu *iommu;
3563 LIST_HEAD(freelist);
3564
3565 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3566
3567 rcu_read_lock();
3568 for_each_active_iommu(iommu, drhd)
3569 iommu_flush_iotlb_psi(iommu, si_domain,
3570 start_vpfn, mhp->nr_pages,
3571 list_empty(&freelist), 0);
3572 rcu_read_unlock();
3573 put_pages_list(&freelist);
3574 }
3575 break;
3576 }
3577
3578 return NOTIFY_OK;
3579 }
3580
3581 static struct notifier_block intel_iommu_memory_nb = {
3582 .notifier_call = intel_iommu_memory_notifier,
3583 .priority = 0
3584 };
3585
3586 static void intel_disable_iommus(void)
3587 {
3588 struct intel_iommu *iommu = NULL;
3589 struct dmar_drhd_unit *drhd;
3590
3591 for_each_iommu(iommu, drhd)
3592 iommu_disable_translation(iommu);
3593 }
3594
3595 void intel_iommu_shutdown(void)
3596 {
3597 struct dmar_drhd_unit *drhd;
3598 struct intel_iommu *iommu = NULL;
3599
3600 if (no_iommu || dmar_disabled)
3601 return;
3602
3603 down_write(&dmar_global_lock);
3604
3605 /* Disable PMRs explicitly here. */
3606 for_each_iommu(iommu, drhd)
3607 iommu_disable_protect_mem_regions(iommu);
3608
3609 /* Make sure the IOMMUs are switched off */
3610 intel_disable_iommus();
3611
3612 up_write(&dmar_global_lock);
3613 }
3614
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616 {
3617 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618
3619 return container_of(iommu_dev, struct intel_iommu, iommu);
3620 }
3621
3622 static ssize_t version_show(struct device *dev,
3623 struct device_attribute *attr, char *buf)
3624 {
3625 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627 return sysfs_emit(buf, "%d:%d\n",
3628 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629 }
3630 static DEVICE_ATTR_RO(version);
3631
3632 static ssize_t address_show(struct device *dev,
3633 struct device_attribute *attr, char *buf)
3634 {
3635 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637 }
3638 static DEVICE_ATTR_RO(address);
3639
3640 static ssize_t cap_show(struct device *dev,
3641 struct device_attribute *attr, char *buf)
3642 {
3643 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644 return sysfs_emit(buf, "%llx\n", iommu->cap);
3645 }
3646 static DEVICE_ATTR_RO(cap);
3647
3648 static ssize_t ecap_show(struct device *dev,
3649 struct device_attribute *attr, char *buf)
3650 {
3651 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653 }
3654 static DEVICE_ATTR_RO(ecap);
3655
3656 static ssize_t domains_supported_show(struct device *dev,
3657 struct device_attribute *attr, char *buf)
3658 {
3659 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661 }
3662 static DEVICE_ATTR_RO(domains_supported);
3663
3664 static ssize_t domains_used_show(struct device *dev,
3665 struct device_attribute *attr, char *buf)
3666 {
3667 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668 return sysfs_emit(buf, "%d\n",
3669 bitmap_weight(iommu->domain_ids,
3670 cap_ndoms(iommu->cap)));
3671 }
3672 static DEVICE_ATTR_RO(domains_used);
3673
3674 static struct attribute *intel_iommu_attrs[] = {
3675 &dev_attr_version.attr,
3676 &dev_attr_address.attr,
3677 &dev_attr_cap.attr,
3678 &dev_attr_ecap.attr,
3679 &dev_attr_domains_supported.attr,
3680 &dev_attr_domains_used.attr,
3681 NULL,
3682 };
3683
3684 static struct attribute_group intel_iommu_group = {
3685 .name = "intel-iommu",
3686 .attrs = intel_iommu_attrs,
3687 };
3688
3689 const struct attribute_group *intel_iommu_groups[] = {
3690 &intel_iommu_group,
3691 NULL,
3692 };
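
/*
 * The attributes above are registered through iommu_device_sysfs_add()
 * in intel_iommu_init(), so each remapping unit exposes them under its
 * sysfs node. A hedged example of what that is expected to look like
 * (the unit name and register values below are illustrative only):
 *
 *	# cat /sys/class/iommu/dmar0/intel-iommu/version
 *	1:0
 *	# cat /sys/class/iommu/dmar0/intel-iommu/domains_supported
 *	65536
 */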
3693
3694 static inline bool has_external_pci(void)
3695 {
3696 struct pci_dev *pdev = NULL;
3697
3698 for_each_pci_dev(pdev)
3699 if (pdev->external_facing) {
3700 pci_dev_put(pdev);
3701 return true;
3702 }
3703
3704 return false;
3705 }
3706
3707 static int __init platform_optin_force_iommu(void)
3708 {
3709 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3710 return 0;
3711
3712 if (no_iommu || dmar_disabled)
3713 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3714
3715 /*
3716 * If Intel-IOMMU is disabled by default, we will apply identity
3717 * map for all devices except those marked as being untrusted.
3718 */
3719 if (dmar_disabled)
3720 iommu_set_default_passthrough(false);
3721
3722 dmar_disabled = 0;
3723 no_iommu = 0;
3724
3725 return 1;
3726 }
3727
3728 static int __init probe_acpi_namespace_devices(void)
3729 {
3730 struct dmar_drhd_unit *drhd;
3731 /* To avoid a -Wunused-but-set-variable warning. */
3732 struct intel_iommu *iommu __maybe_unused;
3733 struct device *dev;
3734 int i, ret = 0;
3735
3736 for_each_active_iommu(iommu, drhd) {
3737 for_each_active_dev_scope(drhd->devices,
3738 drhd->devices_cnt, i, dev) {
3739 struct acpi_device_physical_node *pn;
3740 struct acpi_device *adev;
3741
3742 if (dev->bus != &acpi_bus_type)
3743 continue;
3744
3745 adev = to_acpi_device(dev);
3746 mutex_lock(&adev->physical_node_lock);
3747 list_for_each_entry(pn,
3748 &adev->physical_node_list, node) {
3749 ret = iommu_probe_device(pn->dev);
3750 if (ret)
3751 break;
3752 }
3753 mutex_unlock(&adev->physical_node_lock);
3754
3755 if (ret)
3756 return ret;
3757 }
3758 }
3759
3760 return 0;
3761 }
3762
3763 static __init int tboot_force_iommu(void)
3764 {
3765 if (!tboot_enabled())
3766 return 0;
3767
3768 if (no_iommu || dmar_disabled)
3769 pr_warn("Forcing Intel-IOMMU to enabled\n");
3770
3771 dmar_disabled = 0;
3772 no_iommu = 0;
3773
3774 return 1;
3775 }
3776
3777 int __init intel_iommu_init(void)
3778 {
3779 int ret = -ENODEV;
3780 struct dmar_drhd_unit *drhd;
3781 struct intel_iommu *iommu;
3782
3783 /*
3784 * Intel IOMMU is required for a TXT/tboot launch or platform
3785 * opt in, so enforce that.
3786 */
3787 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788 platform_optin_force_iommu();
3789
3790 down_write(&dmar_global_lock);
3791 if (dmar_table_init()) {
3792 if (force_on)
3793 panic("tboot: Failed to initialize DMAR table\n");
3794 goto out_free_dmar;
3795 }
3796
3797 if (dmar_dev_scope_init() < 0) {
3798 if (force_on)
3799 panic("tboot: Failed to initialize DMAR device scope\n");
3800 goto out_free_dmar;
3801 }
3802
3803 up_write(&dmar_global_lock);
3804
3805 /*
3806 * The bus notifier takes the dmar_global_lock, so lockdep will
3807 * complain later when we register it under the lock.
3808 */
3809 dmar_register_bus_notifier();
3810
3811 down_write(&dmar_global_lock);
3812
3813 if (!no_iommu)
3814 intel_iommu_debugfs_init();
3815
3816 if (no_iommu || dmar_disabled) {
3817 /*
3818 * We exit the function here to ensure IOMMU's remapping and
3819 * mempool aren't set up, which means that the IOMMU's PMRs
3820 * won't be disabled via the call to init_dmars(). So disable
3821 * them explicitly here. The PMRs were set up by tboot prior to
3822 * calling SENTER, but the kernel is expected to reset/tear
3823 * down the PMRs.
3824 */
3825 if (intel_iommu_tboot_noforce) {
3826 for_each_iommu(iommu, drhd)
3827 iommu_disable_protect_mem_regions(iommu);
3828 }
3829
3830 /*
3831 * Make sure the IOMMUs are switched off, even when we
3832 * boot into a kexec kernel and the previous kernel left
3833 * them enabled
3834 */
3835 intel_disable_iommus();
3836 goto out_free_dmar;
3837 }
3838
3839 if (list_empty(&dmar_rmrr_units))
3840 pr_info("No RMRR found\n");
3841
3842 if (list_empty(&dmar_atsr_units))
3843 pr_info("No ATSR found\n");
3844
3845 if (list_empty(&dmar_satc_units))
3846 pr_info("No SATC found\n");
3847
3848 init_no_remapping_devices();
3849
3850 ret = init_dmars();
3851 if (ret) {
3852 if (force_on)
3853 panic("tboot: Failed to initialize DMARs\n");
3854 pr_err("Initialization failed\n");
3855 goto out_free_dmar;
3856 }
3857 up_write(&dmar_global_lock);
3858
3859 init_iommu_pm_ops();
3860
3861 down_read(&dmar_global_lock);
3862 for_each_active_iommu(iommu, drhd) {
3863 /*
3864 * The flush queue implementation does not perform
3865 * page-selective invalidations that are required for efficient
3866 * TLB flushes in virtual environments. The benefit of batching
3867 * is likely to be much lower than the overhead of synchronizing
3868 * the virtual and physical IOMMU page-tables.
3869 */
3870 if (cap_caching_mode(iommu->cap) &&
3871 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873 iommu_set_dma_strict();
3874 }
3875 iommu_device_sysfs_add(&iommu->iommu, NULL,
3876 intel_iommu_groups,
3877 "%s", iommu->name);
3878 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3879
3880 iommu_pmu_register(iommu);
3881 }
3882 up_read(&dmar_global_lock);
3883
3884 if (si_domain && !hw_pass_through)
3885 register_memory_notifier(&intel_iommu_memory_nb);
3886
3887 down_read(&dmar_global_lock);
3888 if (probe_acpi_namespace_devices())
3889 pr_warn("ACPI name space devices didn't probe correctly\n");
3890
3891 /* Finally, we enable the DMA remapping hardware. */
3892 for_each_iommu(iommu, drhd) {
3893 if (!drhd->ignored && !translation_pre_enabled(iommu))
3894 iommu_enable_translation(iommu);
3895
3896 iommu_disable_protect_mem_regions(iommu);
3897 }
3898 up_read(&dmar_global_lock);
3899
3900 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3901
3902 intel_iommu_enabled = 1;
3903
3904 return 0;
3905
3906 out_free_dmar:
3907 intel_iommu_free_dmars();
3908 up_write(&dmar_global_lock);
3909 return ret;
3910 }
3911
3912 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3913 {
3914 struct device_domain_info *info = opaque;
3915
3916 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3917 return 0;
3918 }
3919
3920 /*
3921 * NB - intel-iommu lacks any sort of reference counting for the users of
3922 * dependent devices. If multiple endpoints have intersecting dependent
3923 * devices, unbinding the driver from any one of them will possibly leave
3924 * the others unable to operate.
3925 */
3926 static void domain_context_clear(struct device_domain_info *info)
3927 {
3928 if (!dev_is_pci(info->dev)) {
3929 domain_context_clear_one(info, info->bus, info->devfn);
3930 return;
3931 }
3932
3933 pci_for_each_dma_alias(to_pci_dev(info->dev),
3934 &domain_context_clear_one_cb, info);
3935 }
3936
3937 static void dmar_remove_one_dev_info(struct device *dev)
3938 {
3939 struct device_domain_info *info = dev_iommu_priv_get(dev);
3940 struct dmar_domain *domain = info->domain;
3941 struct intel_iommu *iommu = info->iommu;
3942 unsigned long flags;
3943
3944 if (!dev_is_real_dma_subdevice(info->dev)) {
3945 if (dev_is_pci(info->dev) && sm_supported(iommu))
3946 intel_pasid_tear_down_entry(iommu, info->dev,
3947 IOMMU_NO_PASID, false);
3948
3949 iommu_disable_pci_caps(info);
3950 domain_context_clear(info);
3951 }
3952
3953 spin_lock_irqsave(&domain->lock, flags);
3954 list_del(&info->link);
3955 spin_unlock_irqrestore(&domain->lock, flags);
3956
3957 domain_detach_iommu(domain, iommu);
3958 info->domain = NULL;
3959 }
3960
3961 /*
3962 * Clear the page table pointer in context or pasid table entries so that
3963 * all DMA requests without PASID from the device are blocked. If the page
3964 * table has been set, clean up the data structures.
3965 */
3966 static void device_block_translation(struct device *dev)
3967 {
3968 struct device_domain_info *info = dev_iommu_priv_get(dev);
3969 struct intel_iommu *iommu = info->iommu;
3970 unsigned long flags;
3971
3972 iommu_disable_pci_caps(info);
3973 if (!dev_is_real_dma_subdevice(dev)) {
3974 if (sm_supported(iommu))
3975 intel_pasid_tear_down_entry(iommu, dev,
3976 IOMMU_NO_PASID, false);
3977 else
3978 domain_context_clear(info);
3979 }
3980
3981 if (!info->domain)
3982 return;
3983
3984 spin_lock_irqsave(&info->domain->lock, flags);
3985 list_del(&info->link);
3986 spin_unlock_irqrestore(&info->domain->lock, flags);
3987
3988 domain_detach_iommu(info->domain, iommu);
3989 info->domain = NULL;
3990 }
3991
3992 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3993 {
3994 int adjust_width;
3995
3996 /* calculate AGAW */
3997 domain->gaw = guest_width;
3998 adjust_width = guestwidth_to_adjustwidth(guest_width);
3999 domain->agaw = width_to_agaw(adjust_width);
4000
4001 domain->iommu_coherency = false;
4002 domain->iommu_superpage = 0;
4003 domain->max_addr = 0;
4004
4005 /* always allocate the top pgd */
4006 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4007 if (!domain->pgd)
4008 return -ENOMEM;
4009 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4010 return 0;
4011 }
4012
4013 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4014 struct device *dev)
4015 {
4016 device_block_translation(dev);
4017 return 0;
4018 }
4019
4020 static struct iommu_domain blocking_domain = {
4021 .ops = &(const struct iommu_domain_ops) {
4022 .attach_dev = blocking_domain_attach_dev,
4023 .free = intel_iommu_domain_free
4024 }
4025 };
4026
4027 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4028 {
4029 struct dmar_domain *dmar_domain;
4030 struct iommu_domain *domain;
4031
4032 switch (type) {
4033 case IOMMU_DOMAIN_BLOCKED:
4034 return &blocking_domain;
4035 case IOMMU_DOMAIN_DMA:
4036 case IOMMU_DOMAIN_UNMANAGED:
4037 dmar_domain = alloc_domain(type);
4038 if (!dmar_domain) {
4039 pr_err("Can't allocate dmar_domain\n");
4040 return NULL;
4041 }
4042 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4043 pr_err("Domain initialization failed\n");
4044 domain_exit(dmar_domain);
4045 return NULL;
4046 }
4047
4048 domain = &dmar_domain->domain;
4049 domain->geometry.aperture_start = 0;
4050 domain->geometry.aperture_end =
4051 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4052 domain->geometry.force_aperture = true;
4053
4054 return domain;
4055 case IOMMU_DOMAIN_IDENTITY:
4056 return &si_domain->domain;
4057 case IOMMU_DOMAIN_SVA:
4058 return intel_svm_domain_alloc();
4059 default:
4060 return NULL;
4061 }
4062
4063 return NULL;
4064 }
4065
4066 static void intel_iommu_domain_free(struct iommu_domain *domain)
4067 {
4068 if (domain != &si_domain->domain && domain != &blocking_domain)
4069 domain_exit(to_dmar_domain(domain));
4070 }
4071
4072 static int prepare_domain_attach_device(struct iommu_domain *domain,
4073 struct device *dev)
4074 {
4075 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4076 struct intel_iommu *iommu;
4077 int addr_width;
4078
4079 iommu = device_to_iommu(dev, NULL, NULL);
4080 if (!iommu)
4081 return -ENODEV;
4082
4083 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4084 return -EINVAL;
4085
4086 /* check if this iommu agaw is sufficient for max mapped address */
4087 addr_width = agaw_to_width(iommu->agaw);
4088 if (addr_width > cap_mgaw(iommu->cap))
4089 addr_width = cap_mgaw(iommu->cap);
4090
4091 if (dmar_domain->max_addr > (1LL << addr_width))
4092 return -EINVAL;
4093 dmar_domain->gaw = addr_width;
4094
4095 /*
4096 * Knock out extra levels of page tables if necessary
4097 */
4098 while (iommu->agaw < dmar_domain->agaw) {
4099 struct dma_pte *pte;
4100
4101 pte = dmar_domain->pgd;
4102 if (dma_pte_present(pte)) {
4103 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4104 free_pgtable_page(pte);
4105 }
4106 dmar_domain->agaw--;
4107 }
4108
4109 return 0;
4110 }
4111
4112 static int intel_iommu_attach_device(struct iommu_domain *domain,
4113 struct device *dev)
4114 {
4115 struct device_domain_info *info = dev_iommu_priv_get(dev);
4116 int ret;
4117
4118 if (info->domain)
4119 device_block_translation(dev);
4120
4121 ret = prepare_domain_attach_device(domain, dev);
4122 if (ret)
4123 return ret;
4124
4125 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4126 }
4127
4128 static int intel_iommu_map(struct iommu_domain *domain,
4129 unsigned long iova, phys_addr_t hpa,
4130 size_t size, int iommu_prot, gfp_t gfp)
4131 {
4132 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4133 u64 max_addr;
4134 int prot = 0;
4135
4136 if (iommu_prot & IOMMU_READ)
4137 prot |= DMA_PTE_READ;
4138 if (iommu_prot & IOMMU_WRITE)
4139 prot |= DMA_PTE_WRITE;
4140 if (dmar_domain->set_pte_snp)
4141 prot |= DMA_PTE_SNP;
4142
4143 max_addr = iova + size;
4144 if (dmar_domain->max_addr < max_addr) {
4145 u64 end;
4146
4147 /* check if minimum agaw is sufficient for mapped address */
4148 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4149 if (end < max_addr) {
4150 pr_err("%s: iommu width (%d) is not "
4151 "sufficient for the mapped address (%llx)\n",
4152 __func__, dmar_domain->gaw, max_addr);
4153 return -EFAULT;
4154 }
4155 dmar_domain->max_addr = max_addr;
4156 }
4157 /* Round up size to next multiple of PAGE_SIZE, if it and
4158 the low bits of hpa would take us onto the next page */
4159 size = aligned_nrpages(hpa, size);
4160 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4161 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4162 }
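
/*
 * Illustrative sketch only, not used by the driver: the rounding rule
 * described in the comment above, spelled out. It assumes 4KiB VT-d
 * pages and mirrors what aligned_nrpages() is expected to compute: the
 * page count grows by one whenever the low bits of hpa plus the size
 * spill over onto another page.
 */
static inline unsigned long example_vtd_nrpages(phys_addr_t hpa, size_t size)
{
	/* e.g. hpa = 0x1ff0, size = 0x20 spans two 4KiB pages */
	return ALIGN((hpa & ~VTD_PAGE_MASK) + size, VTD_PAGE_SIZE) >>
		VTD_PAGE_SHIFT;
}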
4163
4164 static int intel_iommu_map_pages(struct iommu_domain *domain,
4165 unsigned long iova, phys_addr_t paddr,
4166 size_t pgsize, size_t pgcount,
4167 int prot, gfp_t gfp, size_t *mapped)
4168 {
4169 unsigned long pgshift = __ffs(pgsize);
4170 size_t size = pgcount << pgshift;
4171 int ret;
4172
4173 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4174 return -EINVAL;
4175
4176 if (!IS_ALIGNED(iova | paddr, pgsize))
4177 return -EINVAL;
4178
4179 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4180 if (!ret && mapped)
4181 *mapped = size;
4182
4183 return ret;
4184 }
4185
4186 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4187 unsigned long iova, size_t size,
4188 struct iommu_iotlb_gather *gather)
4189 {
4190 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4191 unsigned long start_pfn, last_pfn;
4192 int level = 0;
4193
4194 /* Cope with horrid API which requires us to unmap more than the
4195 size argument if it happens to be a large-page mapping. */
4196 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4197 &level, GFP_ATOMIC)))
4198 return 0;
4199
4200 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4201 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4202
4203 start_pfn = iova >> VTD_PAGE_SHIFT;
4204 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4205
4206 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4207
4208 if (dmar_domain->max_addr == iova + size)
4209 dmar_domain->max_addr = iova;
4210
4211 /*
4212 * We do not use page-selective IOTLB invalidation in flush queue,
4213 * so there is no need to track page and sync iotlb.
4214 */
4215 if (!iommu_iotlb_gather_queued(gather))
4216 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4217
4218 return size;
4219 }
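
/*
 * Illustrative sketch only: the "unmap at least the whole large page"
 * behaviour implemented above, assuming the usual VT-d geometry of
 * 9 bits per level with 4KiB base pages. A request that lands inside a
 * level-2 (2MiB) mapping is widened to 2MiB before the page tables are
 * cleared, and the widened size is what gets returned to the caller.
 */
static inline size_t example_widened_unmap_size(size_t requested, int level)
{
	size_t lvl_size = (size_t)VTD_PAGE_SIZE << level_to_offset_bits(level);

	/* e.g. requested = SZ_4K at level = 2 returns SZ_2M */
	return max(requested, lvl_size);
}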
4220
4221 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4222 unsigned long iova,
4223 size_t pgsize, size_t pgcount,
4224 struct iommu_iotlb_gather *gather)
4225 {
4226 unsigned long pgshift = __ffs(pgsize);
4227 size_t size = pgcount << pgshift;
4228
4229 return intel_iommu_unmap(domain, iova, size, gather);
4230 }
4231
4232 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4233 struct iommu_iotlb_gather *gather)
4234 {
4235 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4236 unsigned long iova_pfn = IOVA_PFN(gather->start);
4237 size_t size = gather->end - gather->start;
4238 struct iommu_domain_info *info;
4239 unsigned long start_pfn;
4240 unsigned long nrpages;
4241 unsigned long i;
4242
4243 nrpages = aligned_nrpages(gather->start, size);
4244 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4245
4246 xa_for_each(&dmar_domain->iommu_array, i, info)
4247 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4248 start_pfn, nrpages,
4249 list_empty(&gather->freelist), 0);
4250
4251 put_pages_list(&gather->freelist);
4252 }
4253
4254 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4255 dma_addr_t iova)
4256 {
4257 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4258 struct dma_pte *pte;
4259 int level = 0;
4260 u64 phys = 0;
4261
4262 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4263 GFP_ATOMIC);
4264 if (pte && dma_pte_present(pte))
4265 phys = dma_pte_addr(pte) +
4266 (iova & (BIT_MASK(level_to_offset_bits(level) +
4267 VTD_PAGE_SHIFT) - 1));
4268
4269 return phys;
4270 }
4271
4272 static bool domain_support_force_snooping(struct dmar_domain *domain)
4273 {
4274 struct device_domain_info *info;
4275 bool support = true;
4276
4277 assert_spin_locked(&domain->lock);
4278 list_for_each_entry(info, &domain->devices, link) {
4279 if (!ecap_sc_support(info->iommu->ecap)) {
4280 support = false;
4281 break;
4282 }
4283 }
4284
4285 return support;
4286 }
4287
4288 static void domain_set_force_snooping(struct dmar_domain *domain)
4289 {
4290 struct device_domain_info *info;
4291
4292 assert_spin_locked(&domain->lock);
4293 /*
4294 * Second level page table supports per-PTE snoop control. The
4295 * iommu_map() interface will handle this by setting SNP bit.
4296 */
4297 if (!domain->use_first_level) {
4298 domain->set_pte_snp = true;
4299 return;
4300 }
4301
4302 list_for_each_entry(info, &domain->devices, link)
4303 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4304 IOMMU_NO_PASID);
4305 }
4306
4307 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4308 {
4309 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4310 unsigned long flags;
4311
4312 if (dmar_domain->force_snooping)
4313 return true;
4314
4315 spin_lock_irqsave(&dmar_domain->lock, flags);
4316 if (!domain_support_force_snooping(dmar_domain) ||
4317 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4318 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4319 return false;
4320 }
4321
4322 domain_set_force_snooping(dmar_domain);
4323 dmar_domain->force_snooping = true;
4324 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4325
4326 return true;
4327 }
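
/*
 * Illustrative sketch only, not part of this driver: an external user
 * (a VFIO-like driver, for instance) is expected to request enforced
 * coherency through the core iommu_enforce_cache_coherency() helper,
 * which lands in the callback above. On a second-level domain the
 * request is refused once mappings already exist, so it should be made
 * before any iommu_map() calls.
 */
static int __maybe_unused example_make_domain_coherent(struct iommu_domain *dom)
{
	if (!iommu_enforce_cache_coherency(dom))
		return -EINVAL;	/* no snoop control, or mappings already exist */

	return 0;
}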
4328
4329 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4330 {
4331 struct device_domain_info *info = dev_iommu_priv_get(dev);
4332
4333 switch (cap) {
4334 case IOMMU_CAP_CACHE_COHERENCY:
4335 case IOMMU_CAP_DEFERRED_FLUSH:
4336 return true;
4337 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4338 return dmar_platform_optin();
4339 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4340 return ecap_sc_support(info->iommu->ecap);
4341 default:
4342 return false;
4343 }
4344 }
4345
4346 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4347 {
4348 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4349 struct device_domain_info *info;
4350 struct intel_iommu *iommu;
4351 u8 bus, devfn;
4352 int ret;
4353
4354 iommu = device_to_iommu(dev, &bus, &devfn);
4355 if (!iommu || !iommu->iommu.ops)
4356 return ERR_PTR(-ENODEV);
4357
4358 info = kzalloc(sizeof(*info), GFP_KERNEL);
4359 if (!info)
4360 return ERR_PTR(-ENOMEM);
4361
4362 if (dev_is_real_dma_subdevice(dev)) {
4363 info->bus = pdev->bus->number;
4364 info->devfn = pdev->devfn;
4365 info->segment = pci_domain_nr(pdev->bus);
4366 } else {
4367 info->bus = bus;
4368 info->devfn = devfn;
4369 info->segment = iommu->segment;
4370 }
4371
4372 info->dev = dev;
4373 info->iommu = iommu;
4374 if (dev_is_pci(dev)) {
4375 if (ecap_dev_iotlb_support(iommu->ecap) &&
4376 pci_ats_supported(pdev) &&
4377 dmar_ats_supported(pdev, iommu)) {
4378 info->ats_supported = 1;
4379 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4380
4381 /*
4382 * For IOMMU that supports device IOTLB throttling
4383 * (DIT), we assign PFSID to the invalidation desc
4384 * of a VF such that IOMMU HW can gauge queue depth
4385 * at PF level. If DIT is not set, PFSID will be
4386 * treated as reserved and should be set to 0.
4387 */
4388 if (ecap_dit(iommu->ecap))
4389 info->pfsid = pci_dev_id(pci_physfn(pdev));
4390 info->ats_qdep = pci_ats_queue_depth(pdev);
4391 }
4392 if (sm_supported(iommu)) {
4393 if (pasid_supported(iommu)) {
4394 int features = pci_pasid_features(pdev);
4395
4396 if (features >= 0)
4397 info->pasid_supported = features | 1;
4398 }
4399
4400 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4401 pci_pri_supported(pdev))
4402 info->pri_supported = 1;
4403 }
4404 }
4405
4406 dev_iommu_priv_set(dev, info);
4407
4408 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4409 ret = intel_pasid_alloc_table(dev);
4410 if (ret) {
4411 dev_err(dev, "PASID table allocation failed\n");
4412 dev_iommu_priv_set(dev, NULL);
4413 kfree(info);
4414 return ERR_PTR(ret);
4415 }
4416 }
4417
4418 return &iommu->iommu;
4419 }
4420
4421 static void intel_iommu_release_device(struct device *dev)
4422 {
4423 struct device_domain_info *info = dev_iommu_priv_get(dev);
4424
4425 dmar_remove_one_dev_info(dev);
4426 intel_pasid_free_table(dev);
4427 dev_iommu_priv_set(dev, NULL);
4428 kfree(info);
4429 set_dma_ops(dev, NULL);
4430 }
4431
4432 static void intel_iommu_probe_finalize(struct device *dev)
4433 {
4434 set_dma_ops(dev, NULL);
4435 iommu_setup_dma_ops(dev, 0, U64_MAX);
4436 }
4437
4438 static void intel_iommu_get_resv_regions(struct device *device,
4439 struct list_head *head)
4440 {
4441 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4442 struct iommu_resv_region *reg;
4443 struct dmar_rmrr_unit *rmrr;
4444 struct device *i_dev;
4445 int i;
4446
4447 rcu_read_lock();
4448 for_each_rmrr_units(rmrr) {
4449 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4450 i, i_dev) {
4451 struct iommu_resv_region *resv;
4452 enum iommu_resv_type type;
4453 size_t length;
4454
4455 if (i_dev != device &&
4456 !is_downstream_to_pci_bridge(device, i_dev))
4457 continue;
4458
4459 length = rmrr->end_address - rmrr->base_address + 1;
4460
4461 type = device_rmrr_is_relaxable(device) ?
4462 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4463
4464 resv = iommu_alloc_resv_region(rmrr->base_address,
4465 length, prot, type,
4466 GFP_ATOMIC);
4467 if (!resv)
4468 break;
4469
4470 list_add_tail(&resv->list, head);
4471 }
4472 }
4473 rcu_read_unlock();
4474
4475 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4476 if (dev_is_pci(device)) {
4477 struct pci_dev *pdev = to_pci_dev(device);
4478
4479 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4480 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4481 IOMMU_RESV_DIRECT_RELAXABLE,
4482 GFP_KERNEL);
4483 if (reg)
4484 list_add_tail(&reg->list, head);
4485 }
4486 }
4487 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4488
4489 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4490 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4491 0, IOMMU_RESV_MSI, GFP_KERNEL);
4492 if (!reg)
4493 return;
4494 list_add_tail(&reg->list, head);
4495 }
4496
4497 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4498 {
4499 if (dev_is_pci(dev))
4500 return pci_device_group(dev);
4501 return generic_device_group(dev);
4502 }
4503
4504 static int intel_iommu_enable_sva(struct device *dev)
4505 {
4506 struct device_domain_info *info = dev_iommu_priv_get(dev);
4507 struct intel_iommu *iommu;
4508
4509 if (!info || dmar_disabled)
4510 return -EINVAL;
4511
4512 iommu = info->iommu;
4513 if (!iommu)
4514 return -EINVAL;
4515
4516 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4517 return -ENODEV;
4518
4519 if (!info->pasid_enabled || !info->ats_enabled)
4520 return -EINVAL;
4521
4522 /*
4523 * Devices having device-specific I/O fault handling should not
4524 * support PCI/PRI. The IOMMU side has no means to check the
4525 * capability of device-specific IOPF. Therefore, the IOMMU can only
4526 * assume that, if the device driver enables SVA on a non-PRI
4527 * device, it will handle IOPF in its own way.
4528 */
4529 if (!info->pri_supported)
4530 return 0;
4531
4532 /* Devices supporting PRI should have it enabled. */
4533 if (!info->pri_enabled)
4534 return -EINVAL;
4535
4536 return 0;
4537 }
4538
4539 static int intel_iommu_enable_iopf(struct device *dev)
4540 {
4541 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4542 struct device_domain_info *info = dev_iommu_priv_get(dev);
4543 struct intel_iommu *iommu;
4544 int ret;
4545
4546 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4547 return -ENODEV;
4548
4549 if (info->pri_enabled)
4550 return -EBUSY;
4551
4552 iommu = info->iommu;
4553 if (!iommu)
4554 return -EINVAL;
4555
4556 /* PASID is required in PRG Response Message. */
4557 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4558 return -EINVAL;
4559
4560 ret = pci_reset_pri(pdev);
4561 if (ret)
4562 return ret;
4563
4564 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4565 if (ret)
4566 return ret;
4567
4568 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4569 if (ret)
4570 goto iopf_remove_device;
4571
4572 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4573 if (ret)
4574 goto iopf_unregister_handler;
4575 info->pri_enabled = 1;
4576
4577 return 0;
4578
4579 iopf_unregister_handler:
4580 iommu_unregister_device_fault_handler(dev);
4581 iopf_remove_device:
4582 iopf_queue_remove_device(iommu->iopf_queue, dev);
4583
4584 return ret;
4585 }
4586
4587 static int intel_iommu_disable_iopf(struct device *dev)
4588 {
4589 struct device_domain_info *info = dev_iommu_priv_get(dev);
4590 struct intel_iommu *iommu = info->iommu;
4591
4592 if (!info->pri_enabled)
4593 return -EINVAL;
4594
4595 /*
4596 * The PCIe spec states that by clearing the PRI enable bit, the Page
4597 * Request Interface will not issue new page requests, but requests
4598 * that have already been transmitted or are queued for transmission
4599 * remain outstanding. This is supposed to be called after
4600 * the device driver has stopped DMA, all PASIDs have been
4601 * unbound and the outstanding PRQs have been drained.
4602 */
4603 pci_disable_pri(to_pci_dev(dev));
4604 info->pri_enabled = 0;
4605
4606 /*
4607 * With PRI disabled and outstanding PRQs drained, unregistering
4608 * fault handler and removing device from iopf queue should never
4609 * fail.
4610 */
4611 WARN_ON(iommu_unregister_device_fault_handler(dev));
4612 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4613
4614 return 0;
4615 }
4616
4617 static int
4618 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4619 {
4620 switch (feat) {
4621 case IOMMU_DEV_FEAT_IOPF:
4622 return intel_iommu_enable_iopf(dev);
4623
4624 case IOMMU_DEV_FEAT_SVA:
4625 return intel_iommu_enable_sva(dev);
4626
4627 default:
4628 return -ENODEV;
4629 }
4630 }
4631
4632 static int
4633 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4634 {
4635 switch (feat) {
4636 case IOMMU_DEV_FEAT_IOPF:
4637 return intel_iommu_disable_iopf(dev);
4638
4639 case IOMMU_DEV_FEAT_SVA:
4640 return 0;
4641
4642 default:
4643 return -ENODEV;
4644 }
4645 }
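
/*
 * Illustrative sketch only, not part of this driver: the calling
 * convention a device driver is expected to follow to reach the
 * feature handlers above. IOMMU_DEV_FEAT_IOPF is typically enabled
 * before IOMMU_DEV_FEAT_SVA, and the actual bind goes through the
 * core SVA API. Error handling is reduced to the minimum.
 */
static __maybe_unused struct iommu_sva *
example_enable_sva_and_bind(struct device *dev, struct mm_struct *mm)
{
	struct iommu_sva *handle;

	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF))
		return ERR_PTR(-ENODEV);

	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA)) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
		return ERR_PTR(-ENODEV);
	}

	handle = iommu_sva_bind_device(dev, mm);
	if (IS_ERR(handle)) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
	}

	return handle;
}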
4646
4647 static bool intel_iommu_is_attach_deferred(struct device *dev)
4648 {
4649 struct device_domain_info *info = dev_iommu_priv_get(dev);
4650
4651 return translation_pre_enabled(info->iommu) && !info->domain;
4652 }
4653
4654 /*
4655 * Check that the device does not live on an external facing PCI port that is
4656 * marked as untrusted. Such devices should not be able to apply quirks and
4657 * thus not be able to bypass the IOMMU restrictions.
4658 */
4659 static bool risky_device(struct pci_dev *pdev)
4660 {
4661 if (pdev->untrusted) {
4662 pci_info(pdev,
4663 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4664 pdev->vendor, pdev->device);
4665 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4666 return true;
4667 }
4668 return false;
4669 }
4670
4671 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4672 unsigned long iova, size_t size)
4673 {
4674 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4675 unsigned long pages = aligned_nrpages(iova, size);
4676 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4677 struct iommu_domain_info *info;
4678 unsigned long i;
4679
4680 xa_for_each(&dmar_domain->iommu_array, i, info)
4681 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4682 }
4683
4684 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4685 {
4686 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4687 struct dev_pasid_info *curr, *dev_pasid = NULL;
4688 struct dmar_domain *dmar_domain;
4689 struct iommu_domain *domain;
4690 unsigned long flags;
4691
4692 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4693 if (WARN_ON_ONCE(!domain))
4694 goto out_tear_down;
4695
4696 /*
4697 * The SVA implementation needs to handle its own stuffs like the mm
4698 * notification. Before consolidating that code into iommu core, let
4699 * the intel sva code handle it.
4700 */
4701 if (domain->type == IOMMU_DOMAIN_SVA) {
4702 intel_svm_remove_dev_pasid(dev, pasid);
4703 goto out_tear_down;
4704 }
4705
4706 dmar_domain = to_dmar_domain(domain);
4707 spin_lock_irqsave(&dmar_domain->lock, flags);
4708 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4709 if (curr->dev == dev && curr->pasid == pasid) {
4710 list_del(&curr->link_domain);
4711 dev_pasid = curr;
4712 break;
4713 }
4714 }
4715 WARN_ON_ONCE(!dev_pasid);
4716 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4717
4718 domain_detach_iommu(dmar_domain, iommu);
4719 kfree(dev_pasid);
4720 out_tear_down:
4721 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4722 intel_drain_pasid_prq(dev, pasid);
4723 }
4724
4725 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4726 struct device *dev, ioasid_t pasid)
4727 {
4728 struct device_domain_info *info = dev_iommu_priv_get(dev);
4729 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4730 struct intel_iommu *iommu = info->iommu;
4731 struct dev_pasid_info *dev_pasid;
4732 unsigned long flags;
4733 int ret;
4734
4735 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4736 return -EOPNOTSUPP;
4737
4738 if (context_copied(iommu, info->bus, info->devfn))
4739 return -EBUSY;
4740
4741 ret = prepare_domain_attach_device(domain, dev);
4742 if (ret)
4743 return ret;
4744
4745 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4746 if (!dev_pasid)
4747 return -ENOMEM;
4748
4749 ret = domain_attach_iommu(dmar_domain, iommu);
4750 if (ret)
4751 goto out_free;
4752
4753 if (domain_type_is_si(dmar_domain))
4754 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4755 dev, pasid);
4756 else if (dmar_domain->use_first_level)
4757 ret = domain_setup_first_level(iommu, dmar_domain,
4758 dev, pasid);
4759 else
4760 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4761 dev, pasid);
4762 if (ret)
4763 goto out_detach_iommu;
4764
4765 dev_pasid->dev = dev;
4766 dev_pasid->pasid = pasid;
4767 spin_lock_irqsave(&dmar_domain->lock, flags);
4768 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4769 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4770
4771 return 0;
4772 out_detach_iommu:
4773 domain_detach_iommu(dmar_domain, iommu);
4774 out_free:
4775 kfree(dev_pasid);
4776 return ret;
4777 }
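
/*
 * Illustrative sketch only, not part of this driver: how a kernel
 * driver reaches the set_dev_pasid callback above through the core
 * API, attaching an unmanaged domain to a single PASID of a device
 * and detaching it again. The PASID value is assumed to have been
 * allocated by the caller beforehand.
 */
static int __maybe_unused
example_attach_domain_to_pasid(struct iommu_domain *dom, struct device *dev,
			       ioasid_t pasid)
{
	int ret = iommu_attach_device_pasid(dom, dev, pasid);

	if (ret)
		return ret;

	/* ... issue PASID-tagged DMA against the domain's mappings ... */

	iommu_detach_device_pasid(dom, dev, pasid);
	return 0;
}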
4778
4779 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4780 {
4781 struct device_domain_info *info = dev_iommu_priv_get(dev);
4782 struct intel_iommu *iommu = info->iommu;
4783 struct iommu_hw_info_vtd *vtd;
4784
4785 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4786 if (!vtd)
4787 return ERR_PTR(-ENOMEM);
4788
4789 vtd->cap_reg = iommu->cap;
4790 vtd->ecap_reg = iommu->ecap;
4791 *length = sizeof(*vtd);
4792 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4793 return vtd;
4794 }
4795
4796 const struct iommu_ops intel_iommu_ops = {
4797 .capable = intel_iommu_capable,
4798 .hw_info = intel_iommu_hw_info,
4799 .domain_alloc = intel_iommu_domain_alloc,
4800 .probe_device = intel_iommu_probe_device,
4801 .probe_finalize = intel_iommu_probe_finalize,
4802 .release_device = intel_iommu_release_device,
4803 .get_resv_regions = intel_iommu_get_resv_regions,
4804 .device_group = intel_iommu_device_group,
4805 .dev_enable_feat = intel_iommu_dev_enable_feat,
4806 .dev_disable_feat = intel_iommu_dev_disable_feat,
4807 .is_attach_deferred = intel_iommu_is_attach_deferred,
4808 .def_domain_type = device_def_domain_type,
4809 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4810 .pgsize_bitmap = SZ_4K,
4811 #ifdef CONFIG_INTEL_IOMMU_SVM
4812 .page_response = intel_svm_page_response,
4813 #endif
4814 .default_domain_ops = &(const struct iommu_domain_ops) {
4815 .attach_dev = intel_iommu_attach_device,
4816 .set_dev_pasid = intel_iommu_set_dev_pasid,
4817 .map_pages = intel_iommu_map_pages,
4818 .unmap_pages = intel_iommu_unmap_pages,
4819 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4820 .flush_iotlb_all = intel_flush_iotlb_all,
4821 .iotlb_sync = intel_iommu_tlb_sync,
4822 .iova_to_phys = intel_iommu_iova_to_phys,
4823 .free = intel_iommu_domain_free,
4824 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4825 }
4826 };
4827
4828 static void quirk_iommu_igfx(struct pci_dev *dev)
4829 {
4830 if (risky_device(dev))
4831 return;
4832
4833 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4834 dmar_map_gfx = 0;
4835 }
4836
4837 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4845
4846 /* Broadwell igfx malfunctions with dmar */
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4871
4872 static void quirk_iommu_rwbf(struct pci_dev *dev)
4873 {
4874 if (risky_device(dev))
4875 return;
4876
4877 /*
4878 * Mobile 4 Series Chipset neglects to set RWBF capability,
4879 * but needs it. Same seems to hold for the desktop versions.
4880 */
4881 pci_info(dev, "Forcing write-buffer flush capability\n");
4882 rwbf_quirk = 1;
4883 }
4884
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4892
4893 #define GGC 0x52
4894 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4895 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4896 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4897 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4898 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4899 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4900 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4901 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4902
4903 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4904 {
4905 unsigned short ggc;
4906
4907 if (risky_device(dev))
4908 return;
4909
4910 if (pci_read_config_word(dev, GGC, &ggc))
4911 return;
4912
4913 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4914 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4915 dmar_map_gfx = 0;
4916 } else if (dmar_map_gfx) {
4917 /* we have to ensure the gfx device is idle before we flush */
4918 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4919 iommu_set_dma_strict();
4920 }
4921 }
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4926
4927 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4928 {
4929 unsigned short ver;
4930
4931 if (!IS_GFX_DEVICE(dev))
4932 return;
4933
4934 ver = (dev->device >> 8) & 0xff;
4935 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4936 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4937 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4938 return;
4939
4940 if (risky_device(dev))
4941 return;
4942
4943 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4944 iommu_skip_te_disable = 1;
4945 }
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4947
4948 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4949 ISOCH DMAR unit for the Azalia sound device, but not give it any
4950 TLB entries, which causes it to deadlock. Check for that. We do
4951 this in a function called from init_dmars(), instead of in a PCI
4952 quirk, because we don't want to print the obnoxious "BIOS broken"
4953 message if VT-d is actually disabled.
4954 */
4955 static void __init check_tylersburg_isoch(void)
4956 {
4957 struct pci_dev *pdev;
4958 uint32_t vtisochctrl;
4959
4960 /* If there's no Azalia in the system anyway, forget it. */
4961 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4962 if (!pdev)
4963 return;
4964
4965 if (risky_device(pdev)) {
4966 pci_dev_put(pdev);
4967 return;
4968 }
4969
4970 pci_dev_put(pdev);
4971
4972 /* System Management Registers. Might be hidden, in which case
4973 we can't do the sanity check. But that's OK, because the
4974 known-broken BIOSes _don't_ actually hide it, so far. */
4975 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4976 if (!pdev)
4977 return;
4978
4979 if (risky_device(pdev)) {
4980 pci_dev_put(pdev);
4981 return;
4982 }
4983
4984 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4985 pci_dev_put(pdev);
4986 return;
4987 }
4988
4989 pci_dev_put(pdev);
4990
4991 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4992 if (vtisochctrl & 1)
4993 return;
4994
4995 /* Drop all bits other than the number of TLB entries */
4996 vtisochctrl &= 0x1c;
4997
4998 /* If we have the recommended number of TLB entries (16), fine. */
4999 if (vtisochctrl == 0x10)
5000 return;
5001
5002 /* Zero TLB entries? You get to ride the short bus to school. */
5003 if (!vtisochctrl) {
5004 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5005 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5006 dmi_get_system_info(DMI_BIOS_VENDOR),
5007 dmi_get_system_info(DMI_BIOS_VERSION),
5008 dmi_get_system_info(DMI_PRODUCT_VERSION));
5009 iommu_identity_mapping |= IDENTMAP_AZALIA;
5010 return;
5011 }
5012
5013 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5014 vtisochctrl);
5015 }
5016
5017 /*
5018 * Here we deal with a device TLB defect where a device may inadvertently issue an ATS
5019 * invalidation completion before posted writes that were initiated with translated
5020 * addresses using translations matching the invalidation address range, violating
5021 * the invalidation completion ordering.
5022 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5023 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5024 * under the control of the trusted/privileged host device driver must use this
5025 * quirk.
5026 * Device TLBs are invalidated under the following six conditions:
5027 * 1. Device driver does DMA API unmap IOVA
5028 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5029 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5030 * exit_mmap() due to crash
5031 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5032 * VM has to free pages that were unmapped
5033 * 5. Userspace driver unmaps a DMA buffer
5034 * 6. Cache invalidation in vSVA usage (upcoming)
5035 *
5036 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5037 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5038 * invalidate TLB the same way as normal user unmap which will use this quirk.
5039 * The dTLB invalidation after PASID cache flush does not need this quirk.
5040 *
5041 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5042 */
5043 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5044 unsigned long address, unsigned long mask,
5045 u32 pasid, u16 qdep)
5046 {
5047 u16 sid;
5048
5049 if (likely(!info->dtlb_extra_inval))
5050 return;
5051
5052 sid = PCI_DEVID(info->bus, info->devfn);
5053 if (pasid == IOMMU_NO_PASID) {
5054 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5055 qdep, address, mask);
5056 } else {
5057 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5058 pasid, qdep, address, mask);
5059 }
5060 }
5061
5062 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5063
5064 /*
5065 * Function to submit a command to the enhanced command interface. The
5066 * valid enhanced command descriptions are defined in Table 47 of the
5067 * VT-d spec. The VT-d hardware implementation may support some but not
5068 * all commands, which can be determined by checking the Enhanced
5069 * Command Capability Register.
5070 *
5071 * Return values:
5072 * - 0: Command successful without any error;
5073 * - Negative: software error value;
5074 * - Nonzero positive: failure status code defined in Table 48.
5075 */
5076 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5077 {
5078 unsigned long flags;
5079 u64 res;
5080 int ret;
5081
5082 if (!cap_ecmds(iommu->cap))
5083 return -ENODEV;
5084
5085 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5086
5087 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5088 if (res & DMA_ECMD_ECRSP_IP) {
5089 ret = -EBUSY;
5090 goto err;
5091 }
5092
5093 /*
5094 * Unconditionally write the operand B, because
5095 * - There is no side effect if an ecmd doesn't require an
5096 * operand B, but we set the register to some value.
5097 * - It's not invoked in any critical path. The extra MMIO
5098 * write doesn't bring any performance concerns.
5099 */
5100 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5101 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5102
5103 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5104 !(res & DMA_ECMD_ECRSP_IP), res);
5105
5106 if (res & DMA_ECMD_ECRSP_IP) {
5107 ret = -ETIMEDOUT;
5108 goto err;
5109 }
5110
5111 ret = ecmd_get_status_code(res);
5112 err:
5113 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5114
5115 return ret;
5116 }
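
/*
 * Illustrative sketch only: how a caller is expected to interpret the
 * three classes of return value documented above. ECMD_EXAMPLE_OP is a
 * hypothetical opcode used purely for illustration; real opcodes come
 * from the Enhanced Command tables of the VT-d specification.
 */
static int __maybe_unused example_issue_ecmd(struct intel_iommu *iommu)
{
	const u8 ECMD_EXAMPLE_OP = 0x1;	/* hypothetical opcode */
	int ret = ecmd_submit_sync(iommu, ECMD_EXAMPLE_OP, 0, 0);

	if (ret < 0)	/* software error, e.g. -ENODEV or -ETIMEDOUT */
		return ret;
	if (ret > 0)	/* hardware failure status code from Table 48 */
		return -EIO;

	return 0;	/* command completed successfully */
}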
5117