1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
52 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
58 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
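/*
 * Worked example (illustrative, not from the original source): with
 * gaw == 48 and VTD_PAGE_SHIFT == 12, __DOMAIN_MAX_PFN(48) is 2^36 - 1
 * and __DOMAIN_MAX_ADDR(48) is 2^48 - 1. On a 64-bit kernel
 * DOMAIN_MAX_PFN(48) keeps that value; on a 32-bit kernel it would be
 * clamped to ULONG_MAX so PFNs still fit in an 'unsigned long'.
 */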
60
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN (1)
63
64 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65
66 /* page table handling */
67 #define LEVEL_STRIDE (9)
68 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69
70 static inline int agaw_to_level(int agaw)
71 {
72 return agaw + 2;
73 }
74
75 static inline int agaw_to_width(int agaw)
76 {
77 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
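/*
 * Illustrative mapping (derived from the helpers above): with
 * LEVEL_STRIDE == 9, agaw values 1, 2 and 3 correspond to address
 * widths of 39, 48 and 57 bits and to 3-, 4- and 5-level page tables,
 * since agaw_to_width() returns 30 + 9 * agaw and agaw_to_level()
 * returns agaw + 2.
 */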
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97 return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102 return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116 are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
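/*
 * Worked example (illustrative): on a hypothetical configuration with
 * 16KiB MM pages (PAGE_SHIFT == 14) and 4KiB VT-d pages
 * (VTD_PAGE_SHIFT == 12), mm_pfn 1 covers dma pfns 4 through 7:
 * mm_to_dma_pfn_start(1) == 4 and mm_to_dma_pfn_end(1) == 7. With 4KiB
 * MM pages the two PFN spaces are identical.
 */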
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
126 {
127 return mm_to_dma_pfn_start(page_to_pfn(pg));
128 }
129 static inline unsigned long virt_to_dma_pfn(void *p)
130 {
131 return page_to_dma_pfn(virt_to_page(p));
132 }
133
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136
137 /*
138 * set to 1 to panic kernel if can't successfully enable VT-d
139 * (used when kernel is launched w/ TXT)
140 */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149 * if marked present.
150 */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 if (!(re->lo & 1))
154 return 0;
155
156 return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161 * if marked present.
162 */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 if (!(re->hi & 1))
166 return 0;
167
168 return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_set_present(struct context_entry *context)
172 {
173 context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178 context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182 unsigned long value)
183 {
184 context->lo &= (((u64)-1) << 4) | 3;
185 context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189 unsigned long value)
190 {
191 context->lo &= ~VTD_PAGE_MASK;
192 context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196 unsigned long value)
197 {
198 context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202 unsigned long value)
203 {
204 context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209 context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214 return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219 context->lo = 0;
220 context->hi = 0;
221 }
222
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 if (!iommu->copied_tables)
226 return false;
227
228 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
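/*
 * Illustrative note: the copied_tables bitmap is indexed by the 16-bit
 * source-id, so e.g. bus 0x12, devfn 0x34 maps to bit 0x1234.
 */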
242
243 /*
244 * This domain is a statically identity mapping domain.
245 * 1. This domain creates a static 1:1 mapping to all usable memory.
246 * 2. It maps to each iommu if successful.
247 * 3. Each iommu maps to this domain if successful.
248 */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251
252 struct dmar_rmrr_unit {
253 struct list_head list; /* list of rmrr units */
254 struct acpi_dmar_header *hdr; /* ACPI header */
255 u64 base_address; /* reserved base address*/
256 u64 end_address; /* reserved end address */
257 struct dmar_dev_scope *devices; /* target devices */
258 int devices_cnt; /* target device count */
259 };
260
261 struct dmar_atsr_unit {
262 struct list_head list; /* list of ATSR units */
263 struct acpi_dmar_header *hdr; /* ACPI header */
264 struct dmar_dev_scope *devices; /* target devices */
265 int devices_cnt; /* target device count */
266 u8 include_all:1; /* include all ports */
267 };
268
269 struct dmar_satc_unit {
270 struct list_head list; /* list of SATC units */
271 struct acpi_dmar_header *hdr; /* ACPI header */
272 struct dmar_dev_scope *devices; /* target devices */
273 struct intel_iommu *iommu; /* the corresponding iommu */
274 int devices_cnt; /* target device count */
275 u8 atc_required:1; /* ATS is required */
276 };
277
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281
282 #define for_each_rmrr_units(rmrr) \
283 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298
299 #define IDENTMAP_GFX 2
300 #define IDENTMAP_AZALIA 4
301
302 const struct iommu_ops intel_iommu_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316 u32 gsts;
317
318 gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 if (gsts & DMA_GSTS_TES)
320 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325 if (!str)
326 return -EINVAL;
327
328 while (*str) {
329 if (!strncmp(str, "on", 2)) {
330 dmar_disabled = 0;
331 pr_info("IOMMU enabled\n");
332 } else if (!strncmp(str, "off", 3)) {
333 dmar_disabled = 1;
334 no_platform_optin = 1;
335 pr_info("IOMMU disabled\n");
336 } else if (!strncmp(str, "igfx_off", 8)) {
337 dmar_map_gfx = 0;
338 pr_info("Disable GFX device mapping\n");
339 } else if (!strncmp(str, "forcedac", 8)) {
340 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 iommu_dma_forcedac = true;
342 } else if (!strncmp(str, "strict", 6)) {
343 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 iommu_set_dma_strict();
345 } else if (!strncmp(str, "sp_off", 6)) {
346 pr_info("Disable supported super page\n");
347 intel_iommu_superpage = 0;
348 } else if (!strncmp(str, "sm_on", 5)) {
349 pr_info("Enable scalable mode if hardware supports\n");
350 intel_iommu_sm = 1;
351 } else if (!strncmp(str, "sm_off", 6)) {
352 pr_info("Scalable mode is disallowed\n");
353 intel_iommu_sm = 0;
354 } else if (!strncmp(str, "tboot_noforce", 13)) {
355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 intel_iommu_tboot_noforce = 1;
357 } else {
358 pr_notice("Unknown option - '%s'\n", str);
359 }
360
361 str += strcspn(str, ",");
362 while (*str == ',')
363 str++;
364 }
365
366 return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
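/*
 * Example (hypothetical command line): booting with
 * "intel_iommu=on,sm_on" clears dmar_disabled and sets intel_iommu_sm,
 * enabling DMA remapping and scalable mode where the hardware supports
 * it. Options are parsed left to right, separated by commas.
 */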
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 struct page *page;
373 void *vaddr = NULL;
374
375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 if (page)
377 vaddr = page_address(page);
378 return vaddr;
379 }
380
381 void free_pgtable_page(void *vaddr)
382 {
383 free_page((unsigned long)vaddr);
384 }
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 unsigned long pfn)
393 {
394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402 * the returned SAGAW.
403 */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406 unsigned long fl_sagaw, sl_sagaw;
407
408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 sl_sagaw = cap_sagaw(iommu->cap);
410
411 /* Second level only. */
412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 return sl_sagaw;
414
415 /* First level only. */
416 if (!ecap_slts(iommu->ecap))
417 return fl_sagaw;
418
419 return fl_sagaw & sl_sagaw;
420 }
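/*
 * Illustrative example: in the SAGAW encoding referenced above, BIT(2)
 * denotes a 4-level (48-bit) table and BIT(3) a 5-level (57-bit) one.
 * An IOMMU with first-level 5-level paging support and a second-level
 * SAGAW of 0x4 would give fl_sagaw == 0xc and sl_sagaw == 0x4, so the
 * combined result is 0x4 when both levels are supported.
 */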
421
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423 {
424 unsigned long sagaw;
425 int agaw;
426
427 sagaw = __iommu_calculate_sagaw(iommu);
428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429 if (test_bit(agaw, &sagaw))
430 break;
431 }
432
433 return agaw;
434 }
435
436 /*
437 * Calculate max SAGAW for each iommu.
438 */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445 * Calculate the agaw for each iommu.
446 * "SAGAW" may differ across iommus, so use a default agaw and fall back
447 * to a smaller supported agaw for iommus that don't support the default.
448 */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 return sm_supported(iommu) ?
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462 struct iommu_domain_info *info;
463 struct dmar_drhd_unit *drhd;
464 struct intel_iommu *iommu;
465 bool found = false;
466 unsigned long i;
467
468 domain->iommu_coherency = true;
469 xa_for_each(&domain->iommu_array, i, info) {
470 found = true;
471 if (!iommu_paging_structure_coherency(info->iommu)) {
472 domain->iommu_coherency = false;
473 break;
474 }
475 }
476 if (found)
477 return;
478
479 /* No hardware attached; use lowest common denominator */
480 rcu_read_lock();
481 for_each_active_iommu(iommu, drhd) {
482 if (!iommu_paging_structure_coherency(iommu)) {
483 domain->iommu_coherency = false;
484 break;
485 }
486 }
487 rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 struct intel_iommu *skip)
492 {
493 struct dmar_drhd_unit *drhd;
494 struct intel_iommu *iommu;
495 int mask = 0x3;
496
497 if (!intel_iommu_superpage)
498 return 0;
499
500 /* set iommu_superpage to the smallest common denominator */
501 rcu_read_lock();
502 for_each_active_iommu(iommu, drhd) {
503 if (iommu != skip) {
504 if (domain && domain->use_first_level) {
505 if (!cap_fl1gp_support(iommu->cap))
506 mask = 0x1;
507 } else {
508 mask &= cap_super_page_val(iommu->cap);
509 }
510
511 if (!mask)
512 break;
513 }
514 }
515 rcu_read_unlock();
516
517 return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522 struct device_domain_info *info;
523 int nid = NUMA_NO_NODE;
524 unsigned long flags;
525
526 spin_lock_irqsave(&domain->lock, flags);
527 list_for_each_entry(info, &domain->devices, link) {
528 /*
529 * There could be multiple device NUMA nodes, as devices within
530 * the same domain may sit behind different IOMMUs. There is no
531 * perfect answer in such a situation, so we use a first-come,
532 * first-served policy.
533 */
534 nid = dev_to_node(info->dev);
535 if (nid != NUMA_NO_NODE)
536 break;
537 }
538 spin_unlock_irqrestore(&domain->lock, flags);
539
540 return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 unsigned long bitmap = 0;
549
550 /*
551 * 1-level super page supports page size of 2MiB, 2-level super page
552 * supports page size of both 2MiB and 1GiB.
553 */
554 if (domain->iommu_superpage == 1)
555 bitmap |= SZ_2M;
556 else if (domain->iommu_superpage == 2)
557 bitmap |= SZ_2M | SZ_1G;
558
559 return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565 domain_update_iommu_coherency(domain);
566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568 /*
569 * If RHSA is missing, default to the device NUMA node as a
570 * fallback.
571 */
572 if (domain->nid == NUMA_NO_NODE)
573 domain->nid = domain_update_device_node(domain);
574
575 /*
576 * First-level translation restricts the input-address to a
577 * canonical address (i.e., address bits 63:N have the same
578 * value as address bit [N-1], where N is 48-bits with 4-level
579 * paging and 57-bits with 5-level paging). Hence, skip bit
580 * [N-1].
581 */
582 if (domain->use_first_level)
583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 else
585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 domain_update_iotlb(domain);
589 }
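/*
 * Worked example (illustrative): for a domain with gaw == 48, a
 * second-level domain's aperture ends at __DOMAIN_MAX_ADDR(48) ==
 * 2^48 - 1, while a first-level domain skips the canonical bit and
 * ends at __DOMAIN_MAX_ADDR(47) == 2^47 - 1, as described above.
 */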
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 u8 devfn, int alloc)
593 {
594 struct root_entry *root = &iommu->root_entry[bus];
595 struct context_entry *context;
596 u64 *entry;
597
598 /*
599 * Unless the caller requests allocation of a new entry, it makes
600 * no sense to return a copied context entry.
601 */
602 if (!alloc && context_copied(iommu, bus, devfn))
603 return NULL;
604
605 entry = &root->lo;
606 if (sm_supported(iommu)) {
607 if (devfn >= 0x80) {
608 devfn -= 0x80;
609 entry = &root->hi;
610 }
611 devfn *= 2;
612 }
613 if (*entry & 1)
614 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 else {
616 unsigned long phy_addr;
617 if (!alloc)
618 return NULL;
619
620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621 if (!context)
622 return NULL;
623
624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 phy_addr = virt_to_phys((void *)context);
626 *entry = phy_addr | 1;
627 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628 }
629 return &context[devfn];
630 }
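/*
 * Illustrative example: in scalable mode each half of a root entry
 * covers 128 device functions and context entries are twice as large,
 * so a lookup for devfn 0x85 uses the upper half (root->hi) and
 * context index (0x85 - 0x80) * 2 == 10 within that table.
 */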
631
632 /**
633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634 * sub-hierarchy of a candidate PCI-PCI bridge
635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636 * @bridge: the candidate PCI-PCI bridge
637 *
638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639 */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 struct pci_dev *pdev, *pbridge;
644
645 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 return false;
647
648 pdev = to_pci_dev(dev);
649 pbridge = to_pci_dev(bridge);
650
651 if (pbridge->subordinate &&
652 pbridge->subordinate->number <= pdev->bus->number &&
653 pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 return true;
655
656 return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661 struct dmar_drhd_unit *drhd;
662 u32 vtbar;
663 int rc;
664
665 /* We know that this device on this chipset has its own IOMMU.
666 * If we find it under a different IOMMU, then the BIOS is lying
667 * to us. Hope that the IOMMU for this device is actually
668 * disabled, and it needs no translation...
669 */
670 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 if (rc) {
672 /* "can't" happen */
673 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 return false;
675 }
676 vtbar &= 0xffff0000;
677
678 /* we know that this iommu should be at offset 0xa000 from vtbar */
679 drhd = dmar_find_matched_drhd_unit(pdev);
680 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 return true;
684 }
685
686 return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 if (!iommu || iommu->drhd->ignored)
692 return true;
693
694 if (dev_is_pci(dev)) {
695 struct pci_dev *pdev = to_pci_dev(dev);
696
697 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 quirk_ioat_snb_local_iommu(pdev))
700 return true;
701 }
702
703 return false;
704 }
705
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708 struct dmar_drhd_unit *drhd = NULL;
709 struct pci_dev *pdev = NULL;
710 struct intel_iommu *iommu;
711 struct device *tmp;
712 u16 segment = 0;
713 int i;
714
715 if (!dev)
716 return NULL;
717
718 if (dev_is_pci(dev)) {
719 struct pci_dev *pf_pdev;
720
721 pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723 /* VFs aren't listed in scope tables; we need to look up
724 * the PF instead to find the IOMMU. */
725 pf_pdev = pci_physfn(pdev);
726 dev = &pf_pdev->dev;
727 segment = pci_domain_nr(pdev->bus);
728 } else if (has_acpi_companion(dev))
729 dev = &ACPI_COMPANION(dev)->dev;
730
731 rcu_read_lock();
732 for_each_iommu(iommu, drhd) {
733 if (pdev && segment != drhd->segment)
734 continue;
735
736 for_each_active_dev_scope(drhd->devices,
737 drhd->devices_cnt, i, tmp) {
738 if (tmp == dev) {
739 /* For a VF use its original BDF# not that of the PF
740 * which we used for the IOMMU lookup. Strictly speaking
741 * we could do this for all PCI devices; we only need to
742 * get the BDF# from the scope table for ACPI matches. */
743 if (pdev && pdev->is_virtfn)
744 goto got_pdev;
745
746 if (bus && devfn) {
747 *bus = drhd->devices[i].bus;
748 *devfn = drhd->devices[i].devfn;
749 }
750 goto out;
751 }
752
753 if (is_downstream_to_pci_bridge(dev, tmp))
754 goto got_pdev;
755 }
756
757 if (pdev && drhd->include_all) {
758 got_pdev:
759 if (bus && devfn) {
760 *bus = pdev->bus->number;
761 *devfn = pdev->devfn;
762 }
763 goto out;
764 }
765 }
766 iommu = NULL;
767 out:
768 if (iommu_is_dummy(iommu, dev))
769 iommu = NULL;
770
771 rcu_read_unlock();
772
773 return iommu;
774 }
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777 void *addr, int size)
778 {
779 if (!domain->iommu_coherency)
780 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785 struct context_entry *context;
786 int i;
787
788 if (!iommu->root_entry)
789 return;
790
791 for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 context = iommu_context_addr(iommu, i, 0, 0);
793 if (context)
794 free_pgtable_page(context);
795
796 if (!sm_supported(iommu))
797 continue;
798
799 context = iommu_context_addr(iommu, i, 0x80, 0);
800 if (context)
801 free_pgtable_page(context);
802 }
803
804 free_pgtable_page(iommu->root_entry);
805 iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812 struct dma_pte *pte;
813 int offset;
814
815 while (1) {
816 offset = pfn_level_offset(pfn, level);
817 pte = &parent[offset];
818
819 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
820
821 if (!dma_pte_present(pte)) {
822 pr_info("page table not present at level %d\n", level - 1);
823 break;
824 }
825
826 if (level == 1 || dma_pte_superpage(pte))
827 break;
828
829 parent = phys_to_virt(dma_pte_addr(pte));
830 level--;
831 }
832 }
833
834 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
835 unsigned long long addr, u32 pasid)
836 {
837 struct pasid_dir_entry *dir, *pde;
838 struct pasid_entry *entries, *pte;
839 struct context_entry *ctx_entry;
840 struct root_entry *rt_entry;
841 int i, dir_index, index, level;
842 u8 devfn = source_id & 0xff;
843 u8 bus = source_id >> 8;
844 struct dma_pte *pgtable;
845
846 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
847
848 /* root entry dump */
849 if (!iommu->root_entry) {
850 pr_info("root table is not present\n");
851 return;
852 }
853 rt_entry = &iommu->root_entry[bus];
854
855 if (sm_supported(iommu))
856 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
857 rt_entry->hi, rt_entry->lo);
858 else
859 pr_info("root entry: 0x%016llx", rt_entry->lo);
860
861 /* context entry dump */
862 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
863 if (!ctx_entry) {
864 pr_info("context table is not present\n");
865 return;
866 }
867
868 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
869 ctx_entry->hi, ctx_entry->lo);
870
871 /* legacy mode does not require PASID entries */
872 if (!sm_supported(iommu)) {
873 if (!context_present(ctx_entry)) {
874 pr_info("legacy mode page table is not present\n");
875 return;
876 }
877 level = agaw_to_level(ctx_entry->hi & 7);
878 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 goto pgtable_walk;
880 }
881
882 if (!context_present(ctx_entry)) {
883 pr_info("pasid directory table is not present\n");
884 return;
885 }
886
887 /* get the pointer to pasid directory entry */
888 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
889
890 /* For request-without-pasid, get the pasid from context entry */
891 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
892 pasid = IOMMU_NO_PASID;
893
894 dir_index = pasid >> PASID_PDE_SHIFT;
895 pde = &dir[dir_index];
896 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
897
898 /* get the pointer to the pasid table entry */
899 entries = get_pasid_table_from_pde(pde);
900 if (!entries) {
901 pr_info("pasid table is not present\n");
902 return;
903 }
904 index = pasid & PASID_PTE_MASK;
905 pte = &entries[index];
906 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
907 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
908
909 if (!pasid_pte_is_present(pte)) {
910 pr_info("scalable mode page table is not present\n");
911 return;
912 }
913
914 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 } else {
918 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 }
921
922 pgtable_walk:
923 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 unsigned long pfn, int *target_level,
929 gfp_t gfp)
930 {
931 struct dma_pte *parent, *pte;
932 int level = agaw_to_level(domain->agaw);
933 int offset;
934
935 if (!domain_pfn_supported(domain, pfn))
936 /* Address beyond IOMMU's addressing capabilities. */
937 return NULL;
938
939 parent = domain->pgd;
940
941 while (1) {
942 void *tmp_page;
943
944 offset = pfn_level_offset(pfn, level);
945 pte = &parent[offset];
946 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
947 break;
948 if (level == *target_level)
949 break;
950
951 if (!dma_pte_present(pte)) {
952 uint64_t pteval;
953
954 tmp_page = alloc_pgtable_page(domain->nid, gfp);
955
956 if (!tmp_page)
957 return NULL;
958
959 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
960 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
961 if (domain->use_first_level)
962 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
963
964 if (cmpxchg64(&pte->val, 0ULL, pteval))
965 /* Someone else set it while we were thinking; use theirs. */
966 free_pgtable_page(tmp_page);
967 else
968 domain_flush_cache(domain, pte, sizeof(*pte));
969 }
970 if (level == 1)
971 break;
972
973 parent = phys_to_virt(dma_pte_addr(pte));
974 level--;
975 }
976
977 if (!*target_level)
978 *target_level = level;
979
980 return pte;
981 }
982
983 /* return address's pte at specific level */
984 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
985 unsigned long pfn,
986 int level, int *large_page)
987 {
988 struct dma_pte *parent, *pte;
989 int total = agaw_to_level(domain->agaw);
990 int offset;
991
992 parent = domain->pgd;
993 while (level <= total) {
994 offset = pfn_level_offset(pfn, total);
995 pte = &parent[offset];
996 if (level == total)
997 return pte;
998
999 if (!dma_pte_present(pte)) {
1000 *large_page = total;
1001 break;
1002 }
1003
1004 if (dma_pte_superpage(pte)) {
1005 *large_page = total;
1006 return pte;
1007 }
1008
1009 parent = phys_to_virt(dma_pte_addr(pte));
1010 total--;
1011 }
1012 return NULL;
1013 }
1014
1015 /* clear last level pte, a tlb flush should be followed */
1016 static void dma_pte_clear_range(struct dmar_domain *domain,
1017 unsigned long start_pfn,
1018 unsigned long last_pfn)
1019 {
1020 unsigned int large_page;
1021 struct dma_pte *first_pte, *pte;
1022
1023 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1024 WARN_ON(start_pfn > last_pfn))
1025 return;
1026
1027 /* we don't need lock here; nobody else touches the iova range */
1028 do {
1029 large_page = 1;
1030 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1031 if (!pte) {
1032 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1033 continue;
1034 }
1035 do {
1036 dma_clear_pte(pte);
1037 start_pfn += lvl_to_nr_pages(large_page);
1038 pte++;
1039 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1040
1041 domain_flush_cache(domain, first_pte,
1042 (void *)pte - (void *)first_pte);
1043
1044 } while (start_pfn && start_pfn <= last_pfn);
1045 }
1046
1047 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1048 int retain_level, struct dma_pte *pte,
1049 unsigned long pfn, unsigned long start_pfn,
1050 unsigned long last_pfn)
1051 {
1052 pfn = max(start_pfn, pfn);
1053 pte = &pte[pfn_level_offset(pfn, level)];
1054
1055 do {
1056 unsigned long level_pfn;
1057 struct dma_pte *level_pte;
1058
1059 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1060 goto next;
1061
1062 level_pfn = pfn & level_mask(level);
1063 level_pte = phys_to_virt(dma_pte_addr(pte));
1064
1065 if (level > 2) {
1066 dma_pte_free_level(domain, level - 1, retain_level,
1067 level_pte, level_pfn, start_pfn,
1068 last_pfn);
1069 }
1070
1071 /*
1072 * Free the page table if we're below the level we want to
1073 * retain and the range covers the entire table.
1074 */
1075 if (level < retain_level && !(start_pfn > level_pfn ||
1076 last_pfn < level_pfn + level_size(level) - 1)) {
1077 dma_clear_pte(pte);
1078 domain_flush_cache(domain, pte, sizeof(*pte));
1079 free_pgtable_page(level_pte);
1080 }
1081 next:
1082 pfn += level_size(level);
1083 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1084 }
1085
1086 /*
1087 * clear last level (leaf) ptes and free page table pages below the
1088 * level we wish to keep intact.
1089 */
1090 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1091 unsigned long start_pfn,
1092 unsigned long last_pfn,
1093 int retain_level)
1094 {
1095 dma_pte_clear_range(domain, start_pfn, last_pfn);
1096
1097 /* We don't need lock here; nobody else touches the iova range */
1098 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1099 domain->pgd, 0, start_pfn, last_pfn);
1100
1101 /* free pgd */
1102 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1103 free_pgtable_page(domain->pgd);
1104 domain->pgd = NULL;
1105 }
1106 }
1107
1108 /* When a page at a given level is being unlinked from its parent, we don't
1109 need to *modify* it at all. All we need to do is make a list of all the
1110 pages which can be freed just as soon as we've flushed the IOTLB and we
1111 know the hardware page-walk will no longer touch them.
1112 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1113 be freed. */
1114 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1115 int level, struct dma_pte *pte,
1116 struct list_head *freelist)
1117 {
1118 struct page *pg;
1119
1120 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1121 list_add_tail(&pg->lru, freelist);
1122
1123 if (level == 1)
1124 return;
1125
1126 pte = page_address(pg);
1127 do {
1128 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1129 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1130 pte++;
1131 } while (!first_pte_in_page(pte));
1132 }
1133
1134 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1135 struct dma_pte *pte, unsigned long pfn,
1136 unsigned long start_pfn, unsigned long last_pfn,
1137 struct list_head *freelist)
1138 {
1139 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1140
1141 pfn = max(start_pfn, pfn);
1142 pte = &pte[pfn_level_offset(pfn, level)];
1143
1144 do {
1145 unsigned long level_pfn = pfn & level_mask(level);
1146
1147 if (!dma_pte_present(pte))
1148 goto next;
1149
1150 /* If range covers entire pagetable, free it */
1151 if (start_pfn <= level_pfn &&
1152 last_pfn >= level_pfn + level_size(level) - 1) {
1153 /* These subordinate page tables are going away entirely. Don't
1154 bother to clear them; we're just going to *free* them. */
1155 if (level > 1 && !dma_pte_superpage(pte))
1156 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1157
1158 dma_clear_pte(pte);
1159 if (!first_pte)
1160 first_pte = pte;
1161 last_pte = pte;
1162 } else if (level > 1) {
1163 /* Recurse down into a level that isn't *entirely* obsolete */
1164 dma_pte_clear_level(domain, level - 1,
1165 phys_to_virt(dma_pte_addr(pte)),
1166 level_pfn, start_pfn, last_pfn,
1167 freelist);
1168 }
1169 next:
1170 pfn = level_pfn + level_size(level);
1171 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1172
1173 if (first_pte)
1174 domain_flush_cache(domain, first_pte,
1175 (void *)++last_pte - (void *)first_pte);
1176 }
1177
1178 /* We can't just free the pages because the IOMMU may still be walking
1179 the page tables, and may have cached the intermediate levels. The
1180 pages can only be freed after the IOTLB flush has been done. */
1181 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1182 unsigned long last_pfn, struct list_head *freelist)
1183 {
1184 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1185 WARN_ON(start_pfn > last_pfn))
1186 return;
1187
1188 /* we don't need lock here; nobody else touches the iova range */
1189 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1190 domain->pgd, 0, start_pfn, last_pfn, freelist);
1191
1192 /* free pgd */
1193 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1194 struct page *pgd_page = virt_to_page(domain->pgd);
1195 list_add_tail(&pgd_page->lru, freelist);
1196 domain->pgd = NULL;
1197 }
1198 }
1199
1200 /* iommu handling */
1201 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1202 {
1203 struct root_entry *root;
1204
1205 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1206 if (!root) {
1207 pr_err("Allocating root entry for %s failed\n",
1208 iommu->name);
1209 return -ENOMEM;
1210 }
1211
1212 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1213 iommu->root_entry = root;
1214
1215 return 0;
1216 }
1217
1218 static void iommu_set_root_entry(struct intel_iommu *iommu)
1219 {
1220 u64 addr;
1221 u32 sts;
1222 unsigned long flag;
1223
1224 addr = virt_to_phys(iommu->root_entry);
1225 if (sm_supported(iommu))
1226 addr |= DMA_RTADDR_SMT;
1227
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1230
1231 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1232
1233 /* Make sure hardware complete it */
1234 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235 readl, (sts & DMA_GSTS_RTPS), sts);
1236
1237 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238
1239 /*
1240 * Hardware invalidates all DMA remapping hardware translation
1241 * caches as part of SRTP flow.
1242 */
1243 if (cap_esrtps(iommu->cap))
1244 return;
1245
1246 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1247 if (sm_supported(iommu))
1248 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1249 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1250 }
1251
1252 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1253 {
1254 u32 val;
1255 unsigned long flag;
1256
1257 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1258 return;
1259
1260 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1262
1263 /* Make sure hardware complete it */
1264 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1265 readl, (!(val & DMA_GSTS_WBFS)), val);
1266
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1268 }
1269
1270 /* return value determines if we need a write buffer flush */
1271 static void __iommu_flush_context(struct intel_iommu *iommu,
1272 u16 did, u16 source_id, u8 function_mask,
1273 u64 type)
1274 {
1275 u64 val = 0;
1276 unsigned long flag;
1277
1278 switch (type) {
1279 case DMA_CCMD_GLOBAL_INVL:
1280 val = DMA_CCMD_GLOBAL_INVL;
1281 break;
1282 case DMA_CCMD_DOMAIN_INVL:
1283 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1284 break;
1285 case DMA_CCMD_DEVICE_INVL:
1286 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1287 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1288 break;
1289 default:
1290 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1291 iommu->name, type);
1292 return;
1293 }
1294 val |= DMA_CCMD_ICC;
1295
1296 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1297 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1298
1299 /* Make sure hardware complete it */
1300 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1301 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1302
1303 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1304 }
1305
1306 /* return value determines if we need a write buffer flush */
1307 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1308 u64 addr, unsigned int size_order, u64 type)
1309 {
1310 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1311 u64 val = 0, val_iva = 0;
1312 unsigned long flag;
1313
1314 switch (type) {
1315 case DMA_TLB_GLOBAL_FLUSH:
1316 /* global flush doesn't need to set IVA_REG */
1317 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1318 break;
1319 case DMA_TLB_DSI_FLUSH:
1320 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1321 break;
1322 case DMA_TLB_PSI_FLUSH:
1323 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1324 /* IH bit is passed in as part of address */
1325 val_iva = size_order | addr;
1326 break;
1327 default:
1328 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1329 iommu->name, type);
1330 return;
1331 }
1332
1333 if (cap_write_drain(iommu->cap))
1334 val |= DMA_TLB_WRITE_DRAIN;
1335
1336 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337 /* Note: Only uses first TLB reg currently */
1338 if (val_iva)
1339 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1340 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1341
1342 /* Make sure hardware complete it */
1343 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1344 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1345
1346 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347
1348 /* check IOTLB invalidation granularity */
1349 if (DMA_TLB_IAIG(val) == 0)
1350 pr_err("Flush IOTLB failed\n");
1351 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1352 pr_debug("TLB flush request %Lx, actual %Lx\n",
1353 (unsigned long long)DMA_TLB_IIRG(type),
1354 (unsigned long long)DMA_TLB_IAIG(val));
1355 }
1356
1357 static struct device_domain_info *
1358 domain_lookup_dev_info(struct dmar_domain *domain,
1359 struct intel_iommu *iommu, u8 bus, u8 devfn)
1360 {
1361 struct device_domain_info *info;
1362 unsigned long flags;
1363
1364 spin_lock_irqsave(&domain->lock, flags);
1365 list_for_each_entry(info, &domain->devices, link) {
1366 if (info->iommu == iommu && info->bus == bus &&
1367 info->devfn == devfn) {
1368 spin_unlock_irqrestore(&domain->lock, flags);
1369 return info;
1370 }
1371 }
1372 spin_unlock_irqrestore(&domain->lock, flags);
1373
1374 return NULL;
1375 }
1376
1377 static void domain_update_iotlb(struct dmar_domain *domain)
1378 {
1379 struct dev_pasid_info *dev_pasid;
1380 struct device_domain_info *info;
1381 bool has_iotlb_device = false;
1382 unsigned long flags;
1383
1384 spin_lock_irqsave(&domain->lock, flags);
1385 list_for_each_entry(info, &domain->devices, link) {
1386 if (info->ats_enabled) {
1387 has_iotlb_device = true;
1388 break;
1389 }
1390 }
1391
1392 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1393 info = dev_iommu_priv_get(dev_pasid->dev);
1394 if (info->ats_enabled) {
1395 has_iotlb_device = true;
1396 break;
1397 }
1398 }
1399 domain->has_iotlb_device = has_iotlb_device;
1400 spin_unlock_irqrestore(&domain->lock, flags);
1401 }
1402
1403 /*
1404 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1405 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1406 * check because it applies only to the built-in QAT devices and it doesn't
1407 * grant additional privileges.
1408 */
1409 #define BUGGY_QAT_DEVID_MASK 0x4940
1410 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1411 {
1412 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1413 return false;
1414
1415 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1416 return false;
1417
1418 return true;
1419 }
1420
1421 static void iommu_enable_pci_caps(struct device_domain_info *info)
1422 {
1423 struct pci_dev *pdev;
1424
1425 if (!dev_is_pci(info->dev))
1426 return;
1427
1428 pdev = to_pci_dev(info->dev);
1429
1430 /* The PCIe spec, in its wisdom, declares that the behaviour of
1431 the device if you enable PASID support after ATS support is
1432 undefined. So always enable PASID support on devices which
1433 have it, even if we can't yet know if we're ever going to
1434 use it. */
1435 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1436 info->pasid_enabled = 1;
1437
1438 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1439 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1440 info->ats_enabled = 1;
1441 domain_update_iotlb(info->domain);
1442 }
1443 }
1444
1445 static void iommu_disable_pci_caps(struct device_domain_info *info)
1446 {
1447 struct pci_dev *pdev;
1448
1449 if (!dev_is_pci(info->dev))
1450 return;
1451
1452 pdev = to_pci_dev(info->dev);
1453
1454 if (info->ats_enabled) {
1455 pci_disable_ats(pdev);
1456 info->ats_enabled = 0;
1457 domain_update_iotlb(info->domain);
1458 }
1459
1460 if (info->pasid_enabled) {
1461 pci_disable_pasid(pdev);
1462 info->pasid_enabled = 0;
1463 }
1464 }
1465
1466 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1467 u64 addr, unsigned int mask)
1468 {
1469 u16 sid, qdep;
1470
1471 if (!info || !info->ats_enabled)
1472 return;
1473
1474 sid = info->bus << 8 | info->devfn;
1475 qdep = info->ats_qdep;
1476 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1477 qdep, addr, mask);
1478 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1479 }
1480
1481 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1482 u64 addr, unsigned mask)
1483 {
1484 struct dev_pasid_info *dev_pasid;
1485 struct device_domain_info *info;
1486 unsigned long flags;
1487
1488 if (!domain->has_iotlb_device)
1489 return;
1490
1491 spin_lock_irqsave(&domain->lock, flags);
1492 list_for_each_entry(info, &domain->devices, link)
1493 __iommu_flush_dev_iotlb(info, addr, mask);
1494
1495 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1496 info = dev_iommu_priv_get(dev_pasid->dev);
1497
1498 if (!info->ats_enabled)
1499 continue;
1500
1501 qi_flush_dev_iotlb_pasid(info->iommu,
1502 PCI_DEVID(info->bus, info->devfn),
1503 info->pfsid, dev_pasid->pasid,
1504 info->ats_qdep, addr,
1505 mask);
1506 }
1507 spin_unlock_irqrestore(&domain->lock, flags);
1508 }
1509
1510 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1511 struct dmar_domain *domain, u64 addr,
1512 unsigned long npages, bool ih)
1513 {
1514 u16 did = domain_id_iommu(domain, iommu);
1515 struct dev_pasid_info *dev_pasid;
1516 unsigned long flags;
1517
1518 spin_lock_irqsave(&domain->lock, flags);
1519 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1520 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1521
1522 if (!list_empty(&domain->devices))
1523 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1524 spin_unlock_irqrestore(&domain->lock, flags);
1525 }
1526
1527 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1528 struct dmar_domain *domain,
1529 unsigned long pfn, unsigned int pages,
1530 int ih, int map)
1531 {
1532 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1533 unsigned int mask = ilog2(aligned_pages);
1534 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1535 u16 did = domain_id_iommu(domain, iommu);
1536
1537 if (WARN_ON(!pages))
1538 return;
1539
1540 if (ih)
1541 ih = 1 << 6;
1542
1543 if (domain->use_first_level) {
1544 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1545 } else {
1546 unsigned long bitmask = aligned_pages - 1;
1547
1548 /*
1549 * PSI masks the low order bits of the base address. If the
1550 * address isn't aligned to the mask, then compute a mask value
1551 * needed to ensure the target range is flushed.
1552 */
1553 if (unlikely(bitmask & pfn)) {
1554 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1555
1556 /*
1557 * Since end_pfn <= pfn + bitmask, the only way bits
1558 * higher than bitmask can differ in pfn and end_pfn is
1559 * by carrying. This means after masking out bitmask,
1560 * high bits starting with the first set bit in
1561 * shared_bits are all equal in both pfn and end_pfn.
1562 */
1563 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1564 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1565 }
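		/*
		 * Worked example (illustrative): pfn == 0x1003 and
		 * pages == 2 give aligned_pages == 2 and bitmask == 1,
		 * so the base is unaligned; end_pfn == 0x1004,
		 * shared_bits == ~(0x1003 ^ 0x1004) & ~1, whose lowest
		 * set bit is 3, so mask becomes 3 and the PSI covers
		 * pfns 0x1000-0x1007, a superset of the requested range.
		 */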
1566
1567 /*
1568 * Fallback to domain selective flush if no PSI support or
1569 * the size is too big.
1570 */
1571 if (!cap_pgsel_inv(iommu->cap) ||
1572 mask > cap_max_amask_val(iommu->cap))
1573 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1574 DMA_TLB_DSI_FLUSH);
1575 else
1576 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1577 DMA_TLB_PSI_FLUSH);
1578 }
1579
1580 /*
1581 * In caching mode, changes of pages from non-present to present require
1582 * flush. However, device IOTLB doesn't need to be flushed in this case.
1583 */
1584 if (!cap_caching_mode(iommu->cap) || !map)
1585 iommu_flush_dev_iotlb(domain, addr, mask);
1586 }
1587
1588 /* Notification for newly created mappings */
1589 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1590 struct dmar_domain *domain,
1591 unsigned long pfn, unsigned int pages)
1592 {
1593 /*
1594 * It's a non-present to present mapping. Only flush if caching mode
1595 * and second level.
1596 */
1597 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1598 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1599 else
1600 iommu_flush_write_buffer(iommu);
1601 }
1602
1603 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1604 {
1605 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1606 struct iommu_domain_info *info;
1607 unsigned long idx;
1608
1609 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1610 struct intel_iommu *iommu = info->iommu;
1611 u16 did = domain_id_iommu(dmar_domain, iommu);
1612
1613 if (dmar_domain->use_first_level)
1614 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1615 else
1616 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1617 DMA_TLB_DSI_FLUSH);
1618
1619 if (!cap_caching_mode(iommu->cap))
1620 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1621 }
1622 }
1623
1624 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1625 {
1626 u32 pmen;
1627 unsigned long flags;
1628
1629 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1630 return;
1631
1632 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1633 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1634 pmen &= ~DMA_PMEN_EPM;
1635 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1636
1637 /* wait for the protected region status bit to clear */
1638 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1639 readl, !(pmen & DMA_PMEN_PRS), pmen);
1640
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643
1644 static void iommu_enable_translation(struct intel_iommu *iommu)
1645 {
1646 u32 sts;
1647 unsigned long flags;
1648
1649 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1650 iommu->gcmd |= DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652
1653 /* Make sure hardware complete it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (sts & DMA_GSTS_TES), sts);
1656
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1658 }
1659
1660 static void iommu_disable_translation(struct intel_iommu *iommu)
1661 {
1662 u32 sts;
1663 unsigned long flag;
1664
1665 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1666 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1667 return;
1668
1669 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1670 iommu->gcmd &= ~DMA_GCMD_TE;
1671 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1672
1673 /* Make sure hardware complete it */
1674 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1675 readl, (!(sts & DMA_GSTS_TES)), sts);
1676
1677 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1678 }
1679
1680 static int iommu_init_domains(struct intel_iommu *iommu)
1681 {
1682 u32 ndomains;
1683
1684 ndomains = cap_ndoms(iommu->cap);
1685 pr_debug("%s: Number of Domains supported <%d>\n",
1686 iommu->name, ndomains);
1687
1688 spin_lock_init(&iommu->lock);
1689
1690 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1691 if (!iommu->domain_ids)
1692 return -ENOMEM;
1693
1694 /*
1695 * If Caching mode is set, then invalid translations are tagged
1696 * with domain-id 0, hence we need to pre-allocate it. We also
1697 * use domain-id 0 as a marker for non-allocated domain-id, so
1698 * make sure it is not used for a real domain.
1699 */
1700 set_bit(0, iommu->domain_ids);
1701
1702 /*
1703 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1704 * entry for first-level or pass-through translation modes should
1705 * be programmed with a domain id different from those used for
1706 * second-level or nested translation. We reserve a domain id for
1707 * this purpose. This domain id is also used for identity domain
1708 * in legacy mode.
1709 */
1710 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1711
1712 return 0;
1713 }
1714
1715 static void disable_dmar_iommu(struct intel_iommu *iommu)
1716 {
1717 if (!iommu->domain_ids)
1718 return;
1719
1720 /*
1721 * All iommu domains must have been detached from the devices,
1722 * hence there should be no domain IDs in use.
1723 */
1724 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1725 > NUM_RESERVED_DID))
1726 return;
1727
1728 if (iommu->gcmd & DMA_GCMD_TE)
1729 iommu_disable_translation(iommu);
1730 }
1731
1732 static void free_dmar_iommu(struct intel_iommu *iommu)
1733 {
1734 if (iommu->domain_ids) {
1735 bitmap_free(iommu->domain_ids);
1736 iommu->domain_ids = NULL;
1737 }
1738
1739 if (iommu->copied_tables) {
1740 bitmap_free(iommu->copied_tables);
1741 iommu->copied_tables = NULL;
1742 }
1743
1744 /* free context mapping */
1745 free_context_table(iommu);
1746
1747 #ifdef CONFIG_INTEL_IOMMU_SVM
1748 if (pasid_supported(iommu)) {
1749 if (ecap_prs(iommu->ecap))
1750 intel_svm_finish_prq(iommu);
1751 }
1752 #endif
1753 }
1754
1755 /*
1756 * Check and return whether first level is used by default for
1757 * DMA translation.
1758 */
1759 static bool first_level_by_default(unsigned int type)
1760 {
1761 /* Only SL is available in legacy mode */
1762 if (!scalable_mode_support())
1763 return false;
1764
1765 /* Only one level (either FL or SL) is available; just use it */
1766 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1767 return intel_cap_flts_sanity();
1768
1769 /* Both levels are available, decide it based on domain type */
1770 return type != IOMMU_DOMAIN_UNMANAGED;
1771 }
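/*
 * Example (illustrative only) for first_level_by_default(): on
 * scalable-mode hardware where every IOMMU supports both first- and
 * second-level translation, DMA API domains default to first-level
 * page tables, while IOMMU_DOMAIN_UNMANAGED domains (e.g. device
 * assignment through VFIO) keep using the second level.
 */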
1772
1773 static struct dmar_domain *alloc_domain(unsigned int type)
1774 {
1775 struct dmar_domain *domain;
1776
1777 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1778 if (!domain)
1779 return NULL;
1780
1781 domain->nid = NUMA_NO_NODE;
1782 if (first_level_by_default(type))
1783 domain->use_first_level = true;
1784 domain->has_iotlb_device = false;
1785 INIT_LIST_HEAD(&domain->devices);
1786 INIT_LIST_HEAD(&domain->dev_pasids);
1787 spin_lock_init(&domain->lock);
1788 xa_init(&domain->iommu_array);
1789
1790 return domain;
1791 }
1792
1793 static int domain_attach_iommu(struct dmar_domain *domain,
1794 struct intel_iommu *iommu)
1795 {
1796 struct iommu_domain_info *info, *curr;
1797 unsigned long ndomains;
1798 int num, ret = -ENOSPC;
1799
1800 info = kzalloc(sizeof(*info), GFP_KERNEL);
1801 if (!info)
1802 return -ENOMEM;
1803
1804 spin_lock(&iommu->lock);
1805 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1806 if (curr) {
1807 curr->refcnt++;
1808 spin_unlock(&iommu->lock);
1809 kfree(info);
1810 return 0;
1811 }
1812
1813 ndomains = cap_ndoms(iommu->cap);
1814 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1815 if (num >= ndomains) {
1816 pr_err("%s: No free domain ids\n", iommu->name);
1817 goto err_unlock;
1818 }
1819
1820 set_bit(num, iommu->domain_ids);
1821 info->refcnt = 1;
1822 info->did = num;
1823 info->iommu = iommu;
1824 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1825 NULL, info, GFP_ATOMIC);
1826 if (curr) {
1827 ret = xa_err(curr) ? : -EBUSY;
1828 goto err_clear;
1829 }
1830 domain_update_iommu_cap(domain);
1831
1832 spin_unlock(&iommu->lock);
1833 return 0;
1834
1835 err_clear:
1836 clear_bit(info->did, iommu->domain_ids);
1837 err_unlock:
1838 spin_unlock(&iommu->lock);
1839 kfree(info);
1840 return ret;
1841 }
1842
1843 static void domain_detach_iommu(struct dmar_domain *domain,
1844 struct intel_iommu *iommu)
1845 {
1846 struct iommu_domain_info *info;
1847
1848 spin_lock(&iommu->lock);
1849 info = xa_load(&domain->iommu_array, iommu->seq_id);
1850 if (--info->refcnt == 0) {
1851 clear_bit(info->did, iommu->domain_ids);
1852 xa_erase(&domain->iommu_array, iommu->seq_id);
1853 domain->nid = NUMA_NO_NODE;
1854 domain_update_iommu_cap(domain);
1855 kfree(info);
1856 }
1857 spin_unlock(&iommu->lock);
1858 }
1859
1860 static inline int guestwidth_to_adjustwidth(int gaw)
1861 {
1862 int agaw;
1863 int r = (gaw - 12) % 9;
1864
1865 if (r == 0)
1866 agaw = gaw;
1867 else
1868 agaw = gaw + 9 - r;
1869 if (agaw > 64)
1870 agaw = 64;
1871 return agaw;
1872 }
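/*
 * Worked example (illustrative only) for guestwidth_to_adjustwidth():
 * a guest address width of 48 gives r = (48 - 12) % 9 = 0, so the
 * adjusted width stays 48; a width of 50 gives r = 2 and is rounded up
 * to 57, the next width reachable with whole 9-bit page-table levels
 * above the 12-bit page offset.
 */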
1873
1874 static void domain_exit(struct dmar_domain *domain)
1875 {
1876 if (domain->pgd) {
1877 LIST_HEAD(freelist);
1878
1879 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1880 put_pages_list(&freelist);
1881 }
1882
1883 if (WARN_ON(!list_empty(&domain->devices)))
1884 return;
1885
1886 kfree(domain);
1887 }
1888
1889 /*
1890 * Get the PASID directory size for a scalable mode context entry.
1891 * A value of X in the PDTS field of a scalable mode context entry
1892 * indicates a PASID directory with 2^(X + 7) entries.
1893 */
1894 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1895 {
1896 unsigned long pds, max_pde;
1897
1898 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1899 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1900 if (pds < 7)
1901 return 0;
1902
1903 return pds - 7;
1904 }
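/*
 * Worked example (illustrative only, assuming PASID_PDE_SHIFT is 6):
 * with table->max_pasid = 1 << 20, max_pde is 1 << 14, so
 * find_first_bit() returns 14 and pds is 7. A PDTS value of 7 encodes
 * a directory of 2^(7 + 7) = 16384 entries, each covering 64 PASIDs,
 * i.e. the full 2^20 PASID space.
 */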
1905
1906 /*
1907 * Set the RID_PASID field of a scalable mode context entry. The
1908 * IOMMU hardware will use the PASID value set in this field for
1909 * DMA translations of DMA requests without PASID.
1910 */
1911 static inline void
1912 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1913 {
1914 context->hi |= pasid & ((1 << 20) - 1);
1915 }
1916
1917 /*
1918 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1919 * entry.
1920 */
1921 static inline void context_set_sm_dte(struct context_entry *context)
1922 {
1923 context->lo |= BIT_ULL(2);
1924 }
1925
1926 /*
1927 * Set the PRE(Page Request Enable) field of a scalable mode context
1928 * entry.
1929 */
1930 static inline void context_set_sm_pre(struct context_entry *context)
1931 {
1932 context->lo |= BIT_ULL(4);
1933 }
1934
1935 /* Convert value to context PASID directory size field coding. */
1936 #define context_pdts(pds) (((pds) & 0x7) << 9)
1937
1938 static int domain_context_mapping_one(struct dmar_domain *domain,
1939 struct intel_iommu *iommu,
1940 struct pasid_table *table,
1941 u8 bus, u8 devfn)
1942 {
1943 struct device_domain_info *info =
1944 domain_lookup_dev_info(domain, iommu, bus, devfn);
1945 u16 did = domain_id_iommu(domain, iommu);
1946 int translation = CONTEXT_TT_MULTI_LEVEL;
1947 struct context_entry *context;
1948 int ret;
1949
1950 if (hw_pass_through && domain_type_is_si(domain))
1951 translation = CONTEXT_TT_PASS_THROUGH;
1952
1953 pr_debug("Set context mapping for %02x:%02x.%d\n",
1954 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1955
1956 spin_lock(&iommu->lock);
1957 ret = -ENOMEM;
1958 context = iommu_context_addr(iommu, bus, devfn, 1);
1959 if (!context)
1960 goto out_unlock;
1961
1962 ret = 0;
1963 if (context_present(context) && !context_copied(iommu, bus, devfn))
1964 goto out_unlock;
1965
1966 /*
1967 * For kdump cases, old valid entries may be cached due to the
1968 * in-flight DMA and copied pgtable, but there is no unmapping
1969 * behaviour for them, thus we need an explicit cache flush for
1970 * the newly-mapped device. For kdump, at this point, the device
1971 * is supposed to have finished reset at its driver probe stage, so no
1972 * in-flight DMA will exist, and we don't need to worry about it
1973 * hereafter.
1974 */
1975 if (context_copied(iommu, bus, devfn)) {
1976 u16 did_old = context_domain_id(context);
1977
1978 if (did_old < cap_ndoms(iommu->cap)) {
1979 iommu->flush.flush_context(iommu, did_old,
1980 (((u16)bus) << 8) | devfn,
1981 DMA_CCMD_MASK_NOBIT,
1982 DMA_CCMD_DEVICE_INVL);
1983 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1984 DMA_TLB_DSI_FLUSH);
1985 }
1986
1987 clear_context_copied(iommu, bus, devfn);
1988 }
1989
1990 context_clear_entry(context);
1991
1992 if (sm_supported(iommu)) {
1993 unsigned long pds;
1994
1995 /* Setup the PASID DIR pointer: */
1996 pds = context_get_sm_pds(table);
1997 context->lo = (u64)virt_to_phys(table->table) |
1998 context_pdts(pds);
1999
2000 /* Setup the RID_PASID field: */
2001 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
2002
2003 /*
2004 * Setup the Device-TLB enable bit and Page request
2005 * Enable bit:
2006 */
2007 if (info && info->ats_supported)
2008 context_set_sm_dte(context);
2009 if (info && info->pri_supported)
2010 context_set_sm_pre(context);
2011 if (info && info->pasid_supported)
2012 context_set_pasid(context);
2013 } else {
2014 struct dma_pte *pgd = domain->pgd;
2015 int agaw;
2016
2017 context_set_domain_id(context, did);
2018
2019 if (translation != CONTEXT_TT_PASS_THROUGH) {
2020 /*
2021 * Skip top levels of page tables for iommu which has
2022 * less agaw than default. Unnecessary for PT mode.
2023 */
2024 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2025 ret = -ENOMEM;
2026 pgd = phys_to_virt(dma_pte_addr(pgd));
2027 if (!dma_pte_present(pgd))
2028 goto out_unlock;
2029 }
2030
2031 if (info && info->ats_supported)
2032 translation = CONTEXT_TT_DEV_IOTLB;
2033 else
2034 translation = CONTEXT_TT_MULTI_LEVEL;
2035
2036 context_set_address_root(context, virt_to_phys(pgd));
2037 context_set_address_width(context, agaw);
2038 } else {
2039 /*
2040 * In pass-through mode, AW must be programmed to
2041 * indicate the largest AGAW value supported by
2042 * hardware, and ASR is ignored by hardware.
2043 */
2044 context_set_address_width(context, iommu->msagaw);
2045 }
2046
2047 context_set_translation_type(context, translation);
2048 }
2049
2050 context_set_fault_enable(context);
2051 context_set_present(context);
2052 if (!ecap_coherent(iommu->ecap))
2053 clflush_cache_range(context, sizeof(*context));
2054
2055 /*
2056 * It's a non-present to present mapping. If hardware doesn't cache
2057 * non-present entries we only need to flush the write-buffer. If it
2058 * _does_ cache non-present entries, then it does so in the special
2059 * domain #0, which we have to flush:
2060 */
2061 if (cap_caching_mode(iommu->cap)) {
2062 iommu->flush.flush_context(iommu, 0,
2063 (((u16)bus) << 8) | devfn,
2064 DMA_CCMD_MASK_NOBIT,
2065 DMA_CCMD_DEVICE_INVL);
2066 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2067 } else {
2068 iommu_flush_write_buffer(iommu);
2069 }
2070
2071 ret = 0;
2072
2073 out_unlock:
2074 spin_unlock(&iommu->lock);
2075
2076 return ret;
2077 }
2078
2079 struct domain_context_mapping_data {
2080 struct dmar_domain *domain;
2081 struct intel_iommu *iommu;
2082 struct pasid_table *table;
2083 };
2084
2085 static int domain_context_mapping_cb(struct pci_dev *pdev,
2086 u16 alias, void *opaque)
2087 {
2088 struct domain_context_mapping_data *data = opaque;
2089
2090 return domain_context_mapping_one(data->domain, data->iommu,
2091 data->table, PCI_BUS_NUM(alias),
2092 alias & 0xff);
2093 }
2094
2095 static int
2096 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2097 {
2098 struct domain_context_mapping_data data;
2099 struct pasid_table *table;
2100 struct intel_iommu *iommu;
2101 u8 bus, devfn;
2102
2103 iommu = device_to_iommu(dev, &bus, &devfn);
2104 if (!iommu)
2105 return -ENODEV;
2106
2107 table = intel_pasid_get_table(dev);
2108
2109 if (!dev_is_pci(dev))
2110 return domain_context_mapping_one(domain, iommu, table,
2111 bus, devfn);
2112
2113 data.domain = domain;
2114 data.iommu = iommu;
2115 data.table = table;
2116
2117 return pci_for_each_dma_alias(to_pci_dev(dev),
2118 &domain_context_mapping_cb, &data);
2119 }
2120
2121 /* Returns the number of VT-d pages, but aligned to the MM page size */
2122 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2123 size_t size)
2124 {
2125 host_addr &= ~PAGE_MASK;
2126 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2127 }
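/*
 * Example (illustrative only, assuming 4KiB MM and VT-d pages): a
 * buffer at host_addr 0x1234 with size 0x3000 extends 0x234 + 0x3000 =
 * 0x3234 bytes from the start of its page, which PAGE_ALIGN() rounds
 * up to 0x4000, i.e. four VT-d pages.
 */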
2128
2129 /* Return largest possible superpage level for a given mapping */
2130 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2131 unsigned long iov_pfn,
2132 unsigned long phy_pfn,
2133 unsigned long pages)
2134 {
2135 int support, level = 1;
2136 unsigned long pfnmerge;
2137
2138 support = domain->iommu_superpage;
2139
2140 /* To use a large page, the virtual *and* physical addresses
2141 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2142 of them will mean we have to use smaller pages. So just
2143 merge them and check both at once. */
2144 pfnmerge = iov_pfn | phy_pfn;
2145
2146 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2147 pages >>= VTD_STRIDE_SHIFT;
2148 if (!pages)
2149 break;
2150 pfnmerge >>= VTD_STRIDE_SHIFT;
2151 level++;
2152 support--;
2153 }
2154 return level;
2155 }
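/*
 * Example (illustrative only): with domain->iommu_superpage == 2 and a
 * mapping whose iov_pfn and phy_pfn are both 2MiB aligned (low 9 bits
 * clear) and which spans at least 512 pages, the first pass bumps
 * level to 2; unless the range is also 1GiB aligned with enough pages
 * left, the loop then stops and 2MiB superpage PTEs are used.
 */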
2156
2157 /*
2158 * Ensure that old small page tables are removed to make room for superpage(s).
2159 * We're going to add new large pages, so make sure we don't remove their parent
2160 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2161 */
2162 static void switch_to_super_page(struct dmar_domain *domain,
2163 unsigned long start_pfn,
2164 unsigned long end_pfn, int level)
2165 {
2166 unsigned long lvl_pages = lvl_to_nr_pages(level);
2167 struct iommu_domain_info *info;
2168 struct dma_pte *pte = NULL;
2169 unsigned long i;
2170
2171 while (start_pfn <= end_pfn) {
2172 if (!pte)
2173 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2174 GFP_ATOMIC);
2175
2176 if (dma_pte_present(pte)) {
2177 dma_pte_free_pagetable(domain, start_pfn,
2178 start_pfn + lvl_pages - 1,
2179 level + 1);
2180
2181 xa_for_each(&domain->iommu_array, i, info)
2182 iommu_flush_iotlb_psi(info->iommu, domain,
2183 start_pfn, lvl_pages,
2184 0, 0);
2185 }
2186
2187 pte++;
2188 start_pfn += lvl_pages;
2189 if (first_pte_in_page(pte))
2190 pte = NULL;
2191 }
2192 }
2193
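/*
 * Map [iov_pfn, iov_pfn + nr_pages) onto [phys_pfn, ...) in @domain,
 * using superpages wherever hardware_largepage_caps() allows. A brief
 * summary of the attribute bits set below: DMA_PTE_READ/WRITE/SNP come
 * straight from @prot, DMA_FL_PTE_PRESENT is always set, and
 * first-level mappings additionally get the XD, US and ACCESS bits
 * (plus DIRTY for writable mappings).
 */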
2194 static int
2195 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2196 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2197 gfp_t gfp)
2198 {
2199 struct dma_pte *first_pte = NULL, *pte = NULL;
2200 unsigned int largepage_lvl = 0;
2201 unsigned long lvl_pages = 0;
2202 phys_addr_t pteval;
2203 u64 attr;
2204
2205 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2206 return -EINVAL;
2207
2208 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2209 return -EINVAL;
2210
2211 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2212 attr |= DMA_FL_PTE_PRESENT;
2213 if (domain->use_first_level) {
2214 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2215 if (prot & DMA_PTE_WRITE)
2216 attr |= DMA_FL_PTE_DIRTY;
2217 }
2218
2219 domain->has_mappings = true;
2220
2221 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2222
2223 while (nr_pages > 0) {
2224 uint64_t tmp;
2225
2226 if (!pte) {
2227 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2228 phys_pfn, nr_pages);
2229
2230 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2231 gfp);
2232 if (!pte)
2233 return -ENOMEM;
2234 first_pte = pte;
2235
2236 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2237
2238 /* It is a large page */
2239 if (largepage_lvl > 1) {
2240 unsigned long end_pfn;
2241 unsigned long pages_to_remove;
2242
2243 pteval |= DMA_PTE_LARGE_PAGE;
2244 pages_to_remove = min_t(unsigned long, nr_pages,
2245 nr_pte_to_next_page(pte) * lvl_pages);
2246 end_pfn = iov_pfn + pages_to_remove - 1;
2247 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2248 } else {
2249 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2250 }
2251
2252 }
2253 /* We don't need a lock here; nobody else
2254 * touches the IOVA range.
2255 */
2256 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2257 if (tmp) {
2258 static int dumps = 5;
2259 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2260 iov_pfn, tmp, (unsigned long long)pteval);
2261 if (dumps) {
2262 dumps--;
2263 debug_dma_dump_mappings(NULL);
2264 }
2265 WARN_ON(1);
2266 }
2267
2268 nr_pages -= lvl_pages;
2269 iov_pfn += lvl_pages;
2270 phys_pfn += lvl_pages;
2271 pteval += lvl_pages * VTD_PAGE_SIZE;
2272
2273 /* If the next PTE would be the first in a new page, then we
2274 * need to flush the cache on the entries we've just written.
2275 * And then we'll need to recalculate 'pte', so clear it and
2276 * let it get set again in the if (!pte) block above.
2277 *
2278 * If we're done (!nr_pages) we need to flush the cache too.
2279 *
2280 * Also if we've been setting superpages, we may need to
2281 * recalculate 'pte' and switch back to smaller pages for the
2282 * end of the mapping, if the trailing size is not enough to
2283 * use another superpage (i.e. nr_pages < lvl_pages).
2284 */
2285 pte++;
2286 if (!nr_pages || first_pte_in_page(pte) ||
2287 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2288 domain_flush_cache(domain, first_pte,
2289 (void *)pte - (void *)first_pte);
2290 pte = NULL;
2291 }
2292 }
2293
2294 return 0;
2295 }
2296
2297 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2298 {
2299 struct intel_iommu *iommu = info->iommu;
2300 struct context_entry *context;
2301 u16 did_old;
2302
2303 if (!iommu)
2304 return;
2305
2306 spin_lock(&iommu->lock);
2307 context = iommu_context_addr(iommu, bus, devfn, 0);
2308 if (!context) {
2309 spin_unlock(&iommu->lock);
2310 return;
2311 }
2312
2313 if (sm_supported(iommu)) {
2314 if (hw_pass_through && domain_type_is_si(info->domain))
2315 did_old = FLPT_DEFAULT_DID;
2316 else
2317 did_old = domain_id_iommu(info->domain, iommu);
2318 } else {
2319 did_old = context_domain_id(context);
2320 }
2321
2322 context_clear_entry(context);
2323 __iommu_flush_cache(iommu, context, sizeof(*context));
2324 spin_unlock(&iommu->lock);
2325 iommu->flush.flush_context(iommu,
2326 did_old,
2327 (((u16)bus) << 8) | devfn,
2328 DMA_CCMD_MASK_NOBIT,
2329 DMA_CCMD_DEVICE_INVL);
2330
2331 if (sm_supported(iommu))
2332 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2333
2334 iommu->flush.flush_iotlb(iommu,
2335 did_old,
2336 0,
2337 0,
2338 DMA_TLB_DSI_FLUSH);
2339
2340 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2341 }
2342
2343 static int domain_setup_first_level(struct intel_iommu *iommu,
2344 struct dmar_domain *domain,
2345 struct device *dev,
2346 u32 pasid)
2347 {
2348 struct dma_pte *pgd = domain->pgd;
2349 int agaw, level;
2350 int flags = 0;
2351
2352 /*
2353 * Skip top levels of page tables for iommu which has
2354 * less agaw than default. Unnecessary for PT mode.
2355 */
2356 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2357 pgd = phys_to_virt(dma_pte_addr(pgd));
2358 if (!dma_pte_present(pgd))
2359 return -ENOMEM;
2360 }
2361
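	/*
	 * First-level translation reuses the CPU paging formats, so only
	 * 4-level (48-bit) and 5-level (57-bit) page tables are accepted
	 * here; 5-level additionally requires PASID_FLAG_FL5LP below.
	 */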
2362 level = agaw_to_level(agaw);
2363 if (level != 4 && level != 5)
2364 return -EINVAL;
2365
2366 if (level == 5)
2367 flags |= PASID_FLAG_FL5LP;
2368
2369 if (domain->force_snooping)
2370 flags |= PASID_FLAG_PAGE_SNOOP;
2371
2372 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2373 domain_id_iommu(domain, iommu),
2374 flags);
2375 }
2376
2377 static bool dev_is_real_dma_subdevice(struct device *dev)
2378 {
2379 return dev && dev_is_pci(dev) &&
2380 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2381 }
2382
2383 static int iommu_domain_identity_map(struct dmar_domain *domain,
2384 unsigned long first_vpfn,
2385 unsigned long last_vpfn)
2386 {
2387 /*
2388 * The RMRR range might overlap with the physical memory range,
2389 * so clear it first.
2390 */
2391 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2392
2393 return __domain_mapping(domain, first_vpfn,
2394 first_vpfn, last_vpfn - first_vpfn + 1,
2395 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2396 }
2397
2398 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2399
2400 static int __init si_domain_init(int hw)
2401 {
2402 struct dmar_rmrr_unit *rmrr;
2403 struct device *dev;
2404 int i, nid, ret;
2405
2406 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2407 if (!si_domain)
2408 return -EFAULT;
2409
2410 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2411 domain_exit(si_domain);
2412 si_domain = NULL;
2413 return -EFAULT;
2414 }
2415
2416 if (hw)
2417 return 0;
2418
2419 for_each_online_node(nid) {
2420 unsigned long start_pfn, end_pfn;
2421 int i;
2422
2423 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2424 ret = iommu_domain_identity_map(si_domain,
2425 mm_to_dma_pfn_start(start_pfn),
2426 mm_to_dma_pfn_end(end_pfn-1));
2427 if (ret)
2428 return ret;
2429 }
2430 }
2431
2432 /*
2433 * Identity map the RMRRs so that devices with RMRRs can also use
2434 * the si_domain.
2435 */
2436 for_each_rmrr_units(rmrr) {
2437 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2438 i, dev) {
2439 unsigned long long start = rmrr->base_address;
2440 unsigned long long end = rmrr->end_address;
2441
2442 if (WARN_ON(end < start ||
2443 end >> agaw_to_width(si_domain->agaw)))
2444 continue;
2445
2446 ret = iommu_domain_identity_map(si_domain,
2447 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2448 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2449 if (ret)
2450 return ret;
2451 }
2452 }
2453
2454 return 0;
2455 }
2456
2457 static int dmar_domain_attach_device(struct dmar_domain *domain,
2458 struct device *dev)
2459 {
2460 struct device_domain_info *info = dev_iommu_priv_get(dev);
2461 struct intel_iommu *iommu;
2462 unsigned long flags;
2463 u8 bus, devfn;
2464 int ret;
2465
2466 iommu = device_to_iommu(dev, &bus, &devfn);
2467 if (!iommu)
2468 return -ENODEV;
2469
2470 ret = domain_attach_iommu(domain, iommu);
2471 if (ret)
2472 return ret;
2473 info->domain = domain;
2474 spin_lock_irqsave(&domain->lock, flags);
2475 list_add(&info->link, &domain->devices);
2476 spin_unlock_irqrestore(&domain->lock, flags);
2477
2478 /* PASID table is mandatory for a PCI device in scalable mode. */
2479 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2480 /* Setup the PASID entry for requests without PASID: */
2481 if (hw_pass_through && domain_type_is_si(domain))
2482 ret = intel_pasid_setup_pass_through(iommu, domain,
2483 dev, IOMMU_NO_PASID);
2484 else if (domain->use_first_level)
2485 ret = domain_setup_first_level(iommu, domain, dev,
2486 IOMMU_NO_PASID);
2487 else
2488 ret = intel_pasid_setup_second_level(iommu, domain,
2489 dev, IOMMU_NO_PASID);
2490 if (ret) {
2491 dev_err(dev, "Setup RID2PASID failed\n");
2492 device_block_translation(dev);
2493 return ret;
2494 }
2495 }
2496
2497 ret = domain_context_mapping(domain, dev);
2498 if (ret) {
2499 dev_err(dev, "Domain context map failed\n");
2500 device_block_translation(dev);
2501 return ret;
2502 }
2503
2504 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2505 iommu_enable_pci_caps(info);
2506
2507 return 0;
2508 }
2509
2510 /**
2511 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2512 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2513 * @dev: device handle
2514 *
2515 * We assume that PCI USB devices with RMRRs have them largely
2516 * for historical reasons and that the RMRR space is not actively used post
2517 * boot. This exclusion may change if vendors begin to abuse it.
2518 *
2519 * The same exception is made for graphics devices, with the requirement that
2520 * any use of the RMRR regions will be torn down before assigning the device
2521 * to a guest.
2522 *
2523 * Return: true if the RMRR is relaxable, false otherwise
2524 */
2525 static bool device_rmrr_is_relaxable(struct device *dev)
2526 {
2527 struct pci_dev *pdev;
2528
2529 if (!dev_is_pci(dev))
2530 return false;
2531
2532 pdev = to_pci_dev(dev);
2533 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2534 return true;
2535 else
2536 return false;
2537 }
2538
2539 /*
2540 * Return the required default domain type for a specific device.
2541 *
2542 * @dev: the device in query
2544 *
2545 * Returns:
2546 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2547 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2548 * - 0: both identity and dynamic domains work for this device
2549 */
2550 static int device_def_domain_type(struct device *dev)
2551 {
2552 if (dev_is_pci(dev)) {
2553 struct pci_dev *pdev = to_pci_dev(dev);
2554
2555 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2556 return IOMMU_DOMAIN_IDENTITY;
2557
2558 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2559 return IOMMU_DOMAIN_IDENTITY;
2560 }
2561
2562 return 0;
2563 }
2564
2565 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2566 {
2567 /*
2568 * Start from a sane IOMMU hardware state.
2569 * If queued invalidation was already initialized by us
2570 * (for example, while enabling interrupt remapping) then
2571 * things are already rolling from a sane state.
2572 */
2573 if (!iommu->qi) {
2574 /*
2575 * Clear any previous faults.
2576 */
2577 dmar_fault(-1, iommu);
2578 /*
2579 * Disable queued invalidation if supported and already enabled
2580 * before OS handover.
2581 */
2582 dmar_disable_qi(iommu);
2583 }
2584
2585 if (dmar_enable_qi(iommu)) {
2586 /*
2587 * Queued Invalidate not enabled, use Register Based Invalidate
2588 */
2589 iommu->flush.flush_context = __iommu_flush_context;
2590 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2591 pr_info("%s: Using Register based invalidation\n",
2592 iommu->name);
2593 } else {
2594 iommu->flush.flush_context = qi_flush_context;
2595 iommu->flush.flush_iotlb = qi_flush_iotlb;
2596 pr_info("%s: Using Queued invalidation\n", iommu->name);
2597 }
2598 }
2599
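/*
 * Copy one bus worth of context entries from the old kernel's root
 * entry (kdump case). With the extended/scalable-mode layout assumed
 * here, a context entry is 256 bits, so each 4KiB table holds only 128
 * of them: devfn 0x00-0x7f sit behind the lower context-table pointer
 * and devfn 0x80-0xff behind the upper one, which is why both the
 * table index and the in-table index are doubled when 'ext' is set.
 */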
2600 static int copy_context_table(struct intel_iommu *iommu,
2601 struct root_entry *old_re,
2602 struct context_entry **tbl,
2603 int bus, bool ext)
2604 {
2605 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2606 struct context_entry *new_ce = NULL, ce;
2607 struct context_entry *old_ce = NULL;
2608 struct root_entry re;
2609 phys_addr_t old_ce_phys;
2610
2611 tbl_idx = ext ? bus * 2 : bus;
2612 memcpy(&re, old_re, sizeof(re));
2613
2614 for (devfn = 0; devfn < 256; devfn++) {
2615 /* First calculate the correct index */
2616 idx = (ext ? devfn * 2 : devfn) % 256;
2617
2618 if (idx == 0) {
2619 /* First save what we may have and clean up */
2620 if (new_ce) {
2621 tbl[tbl_idx] = new_ce;
2622 __iommu_flush_cache(iommu, new_ce,
2623 VTD_PAGE_SIZE);
2624 pos = 1;
2625 }
2626
2627 if (old_ce)
2628 memunmap(old_ce);
2629
2630 ret = 0;
2631 if (devfn < 0x80)
2632 old_ce_phys = root_entry_lctp(&re);
2633 else
2634 old_ce_phys = root_entry_uctp(&re);
2635
2636 if (!old_ce_phys) {
2637 if (ext && devfn == 0) {
2638 /* No LCTP, try UCTP */
2639 devfn = 0x7f;
2640 continue;
2641 } else {
2642 goto out;
2643 }
2644 }
2645
2646 ret = -ENOMEM;
2647 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2648 MEMREMAP_WB);
2649 if (!old_ce)
2650 goto out;
2651
2652 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2653 if (!new_ce)
2654 goto out_unmap;
2655
2656 ret = 0;
2657 }
2658
2659 /* Now copy the context entry */
2660 memcpy(&ce, old_ce + idx, sizeof(ce));
2661
2662 if (!context_present(&ce))
2663 continue;
2664
2665 did = context_domain_id(&ce);
2666 if (did >= 0 && did < cap_ndoms(iommu->cap))
2667 set_bit(did, iommu->domain_ids);
2668
2669 set_context_copied(iommu, bus, devfn);
2670 new_ce[idx] = ce;
2671 }
2672
2673 tbl[tbl_idx + pos] = new_ce;
2674
2675 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2676
2677 out_unmap:
2678 memunmap(old_ce);
2679
2680 out:
2681 return ret;
2682 }
2683
2684 static int copy_translation_tables(struct intel_iommu *iommu)
2685 {
2686 struct context_entry **ctxt_tbls;
2687 struct root_entry *old_rt;
2688 phys_addr_t old_rt_phys;
2689 int ctxt_table_entries;
2690 u64 rtaddr_reg;
2691 int bus, ret;
2692 bool new_ext, ext;
2693
2694 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2695 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2696 new_ext = !!sm_supported(iommu);
2697
2698 /*
2699 * The RTT bit can only be changed when translation is disabled,
2700 * but disabling translation would open a window for data
2701 * corruption. So bail out and don't copy anything if we would
2702 * have to change the bit.
2703 */
2704 if (new_ext != ext)
2705 return -EINVAL;
2706
2707 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2708 if (!iommu->copied_tables)
2709 return -ENOMEM;
2710
2711 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2712 if (!old_rt_phys)
2713 return -EINVAL;
2714
2715 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2716 if (!old_rt)
2717 return -ENOMEM;
2718
2719 /* This is too big for the stack - allocate it from slab */
2720 ctxt_table_entries = ext ? 512 : 256;
2721 ret = -ENOMEM;
2722 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2723 if (!ctxt_tbls)
2724 goto out_unmap;
2725
2726 for (bus = 0; bus < 256; bus++) {
2727 ret = copy_context_table(iommu, &old_rt[bus],
2728 ctxt_tbls, bus, ext);
2729 if (ret) {
2730 pr_err("%s: Failed to copy context table for bus %d\n",
2731 iommu->name, bus);
2732 continue;
2733 }
2734 }
2735
2736 spin_lock(&iommu->lock);
2737
2738 /* Context tables are copied, now write them to the root_entry table */
2739 for (bus = 0; bus < 256; bus++) {
2740 int idx = ext ? bus * 2 : bus;
2741 u64 val;
2742
2743 if (ctxt_tbls[idx]) {
2744 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2745 iommu->root_entry[bus].lo = val;
2746 }
2747
2748 if (!ext || !ctxt_tbls[idx + 1])
2749 continue;
2750
2751 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2752 iommu->root_entry[bus].hi = val;
2753 }
2754
2755 spin_unlock(&iommu->lock);
2756
2757 kfree(ctxt_tbls);
2758
2759 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2760
2761 ret = 0;
2762
2763 out_unmap:
2764 memunmap(old_rt);
2765
2766 return ret;
2767 }
2768
2769 static int __init init_dmars(void)
2770 {
2771 struct dmar_drhd_unit *drhd;
2772 struct intel_iommu *iommu;
2773 int ret;
2774
2775 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2776 if (ret)
2777 goto free_iommu;
2778
2779 for_each_iommu(iommu, drhd) {
2780 if (drhd->ignored) {
2781 iommu_disable_translation(iommu);
2782 continue;
2783 }
2784
2785 /*
2786 * Find the max PASID size of all IOMMUs in the system.
2787 * We need to ensure the system PASID table is no bigger
2788 * than the smallest supported size.
2789 */
2790 if (pasid_supported(iommu)) {
2791 u32 temp = 2 << ecap_pss(iommu->ecap);
2792
2793 intel_pasid_max_id = min_t(u32, temp,
2794 intel_pasid_max_id);
2795 }
2796
2797 intel_iommu_init_qi(iommu);
2798
2799 ret = iommu_init_domains(iommu);
2800 if (ret)
2801 goto free_iommu;
2802
2803 init_translation_status(iommu);
2804
2805 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2806 iommu_disable_translation(iommu);
2807 clear_translation_pre_enabled(iommu);
2808 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2809 iommu->name);
2810 }
2811
2812 /*
2813 * TBD:
2814 * we could share the same root & context tables
2815 * among all IOMMUs. Need to split it later.
2816 */
2817 ret = iommu_alloc_root_entry(iommu);
2818 if (ret)
2819 goto free_iommu;
2820
2821 if (translation_pre_enabled(iommu)) {
2822 pr_info("Translation already enabled - trying to copy translation structures\n");
2823
2824 ret = copy_translation_tables(iommu);
2825 if (ret) {
2826 /*
2827 * We found the IOMMU with translation
2828 * enabled - but failed to copy over the
2829 * old root-entry table. Try to proceed
2830 * by disabling translation now and
2831 * allocating a clean root-entry table.
2832 * This might cause DMAR faults, but
2833 * probably the dump will still succeed.
2834 */
2835 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2836 iommu->name);
2837 iommu_disable_translation(iommu);
2838 clear_translation_pre_enabled(iommu);
2839 } else {
2840 pr_info("Copied translation tables from previous kernel for %s\n",
2841 iommu->name);
2842 }
2843 }
2844
2845 if (!ecap_pass_through(iommu->ecap))
2846 hw_pass_through = 0;
2847 intel_svm_check(iommu);
2848 }
2849
2850 /*
2851 * Now that qi is enabled on all iommus, set the root entry and flush
2852 * caches. This is required on some Intel X58 chipsets, otherwise the
2853 * flush_context function will loop forever and the boot hangs.
2854 */
2855 for_each_active_iommu(iommu, drhd) {
2856 iommu_flush_write_buffer(iommu);
2857 iommu_set_root_entry(iommu);
2858 }
2859
2860 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2861 dmar_map_gfx = 0;
2862 #endif
2863
2864 if (!dmar_map_gfx)
2865 iommu_identity_mapping |= IDENTMAP_GFX;
2866
2867 check_tylersburg_isoch();
2868
2869 ret = si_domain_init(hw_pass_through);
2870 if (ret)
2871 goto free_iommu;
2872
2873 /*
2874 * for each drhd
2875 * enable fault log
2876 * global invalidate context cache
2877 * global invalidate iotlb
2878 * enable translation
2879 */
2880 for_each_iommu(iommu, drhd) {
2881 if (drhd->ignored) {
2882 /*
2883 * we always have to disable PMRs or DMA may fail on
2884 * this device
2885 */
2886 if (force_on)
2887 iommu_disable_protect_mem_regions(iommu);
2888 continue;
2889 }
2890
2891 iommu_flush_write_buffer(iommu);
2892
2893 #ifdef CONFIG_INTEL_IOMMU_SVM
2894 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2895 /*
2896 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2897 * could cause a lock race condition.
2898 */
2899 up_write(&dmar_global_lock);
2900 ret = intel_svm_enable_prq(iommu);
2901 down_write(&dmar_global_lock);
2902 if (ret)
2903 goto free_iommu;
2904 }
2905 #endif
2906 ret = dmar_set_interrupt(iommu);
2907 if (ret)
2908 goto free_iommu;
2909 }
2910
2911 return 0;
2912
2913 free_iommu:
2914 for_each_active_iommu(iommu, drhd) {
2915 disable_dmar_iommu(iommu);
2916 free_dmar_iommu(iommu);
2917 }
2918 if (si_domain) {
2919 domain_exit(si_domain);
2920 si_domain = NULL;
2921 }
2922
2923 return ret;
2924 }
2925
2926 static void __init init_no_remapping_devices(void)
2927 {
2928 struct dmar_drhd_unit *drhd;
2929 struct device *dev;
2930 int i;
2931
2932 for_each_drhd_unit(drhd) {
2933 if (!drhd->include_all) {
2934 for_each_active_dev_scope(drhd->devices,
2935 drhd->devices_cnt, i, dev)
2936 break;
2937 /* ignore DMAR unit if no devices exist */
2938 if (i == drhd->devices_cnt)
2939 drhd->ignored = 1;
2940 }
2941 }
2942
2943 for_each_active_drhd_unit(drhd) {
2944 if (drhd->include_all)
2945 continue;
2946
2947 for_each_active_dev_scope(drhd->devices,
2948 drhd->devices_cnt, i, dev)
2949 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2950 break;
2951 if (i < drhd->devices_cnt)
2952 continue;
2953
2954 /* This IOMMU has *only* gfx devices. Either bypass it or
2955 set the gfx_dedicated flag, as appropriate */
2956 drhd->gfx_dedicated = 1;
2957 if (!dmar_map_gfx)
2958 drhd->ignored = 1;
2959 }
2960 }
2961
2962 #ifdef CONFIG_SUSPEND
2963 static int init_iommu_hw(void)
2964 {
2965 struct dmar_drhd_unit *drhd;
2966 struct intel_iommu *iommu = NULL;
2967 int ret;
2968
2969 for_each_active_iommu(iommu, drhd) {
2970 if (iommu->qi) {
2971 ret = dmar_reenable_qi(iommu);
2972 if (ret)
2973 return ret;
2974 }
2975 }
2976
2977 for_each_iommu(iommu, drhd) {
2978 if (drhd->ignored) {
2979 /*
2980 * we always have to disable PMRs or DMA may fail on
2981 * this device
2982 */
2983 if (force_on)
2984 iommu_disable_protect_mem_regions(iommu);
2985 continue;
2986 }
2987
2988 iommu_flush_write_buffer(iommu);
2989 iommu_set_root_entry(iommu);
2990 iommu_enable_translation(iommu);
2991 iommu_disable_protect_mem_regions(iommu);
2992 }
2993
2994 return 0;
2995 }
2996
2997 static void iommu_flush_all(void)
2998 {
2999 struct dmar_drhd_unit *drhd;
3000 struct intel_iommu *iommu;
3001
3002 for_each_active_iommu(iommu, drhd) {
3003 iommu->flush.flush_context(iommu, 0, 0, 0,
3004 DMA_CCMD_GLOBAL_INVL);
3005 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3006 DMA_TLB_GLOBAL_FLUSH);
3007 }
3008 }
3009
3010 static int iommu_suspend(void)
3011 {
3012 struct dmar_drhd_unit *drhd;
3013 struct intel_iommu *iommu = NULL;
3014 unsigned long flag;
3015
3016 iommu_flush_all();
3017
3018 for_each_active_iommu(iommu, drhd) {
3019 iommu_disable_translation(iommu);
3020
3021 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3022
3023 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3024 readl(iommu->reg + DMAR_FECTL_REG);
3025 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3026 readl(iommu->reg + DMAR_FEDATA_REG);
3027 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3028 readl(iommu->reg + DMAR_FEADDR_REG);
3029 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3030 readl(iommu->reg + DMAR_FEUADDR_REG);
3031
3032 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3033 }
3034 return 0;
3035 }
3036
3037 static void iommu_resume(void)
3038 {
3039 struct dmar_drhd_unit *drhd;
3040 struct intel_iommu *iommu = NULL;
3041 unsigned long flag;
3042
3043 if (init_iommu_hw()) {
3044 if (force_on)
3045 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3046 else
3047 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3048 return;
3049 }
3050
3051 for_each_active_iommu(iommu, drhd) {
3052
3053 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3054
3055 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3056 iommu->reg + DMAR_FECTL_REG);
3057 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3058 iommu->reg + DMAR_FEDATA_REG);
3059 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3060 iommu->reg + DMAR_FEADDR_REG);
3061 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3062 iommu->reg + DMAR_FEUADDR_REG);
3063
3064 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3065 }
3066 }
3067
3068 static struct syscore_ops iommu_syscore_ops = {
3069 .resume = iommu_resume,
3070 .suspend = iommu_suspend,
3071 };
3072
3073 static void __init init_iommu_pm_ops(void)
3074 {
3075 register_syscore_ops(&iommu_syscore_ops);
3076 }
3077
3078 #else
3079 static inline void init_iommu_pm_ops(void) {}
3080 #endif /* CONFIG_SUSPEND */
3081
3082 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3083 {
3084 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3085 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3086 rmrr->end_address <= rmrr->base_address ||
3087 arch_rmrr_sanity_check(rmrr))
3088 return -EINVAL;
3089
3090 return 0;
3091 }
3092
3093 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3094 {
3095 struct acpi_dmar_reserved_memory *rmrr;
3096 struct dmar_rmrr_unit *rmrru;
3097
3098 rmrr = (struct acpi_dmar_reserved_memory *)header;
3099 if (rmrr_sanity_check(rmrr)) {
3100 pr_warn(FW_BUG
3101 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3102 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3103 rmrr->base_address, rmrr->end_address,
3104 dmi_get_system_info(DMI_BIOS_VENDOR),
3105 dmi_get_system_info(DMI_BIOS_VERSION),
3106 dmi_get_system_info(DMI_PRODUCT_VERSION));
3107 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3108 }
3109
3110 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3111 if (!rmrru)
3112 goto out;
3113
3114 rmrru->hdr = header;
3115
3116 rmrru->base_address = rmrr->base_address;
3117 rmrru->end_address = rmrr->end_address;
3118
3119 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3120 ((void *)rmrr) + rmrr->header.length,
3121 &rmrru->devices_cnt);
3122 if (rmrru->devices_cnt && rmrru->devices == NULL)
3123 goto free_rmrru;
3124
3125 list_add(&rmrru->list, &dmar_rmrr_units);
3126
3127 return 0;
3128 free_rmrru:
3129 kfree(rmrru);
3130 out:
3131 return -ENOMEM;
3132 }
3133
3134 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3135 {
3136 struct dmar_atsr_unit *atsru;
3137 struct acpi_dmar_atsr *tmp;
3138
3139 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3140 dmar_rcu_check()) {
3141 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3142 if (atsr->segment != tmp->segment)
3143 continue;
3144 if (atsr->header.length != tmp->header.length)
3145 continue;
3146 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3147 return atsru;
3148 }
3149
3150 return NULL;
3151 }
3152
3153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3154 {
3155 struct acpi_dmar_atsr *atsr;
3156 struct dmar_atsr_unit *atsru;
3157
3158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3159 return 0;
3160
3161 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3162 atsru = dmar_find_atsr(atsr);
3163 if (atsru)
3164 return 0;
3165
3166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3167 if (!atsru)
3168 return -ENOMEM;
3169
3170 /*
3171 * If memory is allocated from slab by ACPI _DSM method, we need to
3172 * copy the memory content because the memory buffer will be freed
3173 * on return.
3174 */
3175 atsru->hdr = (void *)(atsru + 1);
3176 memcpy(atsru->hdr, hdr, hdr->length);
3177 atsru->include_all = atsr->flags & 0x1;
3178 if (!atsru->include_all) {
3179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3180 (void *)atsr + atsr->header.length,
3181 &atsru->devices_cnt);
3182 if (atsru->devices_cnt && atsru->devices == NULL) {
3183 kfree(atsru);
3184 return -ENOMEM;
3185 }
3186 }
3187
3188 list_add_rcu(&atsru->list, &dmar_atsr_units);
3189
3190 return 0;
3191 }
3192
3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3194 {
3195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3196 kfree(atsru);
3197 }
3198
3199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3200 {
3201 struct acpi_dmar_atsr *atsr;
3202 struct dmar_atsr_unit *atsru;
3203
3204 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3205 atsru = dmar_find_atsr(atsr);
3206 if (atsru) {
3207 list_del_rcu(&atsru->list);
3208 synchronize_rcu();
3209 intel_iommu_free_atsr(atsru);
3210 }
3211
3212 return 0;
3213 }
3214
3215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3216 {
3217 int i;
3218 struct device *dev;
3219 struct acpi_dmar_atsr *atsr;
3220 struct dmar_atsr_unit *atsru;
3221
3222 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3223 atsru = dmar_find_atsr(atsr);
3224 if (!atsru)
3225 return 0;
3226
3227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3229 i, dev)
3230 return -EBUSY;
3231 }
3232
3233 return 0;
3234 }
3235
3236 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3237 {
3238 struct dmar_satc_unit *satcu;
3239 struct acpi_dmar_satc *tmp;
3240
3241 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3242 dmar_rcu_check()) {
3243 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3244 if (satc->segment != tmp->segment)
3245 continue;
3246 if (satc->header.length != tmp->header.length)
3247 continue;
3248 if (memcmp(satc, tmp, satc->header.length) == 0)
3249 return satcu;
3250 }
3251
3252 return NULL;
3253 }
3254
3255 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3256 {
3257 struct acpi_dmar_satc *satc;
3258 struct dmar_satc_unit *satcu;
3259
3260 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3261 return 0;
3262
3263 satc = container_of(hdr, struct acpi_dmar_satc, header);
3264 satcu = dmar_find_satc(satc);
3265 if (satcu)
3266 return 0;
3267
3268 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3269 if (!satcu)
3270 return -ENOMEM;
3271
3272 satcu->hdr = (void *)(satcu + 1);
3273 memcpy(satcu->hdr, hdr, hdr->length);
3274 satcu->atc_required = satc->flags & 0x1;
3275 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3276 (void *)satc + satc->header.length,
3277 &satcu->devices_cnt);
3278 if (satcu->devices_cnt && !satcu->devices) {
3279 kfree(satcu);
3280 return -ENOMEM;
3281 }
3282 list_add_rcu(&satcu->list, &dmar_satc_units);
3283
3284 return 0;
3285 }
3286
3287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3288 {
3289 int sp, ret;
3290 struct intel_iommu *iommu = dmaru->iommu;
3291
3292 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3293 if (ret)
3294 goto out;
3295
3296 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3297 pr_warn("%s: Doesn't support hardware pass through.\n",
3298 iommu->name);
3299 return -ENXIO;
3300 }
3301
3302 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3303 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3304 pr_warn("%s: Doesn't support large page.\n",
3305 iommu->name);
3306 return -ENXIO;
3307 }
3308
3309 /*
3310 * Disable translation if already enabled prior to OS handover.
3311 */
3312 if (iommu->gcmd & DMA_GCMD_TE)
3313 iommu_disable_translation(iommu);
3314
3315 ret = iommu_init_domains(iommu);
3316 if (ret == 0)
3317 ret = iommu_alloc_root_entry(iommu);
3318 if (ret)
3319 goto out;
3320
3321 intel_svm_check(iommu);
3322
3323 if (dmaru->ignored) {
3324 /*
3325 * we always have to disable PMRs or DMA may fail on this device
3326 */
3327 if (force_on)
3328 iommu_disable_protect_mem_regions(iommu);
3329 return 0;
3330 }
3331
3332 intel_iommu_init_qi(iommu);
3333 iommu_flush_write_buffer(iommu);
3334
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337 ret = intel_svm_enable_prq(iommu);
3338 if (ret)
3339 goto disable_iommu;
3340 }
3341 #endif
3342 ret = dmar_set_interrupt(iommu);
3343 if (ret)
3344 goto disable_iommu;
3345
3346 iommu_set_root_entry(iommu);
3347 iommu_enable_translation(iommu);
3348
3349 iommu_disable_protect_mem_regions(iommu);
3350 return 0;
3351
3352 disable_iommu:
3353 disable_dmar_iommu(iommu);
3354 out:
3355 free_dmar_iommu(iommu);
3356 return ret;
3357 }
3358
3359 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3360 {
3361 int ret = 0;
3362 struct intel_iommu *iommu = dmaru->iommu;
3363
3364 if (!intel_iommu_enabled)
3365 return 0;
3366 if (iommu == NULL)
3367 return -EINVAL;
3368
3369 if (insert) {
3370 ret = intel_iommu_add(dmaru);
3371 } else {
3372 disable_dmar_iommu(iommu);
3373 free_dmar_iommu(iommu);
3374 }
3375
3376 return ret;
3377 }
3378
3379 static void intel_iommu_free_dmars(void)
3380 {
3381 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3382 struct dmar_atsr_unit *atsru, *atsr_n;
3383 struct dmar_satc_unit *satcu, *satc_n;
3384
3385 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3386 list_del(&rmrru->list);
3387 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3388 kfree(rmrru);
3389 }
3390
3391 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3392 list_del(&atsru->list);
3393 intel_iommu_free_atsr(atsru);
3394 }
3395 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3396 list_del(&satcu->list);
3397 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3398 kfree(satcu);
3399 }
3400 }
3401
3402 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3403 {
3404 struct dmar_satc_unit *satcu;
3405 struct acpi_dmar_satc *satc;
3406 struct device *tmp;
3407 int i;
3408
3409 dev = pci_physfn(dev);
3410 rcu_read_lock();
3411
3412 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3413 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3414 if (satc->segment != pci_domain_nr(dev->bus))
3415 continue;
3416 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3417 if (to_pci_dev(tmp) == dev)
3418 goto out;
3419 }
3420 satcu = NULL;
3421 out:
3422 rcu_read_unlock();
3423 return satcu;
3424 }
3425
3426 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3427 {
3428 int i, ret = 1;
3429 struct pci_bus *bus;
3430 struct pci_dev *bridge = NULL;
3431 struct device *tmp;
3432 struct acpi_dmar_atsr *atsr;
3433 struct dmar_atsr_unit *atsru;
3434 struct dmar_satc_unit *satcu;
3435
3436 dev = pci_physfn(dev);
3437 satcu = dmar_find_matched_satc_unit(dev);
3438 if (satcu)
3439 /*
3440 * This device supports ATS as it is in the SATC table.
3441 * When the IOMMU is in legacy mode, enabling ATS is done
3442 * automatically by HW for devices that require ATS;
3443 * hence the OS should not enable ATS on this device,
3444 * to avoid duplicated TLB invalidation.
3445 */
3446 return !(satcu->atc_required && !sm_supported(iommu));
3447
3448 for (bus = dev->bus; bus; bus = bus->parent) {
3449 bridge = bus->self;
3450 /* If it's an integrated device, allow ATS */
3451 if (!bridge)
3452 return 1;
3453 /* Connected via non-PCIe: no ATS */
3454 if (!pci_is_pcie(bridge) ||
3455 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3456 return 0;
3457 /* If we found the root port, look it up in the ATSR */
3458 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3459 break;
3460 }
3461
3462 rcu_read_lock();
3463 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3464 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3465 if (atsr->segment != pci_domain_nr(dev->bus))
3466 continue;
3467
3468 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3469 if (tmp == &bridge->dev)
3470 goto out;
3471
3472 if (atsru->include_all)
3473 goto out;
3474 }
3475 ret = 0;
3476 out:
3477 rcu_read_unlock();
3478
3479 return ret;
3480 }
3481
3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3483 {
3484 int ret;
3485 struct dmar_rmrr_unit *rmrru;
3486 struct dmar_atsr_unit *atsru;
3487 struct dmar_satc_unit *satcu;
3488 struct acpi_dmar_atsr *atsr;
3489 struct acpi_dmar_reserved_memory *rmrr;
3490 struct acpi_dmar_satc *satc;
3491
3492 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3493 return 0;
3494
3495 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3496 rmrr = container_of(rmrru->hdr,
3497 struct acpi_dmar_reserved_memory, header);
3498 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3499 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3500 ((void *)rmrr) + rmrr->header.length,
3501 rmrr->segment, rmrru->devices,
3502 rmrru->devices_cnt);
3503 if (ret < 0)
3504 return ret;
3505 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3506 dmar_remove_dev_scope(info, rmrr->segment,
3507 rmrru->devices, rmrru->devices_cnt);
3508 }
3509 }
3510
3511 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3512 if (atsru->include_all)
3513 continue;
3514
3515 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3516 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3517 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3518 (void *)atsr + atsr->header.length,
3519 atsr->segment, atsru->devices,
3520 atsru->devices_cnt);
3521 if (ret > 0)
3522 break;
3523 else if (ret < 0)
3524 return ret;
3525 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3526 if (dmar_remove_dev_scope(info, atsr->segment,
3527 atsru->devices, atsru->devices_cnt))
3528 break;
3529 }
3530 }
3531 list_for_each_entry(satcu, &dmar_satc_units, list) {
3532 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3533 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3534 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3535 (void *)satc + satc->header.length,
3536 satc->segment, satcu->devices,
3537 satcu->devices_cnt);
3538 if (ret > 0)
3539 break;
3540 else if (ret < 0)
3541 return ret;
3542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3543 if (dmar_remove_dev_scope(info, satc->segment,
3544 satcu->devices, satcu->devices_cnt))
3545 break;
3546 }
3547 }
3548
3549 return 0;
3550 }
3551
3552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3553 unsigned long val, void *v)
3554 {
3555 struct memory_notify *mhp = v;
3556 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3557 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3558 mhp->nr_pages - 1);
3559
3560 switch (val) {
3561 case MEM_GOING_ONLINE:
3562 if (iommu_domain_identity_map(si_domain,
3563 start_vpfn, last_vpfn)) {
3564 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3565 start_vpfn, last_vpfn);
3566 return NOTIFY_BAD;
3567 }
3568 break;
3569
3570 case MEM_OFFLINE:
3571 case MEM_CANCEL_ONLINE:
3572 {
3573 struct dmar_drhd_unit *drhd;
3574 struct intel_iommu *iommu;
3575 LIST_HEAD(freelist);
3576
3577 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3578
3579 rcu_read_lock();
3580 for_each_active_iommu(iommu, drhd)
3581 iommu_flush_iotlb_psi(iommu, si_domain,
3582 start_vpfn, mhp->nr_pages,
3583 list_empty(&freelist), 0);
3584 rcu_read_unlock();
3585 put_pages_list(&freelist);
3586 }
3587 break;
3588 }
3589
3590 return NOTIFY_OK;
3591 }
3592
3593 static struct notifier_block intel_iommu_memory_nb = {
3594 .notifier_call = intel_iommu_memory_notifier,
3595 .priority = 0
3596 };
3597
3598 static void intel_disable_iommus(void)
3599 {
3600 struct intel_iommu *iommu = NULL;
3601 struct dmar_drhd_unit *drhd;
3602
3603 for_each_iommu(iommu, drhd)
3604 iommu_disable_translation(iommu);
3605 }
3606
3607 void intel_iommu_shutdown(void)
3608 {
3609 struct dmar_drhd_unit *drhd;
3610 struct intel_iommu *iommu = NULL;
3611
3612 if (no_iommu || dmar_disabled)
3613 return;
3614
3615 down_write(&dmar_global_lock);
3616
3617 /* Disable PMRs explicitly here. */
3618 for_each_iommu(iommu, drhd)
3619 iommu_disable_protect_mem_regions(iommu);
3620
3621 /* Make sure the IOMMUs are switched off */
3622 intel_disable_iommus();
3623
3624 up_write(&dmar_global_lock);
3625 }
3626
3627 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3628 {
3629 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3630
3631 return container_of(iommu_dev, struct intel_iommu, iommu);
3632 }
3633
3634 static ssize_t version_show(struct device *dev,
3635 struct device_attribute *attr, char *buf)
3636 {
3637 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3638 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3639 return sysfs_emit(buf, "%d:%d\n",
3640 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3641 }
3642 static DEVICE_ATTR_RO(version);
3643
3644 static ssize_t address_show(struct device *dev,
3645 struct device_attribute *attr, char *buf)
3646 {
3647 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3648 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3649 }
3650 static DEVICE_ATTR_RO(address);
3651
3652 static ssize_t cap_show(struct device *dev,
3653 struct device_attribute *attr, char *buf)
3654 {
3655 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3656 return sysfs_emit(buf, "%llx\n", iommu->cap);
3657 }
3658 static DEVICE_ATTR_RO(cap);
3659
3660 static ssize_t ecap_show(struct device *dev,
3661 struct device_attribute *attr, char *buf)
3662 {
3663 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3664 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3665 }
3666 static DEVICE_ATTR_RO(ecap);
3667
3668 static ssize_t domains_supported_show(struct device *dev,
3669 struct device_attribute *attr, char *buf)
3670 {
3671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3672 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3673 }
3674 static DEVICE_ATTR_RO(domains_supported);
3675
3676 static ssize_t domains_used_show(struct device *dev,
3677 struct device_attribute *attr, char *buf)
3678 {
3679 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3680 return sysfs_emit(buf, "%d\n",
3681 bitmap_weight(iommu->domain_ids,
3682 cap_ndoms(iommu->cap)));
3683 }
3684 static DEVICE_ATTR_RO(domains_used);
3685
3686 static struct attribute *intel_iommu_attrs[] = {
3687 &dev_attr_version.attr,
3688 &dev_attr_address.attr,
3689 &dev_attr_cap.attr,
3690 &dev_attr_ecap.attr,
3691 &dev_attr_domains_supported.attr,
3692 &dev_attr_domains_used.attr,
3693 NULL,
3694 };
3695
3696 static struct attribute_group intel_iommu_group = {
3697 .name = "intel-iommu",
3698 .attrs = intel_iommu_attrs,
3699 };
3700
3701 const struct attribute_group *intel_iommu_groups[] = {
3702 &intel_iommu_group,
3703 NULL,
3704 };
3705
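/* Return true if any PCI device in the system is marked external facing. */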
3706 static inline bool has_external_pci(void)
3707 {
3708 struct pci_dev *pdev = NULL;
3709
3710 for_each_pci_dev(pdev)
3711 if (pdev->external_facing) {
3712 pci_dev_put(pdev);
3713 return true;
3714 }
3715
3716 return false;
3717 }
3718
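/*
 * Honor the firmware's DMAR platform opt-in: when it is set, external-facing
 * PCI devices are present and the user has not explicitly opted out, force
 * the IOMMU on even if it was disabled on the command line.
 */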
3719 static int __init platform_optin_force_iommu(void)
3720 {
3721 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3722 return 0;
3723
3724 if (no_iommu || dmar_disabled)
3725 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3726
3727 /*
3728 * If Intel-IOMMU is disabled by default, we will apply identity
3729 * map for all devices except those marked as being untrusted.
3730 */
3731 if (dmar_disabled)
3732 iommu_set_default_passthrough(false);
3733
3734 dmar_disabled = 0;
3735 no_iommu = 0;
3736
3737 return 1;
3738 }
3739
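/*
 * Probe the physical devices behind the ACPI namespace devices listed in
 * the DRHD device scopes so that they are picked up by the IOMMU core too.
 */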
3740 static int __init probe_acpi_namespace_devices(void)
3741 {
3742 struct dmar_drhd_unit *drhd;
3743 /* To avoid a -Wunused-but-set-variable warning. */
3744 struct intel_iommu *iommu __maybe_unused;
3745 struct device *dev;
3746 int i, ret = 0;
3747
3748 for_each_active_iommu(iommu, drhd) {
3749 for_each_active_dev_scope(drhd->devices,
3750 drhd->devices_cnt, i, dev) {
3751 struct acpi_device_physical_node *pn;
3752 struct acpi_device *adev;
3753
3754 if (dev->bus != &acpi_bus_type)
3755 continue;
3756
3757 adev = to_acpi_device(dev);
3758 mutex_lock(&adev->physical_node_lock);
3759 list_for_each_entry(pn,
3760 &adev->physical_node_list, node) {
3761 ret = iommu_probe_device(pn->dev);
3762 if (ret)
3763 break;
3764 }
3765 mutex_unlock(&adev->physical_node_lock);
3766
3767 if (ret)
3768 return ret;
3769 }
3770 }
3771
3772 return 0;
3773 }
3774
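/* A tboot/TXT measured launch needs DMA protection, so force the IOMMU on. */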
3775 static __init int tboot_force_iommu(void)
3776 {
3777 if (!tboot_enabled())
3778 return 0;
3779
3780 if (no_iommu || dmar_disabled)
3781 pr_warn("Forcing Intel-IOMMU to enabled\n");
3782
3783 dmar_disabled = 0;
3784 no_iommu = 0;
3785
3786 return 1;
3787 }
3788
3789 int __init intel_iommu_init(void)
3790 {
3791 int ret = -ENODEV;
3792 struct dmar_drhd_unit *drhd;
3793 struct intel_iommu *iommu;
3794
3795 /*
3796 * Intel IOMMU is required for a TXT/tboot launch or platform
3797 * opt in, so enforce that.
3798 */
3799 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3800 platform_optin_force_iommu();
3801
3802 down_write(&dmar_global_lock);
3803 if (dmar_table_init()) {
3804 if (force_on)
3805 panic("tboot: Failed to initialize DMAR table\n");
3806 goto out_free_dmar;
3807 }
3808
3809 if (dmar_dev_scope_init() < 0) {
3810 if (force_on)
3811 panic("tboot: Failed to initialize DMAR device scope\n");
3812 goto out_free_dmar;
3813 }
3814
3815 up_write(&dmar_global_lock);
3816
3817 /*
3818 * The bus notifier takes the dmar_global_lock, so lockdep will
3819 * complain later when we register it under the lock.
3820 */
3821 dmar_register_bus_notifier();
3822
3823 down_write(&dmar_global_lock);
3824
3825 if (!no_iommu)
3826 intel_iommu_debugfs_init();
3827
3828 if (no_iommu || dmar_disabled) {
3829 /*
3830 * We exit the function here to ensure the IOMMU's remapping and
3831 * mempool aren't set up, which means that the IOMMU's PMRs
3832 * won't be disabled via the call to init_dmars(). So disable
3833 * them explicitly here. The PMRs were set up by tboot prior to
3834 * calling SENTER, but the kernel is expected to reset/tear
3835 * down the PMRs.
3836 */
3837 if (intel_iommu_tboot_noforce) {
3838 for_each_iommu(iommu, drhd)
3839 iommu_disable_protect_mem_regions(iommu);
3840 }
3841
3842 /*
3843 * Make sure the IOMMUs are switched off, even when we
3844 * boot into a kexec kernel and the previous kernel left
3845 * them enabled
3846 */
3847 intel_disable_iommus();
3848 goto out_free_dmar;
3849 }
3850
3851 if (list_empty(&dmar_rmrr_units))
3852 pr_info("No RMRR found\n");
3853
3854 if (list_empty(&dmar_atsr_units))
3855 pr_info("No ATSR found\n");
3856
3857 if (list_empty(&dmar_satc_units))
3858 pr_info("No SATC found\n");
3859
3860 init_no_remapping_devices();
3861
3862 ret = init_dmars();
3863 if (ret) {
3864 if (force_on)
3865 panic("tboot: Failed to initialize DMARs\n");
3866 pr_err("Initialization failed\n");
3867 goto out_free_dmar;
3868 }
3869 up_write(&dmar_global_lock);
3870
3871 init_iommu_pm_ops();
3872
3873 down_read(&dmar_global_lock);
3874 for_each_active_iommu(iommu, drhd) {
3875 /*
3876 * The flush queue implementation does not perform
3877 * page-selective invalidations that are required for efficient
3878 * TLB flushes in virtual environments. The benefit of batching
3879 * is likely to be much lower than the overhead of synchronizing
3880 * the virtual and physical IOMMU page-tables.
3881 */
3882 if (cap_caching_mode(iommu->cap) &&
3883 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3884 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3885 iommu_set_dma_strict();
3886 }
3887 iommu_device_sysfs_add(&iommu->iommu, NULL,
3888 intel_iommu_groups,
3889 "%s", iommu->name);
3890 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3891
3892 iommu_pmu_register(iommu);
3893 }
3894 up_read(&dmar_global_lock);
3895
3896 if (si_domain && !hw_pass_through)
3897 register_memory_notifier(&intel_iommu_memory_nb);
3898
3899 down_read(&dmar_global_lock);
3900 if (probe_acpi_namespace_devices())
3901 pr_warn("ACPI name space devices didn't probe correctly\n");
3902
3903 /* Finally, we enable the DMA remapping hardware. */
3904 for_each_iommu(iommu, drhd) {
3905 if (!drhd->ignored && !translation_pre_enabled(iommu))
3906 iommu_enable_translation(iommu);
3907
3908 iommu_disable_protect_mem_regions(iommu);
3909 }
3910 up_read(&dmar_global_lock);
3911
3912 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3913
3914 intel_iommu_enabled = 1;
3915
3916 return 0;
3917
3918 out_free_dmar:
3919 intel_iommu_free_dmars();
3920 up_write(&dmar_global_lock);
3921 return ret;
3922 }
3923
3924 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3925 {
3926 struct device_domain_info *info = opaque;
3927
3928 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3929 return 0;
3930 }
3931
3932 /*
3933 * NB - intel-iommu lacks any sort of reference counting for the users of
3934 * dependent devices. If multiple endpoints have intersecting dependent
3935 * devices, unbinding the driver from any one of them will possibly leave
3936 * the others unable to operate.
3937 */
3938 static void domain_context_clear(struct device_domain_info *info)
3939 {
3940 if (!dev_is_pci(info->dev)) {
3941 domain_context_clear_one(info, info->bus, info->devfn);
3942 return;
3943 }
3944
3945 pci_for_each_dma_alias(to_pci_dev(info->dev),
3946 &domain_context_clear_one_cb, info);
3947 }
3948
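/*
 * Detach a device from its current domain: tear down its PASID entry in
 * scalable mode, disable the PCI capabilities, clear the context entries
 * of all DMA aliases and drop the device from the domain's device list.
 */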
3949 static void dmar_remove_one_dev_info(struct device *dev)
3950 {
3951 struct device_domain_info *info = dev_iommu_priv_get(dev);
3952 struct dmar_domain *domain = info->domain;
3953 struct intel_iommu *iommu = info->iommu;
3954 unsigned long flags;
3955
3956 if (!dev_is_real_dma_subdevice(info->dev)) {
3957 if (dev_is_pci(info->dev) && sm_supported(iommu))
3958 intel_pasid_tear_down_entry(iommu, info->dev,
3959 IOMMU_NO_PASID, false);
3960
3961 iommu_disable_pci_caps(info);
3962 domain_context_clear(info);
3963 }
3964
3965 spin_lock_irqsave(&domain->lock, flags);
3966 list_del(&info->link);
3967 spin_unlock_irqrestore(&domain->lock, flags);
3968
3969 domain_detach_iommu(domain, iommu);
3970 info->domain = NULL;
3971 }
3972
3973 /*
3974 * Clear the page table pointer in context or pasid table entries so that
3975 * all DMA requests without PASID from the device are blocked. If the page
3976 * table has been set, clean up the data structures.
3977 */
3978 static void device_block_translation(struct device *dev)
3979 {
3980 struct device_domain_info *info = dev_iommu_priv_get(dev);
3981 struct intel_iommu *iommu = info->iommu;
3982 unsigned long flags;
3983
3984 iommu_disable_pci_caps(info);
3985 if (!dev_is_real_dma_subdevice(dev)) {
3986 if (sm_supported(iommu))
3987 intel_pasid_tear_down_entry(iommu, dev,
3988 IOMMU_NO_PASID, false);
3989 else
3990 domain_context_clear(info);
3991 }
3992
3993 if (!info->domain)
3994 return;
3995
3996 spin_lock_irqsave(&info->domain->lock, flags);
3997 list_del(&info->link);
3998 spin_unlock_irqrestore(&info->domain->lock, flags);
3999
4000 domain_detach_iommu(info->domain, iommu);
4001 info->domain = NULL;
4002 }
4003
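/*
 * Initialize a domain allocated through the IOMMU core API: derive the
 * AGAW from the requested guest address width and allocate the top level
 * page table page.
 */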
4004 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4005 {
4006 int adjust_width;
4007
4008 /* calculate AGAW */
4009 domain->gaw = guest_width;
4010 adjust_width = guestwidth_to_adjustwidth(guest_width);
4011 domain->agaw = width_to_agaw(adjust_width);
4012
4013 domain->iommu_coherency = false;
4014 domain->iommu_superpage = 0;
4015 domain->max_addr = 0;
4016
4017 /* always allocate the top pgd */
4018 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4019 if (!domain->pgd)
4020 return -ENOMEM;
4021 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4022 return 0;
4023 }
4024
4025 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4026 struct device *dev)
4027 {
4028 device_block_translation(dev);
4029 return 0;
4030 }
4031
4032 static struct iommu_domain blocking_domain = {
4033 .ops = &(const struct iommu_domain_ops) {
4034 .attach_dev = blocking_domain_attach_dev,
4035 .free = intel_iommu_domain_free
4036 }
4037 };
4038
4039 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4040 {
4041 struct dmar_domain *dmar_domain;
4042 struct iommu_domain *domain;
4043
4044 switch (type) {
4045 case IOMMU_DOMAIN_BLOCKED:
4046 return &blocking_domain;
4047 case IOMMU_DOMAIN_DMA:
4048 case IOMMU_DOMAIN_UNMANAGED:
4049 dmar_domain = alloc_domain(type);
4050 if (!dmar_domain) {
4051 pr_err("Can't allocate dmar_domain\n");
4052 return NULL;
4053 }
4054 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4055 pr_err("Domain initialization failed\n");
4056 domain_exit(dmar_domain);
4057 return NULL;
4058 }
4059
4060 domain = &dmar_domain->domain;
4061 domain->geometry.aperture_start = 0;
4062 domain->geometry.aperture_end =
4063 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4064 domain->geometry.force_aperture = true;
4065
4066 return domain;
4067 case IOMMU_DOMAIN_IDENTITY:
4068 return &si_domain->domain;
4069 case IOMMU_DOMAIN_SVA:
4070 return intel_svm_domain_alloc();
4071 default:
4072 return NULL;
4073 }
4074
4075 return NULL;
4076 }
4077
4078 static void intel_iommu_domain_free(struct iommu_domain *domain)
4079 {
4080 if (domain != &si_domain->domain && domain != &blocking_domain)
4081 domain_exit(to_dmar_domain(domain));
4082 }
4083
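/*
 * Check that the IOMMU serving the device can handle this domain (snooping
 * and address width) and drop unused upper page-table levels when the
 * IOMMU supports fewer levels than the domain currently uses.
 */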
4084 static int prepare_domain_attach_device(struct iommu_domain *domain,
4085 struct device *dev)
4086 {
4087 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4088 struct intel_iommu *iommu;
4089 int addr_width;
4090
4091 iommu = device_to_iommu(dev, NULL, NULL);
4092 if (!iommu)
4093 return -ENODEV;
4094
4095 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4096 return -EINVAL;
4097
4098 /* check if this iommu agaw is sufficient for max mapped address */
4099 addr_width = agaw_to_width(iommu->agaw);
4100 if (addr_width > cap_mgaw(iommu->cap))
4101 addr_width = cap_mgaw(iommu->cap);
4102
4103 if (dmar_domain->max_addr > (1LL << addr_width))
4104 return -EINVAL;
4105 dmar_domain->gaw = addr_width;
4106
4107 /*
4108 * Knock out extra levels of page tables if necessary
4109 */
4110 while (iommu->agaw < dmar_domain->agaw) {
4111 struct dma_pte *pte;
4112
4113 pte = dmar_domain->pgd;
4114 if (dma_pte_present(pte)) {
4115 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4116 free_pgtable_page(pte);
4117 }
4118 dmar_domain->agaw--;
4119 }
4120
4121 return 0;
4122 }
4123
4124 static int intel_iommu_attach_device(struct iommu_domain *domain,
4125 struct device *dev)
4126 {
4127 struct device_domain_info *info = dev_iommu_priv_get(dev);
4128 int ret;
4129
4130 if (info->domain)
4131 device_block_translation(dev);
4132
4133 ret = prepare_domain_attach_device(domain, dev);
4134 if (ret)
4135 return ret;
4136
4137 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4138 }
4139
4140 static int intel_iommu_map(struct iommu_domain *domain,
4141 unsigned long iova, phys_addr_t hpa,
4142 size_t size, int iommu_prot, gfp_t gfp)
4143 {
4144 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4145 u64 max_addr;
4146 int prot = 0;
4147
4148 if (iommu_prot & IOMMU_READ)
4149 prot |= DMA_PTE_READ;
4150 if (iommu_prot & IOMMU_WRITE)
4151 prot |= DMA_PTE_WRITE;
4152 if (dmar_domain->set_pte_snp)
4153 prot |= DMA_PTE_SNP;
4154
4155 max_addr = iova + size;
4156 if (dmar_domain->max_addr < max_addr) {
4157 u64 end;
4158
4159 /* check if minimum agaw is sufficient for mapped address */
4160 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4161 if (end < max_addr) {
4162 pr_err("%s: iommu width (%d) is not "
4163 "sufficient for the mapped address (%llx)\n",
4164 __func__, dmar_domain->gaw, max_addr);
4165 return -EFAULT;
4166 }
4167 dmar_domain->max_addr = max_addr;
4168 }
4169 /* Round up size to next multiple of PAGE_SIZE, if it and
4170 the low bits of hpa would take us onto the next page */
4171 size = aligned_nrpages(hpa, size);
4172 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4173 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4174 }
4175
4176 static int intel_iommu_map_pages(struct iommu_domain *domain,
4177 unsigned long iova, phys_addr_t paddr,
4178 size_t pgsize, size_t pgcount,
4179 int prot, gfp_t gfp, size_t *mapped)
4180 {
4181 unsigned long pgshift = __ffs(pgsize);
4182 size_t size = pgcount << pgshift;
4183 int ret;
4184
4185 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4186 return -EINVAL;
4187
4188 if (!IS_ALIGNED(iova | paddr, pgsize))
4189 return -EINVAL;
4190
4191 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4192 if (!ret && mapped)
4193 *mapped = size;
4194
4195 return ret;
4196 }
4197
4198 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4199 unsigned long iova, size_t size,
4200 struct iommu_iotlb_gather *gather)
4201 {
4202 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 unsigned long start_pfn, last_pfn;
4204 int level = 0;
4205
4206 /* Cope with horrid API which requires us to unmap more than the
4207 size argument if it happens to be a large-page mapping. */
4208 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4209 &level, GFP_ATOMIC)))
4210 return 0;
4211
4212 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4213 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4214
4215 start_pfn = iova >> VTD_PAGE_SHIFT;
4216 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4217
4218 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4219
4220 if (dmar_domain->max_addr == iova + size)
4221 dmar_domain->max_addr = iova;
4222
4223 /*
4224 * We do not use page-selective IOTLB invalidation in flush queue,
4225 * so there is no need to track page and sync iotlb.
4226 */
4227 if (!iommu_iotlb_gather_queued(gather))
4228 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4229
4230 return size;
4231 }
4232
4233 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4234 unsigned long iova,
4235 size_t pgsize, size_t pgcount,
4236 struct iommu_iotlb_gather *gather)
4237 {
4238 unsigned long pgshift = __ffs(pgsize);
4239 size_t size = pgcount << pgshift;
4240
4241 return intel_iommu_unmap(domain, iova, size, gather);
4242 }
4243
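/*
 * Flush the IOTLB for the range collected in the gather structure on every
 * IOMMU the domain is attached to, then release the page-table pages queued
 * on the gather freelist.
 */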
4244 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4245 struct iommu_iotlb_gather *gather)
4246 {
4247 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4248 unsigned long iova_pfn = IOVA_PFN(gather->start);
4249 size_t size = gather->end - gather->start;
4250 struct iommu_domain_info *info;
4251 unsigned long start_pfn;
4252 unsigned long nrpages;
4253 unsigned long i;
4254
4255 nrpages = aligned_nrpages(gather->start, size);
4256 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4257
4258 xa_for_each(&dmar_domain->iommu_array, i, info)
4259 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4260 start_pfn, nrpages,
4261 list_empty(&gather->freelist), 0);
4262
4263 put_pages_list(&gather->freelist);
4264 }
4265
4266 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4267 dma_addr_t iova)
4268 {
4269 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4270 struct dma_pte *pte;
4271 int level = 0;
4272 u64 phys = 0;
4273
4274 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4275 GFP_ATOMIC);
4276 if (pte && dma_pte_present(pte))
4277 phys = dma_pte_addr(pte) +
4278 (iova & (BIT_MASK(level_to_offset_bits(level) +
4279 VTD_PAGE_SHIFT) - 1));
4280
4281 return phys;
4282 }
4283
4284 static bool domain_support_force_snooping(struct dmar_domain *domain)
4285 {
4286 struct device_domain_info *info;
4287 bool support = true;
4288
4289 assert_spin_locked(&domain->lock);
4290 list_for_each_entry(info, &domain->devices, link) {
4291 if (!ecap_sc_support(info->iommu->ecap)) {
4292 support = false;
4293 break;
4294 }
4295 }
4296
4297 return support;
4298 }
4299
4300 static void domain_set_force_snooping(struct dmar_domain *domain)
4301 {
4302 struct device_domain_info *info;
4303
4304 assert_spin_locked(&domain->lock);
4305 /*
4306 * Second level page table supports per-PTE snoop control. The
4307 * iommu_map() interface will handle this by setting SNP bit.
4308 */
4309 if (!domain->use_first_level) {
4310 domain->set_pte_snp = true;
4311 return;
4312 }
4313
4314 list_for_each_entry(info, &domain->devices, link)
4315 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4316 IOMMU_NO_PASID);
4317 }
4318
4319 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4320 {
4321 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4322 unsigned long flags;
4323
4324 if (dmar_domain->force_snooping)
4325 return true;
4326
4327 spin_lock_irqsave(&dmar_domain->lock, flags);
4328 if (!domain_support_force_snooping(dmar_domain) ||
4329 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4330 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4331 return false;
4332 }
4333
4334 domain_set_force_snooping(dmar_domain);
4335 dmar_domain->force_snooping = true;
4336 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4337
4338 return true;
4339 }
4340
4341 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4342 {
4343 struct device_domain_info *info = dev_iommu_priv_get(dev);
4344
4345 switch (cap) {
4346 case IOMMU_CAP_CACHE_COHERENCY:
4347 case IOMMU_CAP_DEFERRED_FLUSH:
4348 return true;
4349 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4350 return dmar_platform_optin();
4351 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4352 return ecap_sc_support(info->iommu->ecap);
4353 default:
4354 return false;
4355 }
4356 }
4357
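/*
 * Per-device probe hook: look up the IOMMU that translates the device,
 * allocate its device_domain_info, record the ATS/PASID/PRI capabilities
 * and, in scalable mode, allocate the device's PASID table.
 */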
4358 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4359 {
4360 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4361 struct device_domain_info *info;
4362 struct intel_iommu *iommu;
4363 u8 bus, devfn;
4364 int ret;
4365
4366 iommu = device_to_iommu(dev, &bus, &devfn);
4367 if (!iommu || !iommu->iommu.ops)
4368 return ERR_PTR(-ENODEV);
4369
4370 info = kzalloc(sizeof(*info), GFP_KERNEL);
4371 if (!info)
4372 return ERR_PTR(-ENOMEM);
4373
4374 if (dev_is_real_dma_subdevice(dev)) {
4375 info->bus = pdev->bus->number;
4376 info->devfn = pdev->devfn;
4377 info->segment = pci_domain_nr(pdev->bus);
4378 } else {
4379 info->bus = bus;
4380 info->devfn = devfn;
4381 info->segment = iommu->segment;
4382 }
4383
4384 info->dev = dev;
4385 info->iommu = iommu;
4386 if (dev_is_pci(dev)) {
4387 if (ecap_dev_iotlb_support(iommu->ecap) &&
4388 pci_ats_supported(pdev) &&
4389 dmar_ats_supported(pdev, iommu)) {
4390 info->ats_supported = 1;
4391 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4392
4393 /*
4394 * For IOMMUs that support device IOTLB throttling
4395 * (DIT), we assign the PFSID to a VF's invalidation
4396 * descriptors so that the IOMMU HW can gauge the queue
4397 * depth at the PF level. If DIT is not set, the PFSID
4398 * field is treated as reserved and should be set to 0.
4399 */
4400 if (ecap_dit(iommu->ecap))
4401 info->pfsid = pci_dev_id(pci_physfn(pdev));
4402 info->ats_qdep = pci_ats_queue_depth(pdev);
4403 }
4404 if (sm_supported(iommu)) {
4405 if (pasid_supported(iommu)) {
4406 int features = pci_pasid_features(pdev);
4407
4408 if (features >= 0)
4409 info->pasid_supported = features | 1;
4410 }
4411
4412 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4413 pci_pri_supported(pdev))
4414 info->pri_supported = 1;
4415 }
4416 }
4417
4418 dev_iommu_priv_set(dev, info);
4419
4420 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4421 ret = intel_pasid_alloc_table(dev);
4422 if (ret) {
4423 dev_err(dev, "PASID table allocation failed\n");
4424 dev_iommu_priv_set(dev, NULL);
4425 kfree(info);
4426 return ERR_PTR(ret);
4427 }
4428 }
4429
4430 return &iommu->iommu;
4431 }
4432
4433 static void intel_iommu_release_device(struct device *dev)
4434 {
4435 struct device_domain_info *info = dev_iommu_priv_get(dev);
4436
4437 dmar_remove_one_dev_info(dev);
4438 intel_pasid_free_table(dev);
4439 dev_iommu_priv_set(dev, NULL);
4440 kfree(info);
4441 set_dma_ops(dev, NULL);
4442 }
4443
4444 static void intel_iommu_probe_finalize(struct device *dev)
4445 {
4446 set_dma_ops(dev, NULL);
4447 iommu_setup_dma_ops(dev, 0, U64_MAX);
4448 }
4449
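/*
 * Report the reserved regions of a device: RMRRs whose device scope covers
 * it, the optional ISA range workaround for legacy floppy DMA, and the
 * IOAPIC/MSI range.
 */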
4450 static void intel_iommu_get_resv_regions(struct device *device,
4451 struct list_head *head)
4452 {
4453 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4454 struct iommu_resv_region *reg;
4455 struct dmar_rmrr_unit *rmrr;
4456 struct device *i_dev;
4457 int i;
4458
4459 rcu_read_lock();
4460 for_each_rmrr_units(rmrr) {
4461 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4462 i, i_dev) {
4463 struct iommu_resv_region *resv;
4464 enum iommu_resv_type type;
4465 size_t length;
4466
4467 if (i_dev != device &&
4468 !is_downstream_to_pci_bridge(device, i_dev))
4469 continue;
4470
4471 length = rmrr->end_address - rmrr->base_address + 1;
4472
4473 type = device_rmrr_is_relaxable(device) ?
4474 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4475
4476 resv = iommu_alloc_resv_region(rmrr->base_address,
4477 length, prot, type,
4478 GFP_ATOMIC);
4479 if (!resv)
4480 break;
4481
4482 list_add_tail(&resv->list, head);
4483 }
4484 }
4485 rcu_read_unlock();
4486
4487 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4488 if (dev_is_pci(device)) {
4489 struct pci_dev *pdev = to_pci_dev(device);
4490
4491 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4492 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4493 IOMMU_RESV_DIRECT_RELAXABLE,
4494 GFP_KERNEL);
4495 if (reg)
4496 list_add_tail(&reg->list, head);
4497 }
4498 }
4499 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4500
4501 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4502 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4503 0, IOMMU_RESV_MSI, GFP_KERNEL);
4504 if (!reg)
4505 return;
4506 list_add_tail(&reg->list, head);
4507 }
4508
4509 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4510 {
4511 if (dev_is_pci(dev))
4512 return pci_device_group(dev);
4513 return generic_device_group(dev);
4514 }
4515
4516 static int intel_iommu_enable_sva(struct device *dev)
4517 {
4518 struct device_domain_info *info = dev_iommu_priv_get(dev);
4519 struct intel_iommu *iommu;
4520
4521 if (!info || dmar_disabled)
4522 return -EINVAL;
4523
4524 iommu = info->iommu;
4525 if (!iommu)
4526 return -EINVAL;
4527
4528 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4529 return -ENODEV;
4530
4531 if (!info->pasid_enabled || !info->ats_enabled)
4532 return -EINVAL;
4533
4534 /*
4535 * Devices having device-specific I/O fault handling should not
4536 * support PCI/PRI. The IOMMU side has no means to check the
4537 * capability of device-specific IOPF. Therefore, the IOMMU can only
4538 * assume that if the device driver enables SVA on a non-PRI device,
4539 * it will handle IOPF in its own way.
4540 */
4541 if (!info->pri_supported)
4542 return 0;
4543
4544 /* Devices supporting PRI should have it enabled. */
4545 if (!info->pri_enabled)
4546 return -EINVAL;
4547
4548 return 0;
4549 }
4550
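/*
 * Enable I/O page faults for the device: reset PRI, add the device to the
 * IOMMU's IOPF queue, register the fault handler and finally enable PRI
 * with PRQ_DEPTH outstanding requests.
 */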
4551 static int intel_iommu_enable_iopf(struct device *dev)
4552 {
4553 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4554 struct device_domain_info *info = dev_iommu_priv_get(dev);
4555 struct intel_iommu *iommu;
4556 int ret;
4557
4558 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4559 return -ENODEV;
4560
4561 if (info->pri_enabled)
4562 return -EBUSY;
4563
4564 iommu = info->iommu;
4565 if (!iommu)
4566 return -EINVAL;
4567
4568 /* PASID is required in PRG Response Message. */
4569 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4570 return -EINVAL;
4571
4572 ret = pci_reset_pri(pdev);
4573 if (ret)
4574 return ret;
4575
4576 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4577 if (ret)
4578 return ret;
4579
4580 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4581 if (ret)
4582 goto iopf_remove_device;
4583
4584 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4585 if (ret)
4586 goto iopf_unregister_handler;
4587 info->pri_enabled = 1;
4588
4589 return 0;
4590
4591 iopf_unregister_handler:
4592 iommu_unregister_device_fault_handler(dev);
4593 iopf_remove_device:
4594 iopf_queue_remove_device(iommu->iopf_queue, dev);
4595
4596 return ret;
4597 }
4598
4599 static int intel_iommu_disable_iopf(struct device *dev)
4600 {
4601 struct device_domain_info *info = dev_iommu_priv_get(dev);
4602 struct intel_iommu *iommu = info->iommu;
4603
4604 if (!info->pri_enabled)
4605 return -EINVAL;
4606
4607 /*
4608 * The PCIe spec states that clearing the PRI enable bit stops the
4609 * Page Request Interface from issuing new page requests, but page
4610 * requests already transmitted or queued for transmission may still
4611 * be outstanding. This is supposed to be called after
4612 * the device driver has stopped DMA, all PASIDs have been
4613 * unbound and the outstanding PRQs have been drained.
4614 */
4615 pci_disable_pri(to_pci_dev(dev));
4616 info->pri_enabled = 0;
4617
4618 /*
4619 * With PRI disabled and outstanding PRQs drained, unregistering
4620 * fault handler and removing device from iopf queue should never
4621 * fail.
4622 */
4623 WARN_ON(iommu_unregister_device_fault_handler(dev));
4624 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4625
4626 return 0;
4627 }
4628
4629 static int
4630 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4631 {
4632 switch (feat) {
4633 case IOMMU_DEV_FEAT_IOPF:
4634 return intel_iommu_enable_iopf(dev);
4635
4636 case IOMMU_DEV_FEAT_SVA:
4637 return intel_iommu_enable_sva(dev);
4638
4639 default:
4640 return -ENODEV;
4641 }
4642 }
4643
4644 static int
4645 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4646 {
4647 switch (feat) {
4648 case IOMMU_DEV_FEAT_IOPF:
4649 return intel_iommu_disable_iopf(dev);
4650
4651 case IOMMU_DEV_FEAT_SVA:
4652 return 0;
4653
4654 default:
4655 return -ENODEV;
4656 }
4657 }
4658
4659 static bool intel_iommu_is_attach_deferred(struct device *dev)
4660 {
4661 struct device_domain_info *info = dev_iommu_priv_get(dev);
4662
4663 return translation_pre_enabled(info->iommu) && !info->domain;
4664 }
4665
4666 /*
4667 * Check that the device does not live on an external facing PCI port that is
4668 * marked as untrusted. Such devices should not be able to apply quirks and
4669 * thus not be able to bypass the IOMMU restrictions.
4670 */
4671 static bool risky_device(struct pci_dev *pdev)
4672 {
4673 if (pdev->untrusted) {
4674 pci_info(pdev,
4675 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4676 pdev->vendor, pdev->device);
4677 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4678 return true;
4679 }
4680 return false;
4681 }
4682
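/*
 * Notify every IOMMU the domain is attached to about a range that has just
 * been mapped; needed e.g. by caching-mode hardware that may also cache
 * not-present entries.
 */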
4683 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4684 unsigned long iova, size_t size)
4685 {
4686 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4687 unsigned long pages = aligned_nrpages(iova, size);
4688 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4689 struct iommu_domain_info *info;
4690 unsigned long i;
4691
4692 xa_for_each(&dmar_domain->iommu_array, i, info)
4693 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4694 }
4695
4696 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4697 {
4698 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4699 struct dev_pasid_info *curr, *dev_pasid = NULL;
4700 struct dmar_domain *dmar_domain;
4701 struct iommu_domain *domain;
4702 unsigned long flags;
4703
4704 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4705 if (WARN_ON_ONCE(!domain))
4706 goto out_tear_down;
4707
4708 /*
4709 * The SVA implementation needs to handle its own stuff, such as mm
4710 * notifications. Before that code is consolidated into the iommu
4711 * core, let the intel sva code handle it.
4712 */
4713 if (domain->type == IOMMU_DOMAIN_SVA) {
4714 intel_svm_remove_dev_pasid(dev, pasid);
4715 goto out_tear_down;
4716 }
4717
4718 dmar_domain = to_dmar_domain(domain);
4719 spin_lock_irqsave(&dmar_domain->lock, flags);
4720 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4721 if (curr->dev == dev && curr->pasid == pasid) {
4722 list_del(&curr->link_domain);
4723 dev_pasid = curr;
4724 break;
4725 }
4726 }
4727 WARN_ON_ONCE(!dev_pasid);
4728 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4729
4730 domain_detach_iommu(dmar_domain, iommu);
4731 kfree(dev_pasid);
4732 out_tear_down:
4733 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4734 intel_drain_pasid_prq(dev, pasid);
4735 }
4736
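/*
 * Attach the domain to the device on the given PASID by programming the
 * matching PASID table entry (pass-through, first level or second level)
 * and record the attachment on the domain's dev_pasids list.
 */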
4737 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4738 struct device *dev, ioasid_t pasid)
4739 {
4740 struct device_domain_info *info = dev_iommu_priv_get(dev);
4741 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4742 struct intel_iommu *iommu = info->iommu;
4743 struct dev_pasid_info *dev_pasid;
4744 unsigned long flags;
4745 int ret;
4746
4747 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4748 return -EOPNOTSUPP;
4749
4750 if (context_copied(iommu, info->bus, info->devfn))
4751 return -EBUSY;
4752
4753 ret = prepare_domain_attach_device(domain, dev);
4754 if (ret)
4755 return ret;
4756
4757 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4758 if (!dev_pasid)
4759 return -ENOMEM;
4760
4761 ret = domain_attach_iommu(dmar_domain, iommu);
4762 if (ret)
4763 goto out_free;
4764
4765 if (domain_type_is_si(dmar_domain))
4766 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4767 dev, pasid);
4768 else if (dmar_domain->use_first_level)
4769 ret = domain_setup_first_level(iommu, dmar_domain,
4770 dev, pasid);
4771 else
4772 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4773 dev, pasid);
4774 if (ret)
4775 goto out_detach_iommu;
4776
4777 dev_pasid->dev = dev;
4778 dev_pasid->pasid = pasid;
4779 spin_lock_irqsave(&dmar_domain->lock, flags);
4780 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4781 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4782
4783 return 0;
4784 out_detach_iommu:
4785 domain_detach_iommu(dmar_domain, iommu);
4786 out_free:
4787 kfree(dev_pasid);
4788 return ret;
4789 }
4790
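/*
 * iommufd hw_info hook: report the raw capability and extended capability
 * registers of the IOMMU that serves the device.
 */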
4791 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4792 {
4793 struct device_domain_info *info = dev_iommu_priv_get(dev);
4794 struct intel_iommu *iommu = info->iommu;
4795 struct iommu_hw_info_vtd *vtd;
4796
4797 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4798 if (!vtd)
4799 return ERR_PTR(-ENOMEM);
4800
4801 vtd->cap_reg = iommu->cap;
4802 vtd->ecap_reg = iommu->ecap;
4803 *length = sizeof(*vtd);
4804 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4805 return vtd;
4806 }
4807
4808 const struct iommu_ops intel_iommu_ops = {
4809 .capable = intel_iommu_capable,
4810 .hw_info = intel_iommu_hw_info,
4811 .domain_alloc = intel_iommu_domain_alloc,
4812 .probe_device = intel_iommu_probe_device,
4813 .probe_finalize = intel_iommu_probe_finalize,
4814 .release_device = intel_iommu_release_device,
4815 .get_resv_regions = intel_iommu_get_resv_regions,
4816 .device_group = intel_iommu_device_group,
4817 .dev_enable_feat = intel_iommu_dev_enable_feat,
4818 .dev_disable_feat = intel_iommu_dev_disable_feat,
4819 .is_attach_deferred = intel_iommu_is_attach_deferred,
4820 .def_domain_type = device_def_domain_type,
4821 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4822 .pgsize_bitmap = SZ_4K,
4823 #ifdef CONFIG_INTEL_IOMMU_SVM
4824 .page_response = intel_svm_page_response,
4825 #endif
4826 .default_domain_ops = &(const struct iommu_domain_ops) {
4827 .attach_dev = intel_iommu_attach_device,
4828 .set_dev_pasid = intel_iommu_set_dev_pasid,
4829 .map_pages = intel_iommu_map_pages,
4830 .unmap_pages = intel_iommu_unmap_pages,
4831 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4832 .flush_iotlb_all = intel_flush_iotlb_all,
4833 .iotlb_sync = intel_iommu_tlb_sync,
4834 .iova_to_phys = intel_iommu_iova_to_phys,
4835 .free = intel_iommu_domain_free,
4836 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4837 }
4838 };
4839
4840 static void quirk_iommu_igfx(struct pci_dev *dev)
4841 {
4842 if (risky_device(dev))
4843 return;
4844
4845 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4846 dmar_map_gfx = 0;
4847 }
4848
4849 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4857
4858 /* Broadwell igfx malfunctions with dmar */
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4883
4884 static void quirk_iommu_rwbf(struct pci_dev *dev)
4885 {
4886 if (risky_device(dev))
4887 return;
4888
4889 /*
4890 * Mobile 4 Series Chipset neglects to set RWBF capability,
4891 * but needs it. Same seems to hold for the desktop versions.
4892 */
4893 pci_info(dev, "Forcing write-buffer flush capability\n");
4894 rwbf_quirk = 1;
4895 }
4896
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4904
4905 #define GGC 0x52
4906 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4907 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4908 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4909 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4910 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4911 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4912 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4913 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4914
4915 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4916 {
4917 unsigned short ggc;
4918
4919 if (risky_device(dev))
4920 return;
4921
4922 if (pci_read_config_word(dev, GGC, &ggc))
4923 return;
4924
4925 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4926 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4927 dmar_map_gfx = 0;
4928 } else if (dmar_map_gfx) {
4929 /* we have to ensure the gfx device is idle before we flush */
4930 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4931 iommu_set_dma_strict();
4932 }
4933 }
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4938
4939 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4940 {
4941 unsigned short ver;
4942
4943 if (!IS_GFX_DEVICE(dev))
4944 return;
4945
4946 ver = (dev->device >> 8) & 0xff;
4947 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4948 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4949 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4950 return;
4951
4952 if (risky_device(dev))
4953 return;
4954
4955 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4956 iommu_skip_te_disable = 1;
4957 }
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4959
4960 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4961 ISOCH DMAR unit for the Azalia sound device, but not give it any
4962 TLB entries, which causes it to deadlock. Check for that. We do
4963 this in a function called from init_dmars(), instead of in a PCI
4964 quirk, because we don't want to print the obnoxious "BIOS broken"
4965 message if VT-d is actually disabled.
4966 */
4967 static void __init check_tylersburg_isoch(void)
4968 {
4969 struct pci_dev *pdev;
4970 uint32_t vtisochctrl;
4971
4972 /* If there's no Azalia in the system anyway, forget it. */
4973 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4974 if (!pdev)
4975 return;
4976
4977 if (risky_device(pdev)) {
4978 pci_dev_put(pdev);
4979 return;
4980 }
4981
4982 pci_dev_put(pdev);
4983
4984 /* System Management Registers. Might be hidden, in which case
4985 we can't do the sanity check. But that's OK, because the
4986 known-broken BIOSes _don't_ actually hide it, so far. */
4987 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4988 if (!pdev)
4989 return;
4990
4991 if (risky_device(pdev)) {
4992 pci_dev_put(pdev);
4993 return;
4994 }
4995
4996 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4997 pci_dev_put(pdev);
4998 return;
4999 }
5000
5001 pci_dev_put(pdev);
5002
5003 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5004 if (vtisochctrl & 1)
5005 return;
5006
5007 /* Drop all bits other than the number of TLB entries */
5008 vtisochctrl &= 0x1c;
5009
5010 /* If we have the recommended number of TLB entries (16), fine. */
5011 if (vtisochctrl == 0x10)
5012 return;
5013
5014 /* Zero TLB entries? You get to ride the short bus to school. */
5015 if (!vtisochctrl) {
5016 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5017 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5018 dmi_get_system_info(DMI_BIOS_VENDOR),
5019 dmi_get_system_info(DMI_BIOS_VERSION),
5020 dmi_get_system_info(DMI_PRODUCT_VERSION));
5021 iommu_identity_mapping |= IDENTMAP_AZALIA;
5022 return;
5023 }
5024
5025 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5026 vtisochctrl);
5027 }
5028
5029 /*
5030 * Here we deal with a device TLB defect where the device may inadvertently
5031 * issue an ATS invalidation completion before posted writes that were
5032 * initiated with a translated address and used translations matching the
5033 * invalidation address range, violating the invalidation completion ordering.
5034 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
5035 * is vulnerable to this defect. In other words, any dTLB invalidation initiated not
5036 * under the control of the trusted/privileged host device driver must use this
5037 * quirk.
5038 * Device TLBs are invalidated under the following six conditions:
5039 * 1. Device driver does DMA API unmap IOVA
5040 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5041 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5042 * exit_mmap() due to crash
5043 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5044 * VM has to free pages that were unmapped
5045 * 5. Userspace driver unmaps a DMA buffer
5046 * 6. Cache invalidation in vSVA usage (upcoming)
5047 *
5048 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5049 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5050 * invalidate TLB the same way as normal user unmap which will use this quirk.
5051 * The dTLB invalidation after PASID cache flush does not need this quirk.
5052 *
5053 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5054 */
5055 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5056 unsigned long address, unsigned long mask,
5057 u32 pasid, u16 qdep)
5058 {
5059 u16 sid;
5060
5061 if (likely(!info->dtlb_extra_inval))
5062 return;
5063
5064 sid = PCI_DEVID(info->bus, info->devfn);
5065 if (pasid == IOMMU_NO_PASID) {
5066 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5067 qdep, address, mask);
5068 } else {
5069 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5070 pasid, qdep, address, mask);
5071 }
5072 }
5073
5074 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5075
5076 /*
5077 * Function to submit a command to the enhanced command interface. The
5078 * valid enhanced command descriptions are defined in Table 47 of the
5079 * VT-d spec. The VT-d hardware implementation may support some but not
5080 * all commands, which can be determined by checking the Enhanced
5081 * Command Capability Register.
5082 *
5083 * Return values:
5084 * - 0: Command successful without any error;
5085 * - Negative: software error value;
5086 * - Nonzero positive: failure status code defined in Table 48.
5087 */
5088 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5089 {
5090 unsigned long flags;
5091 u64 res;
5092 int ret;
5093
5094 if (!cap_ecmds(iommu->cap))
5095 return -ENODEV;
5096
5097 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5098
5099 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5100 if (res & DMA_ECMD_ECRSP_IP) {
5101 ret = -EBUSY;
5102 goto err;
5103 }
5104
5105 /*
5106 * Unconditionally write operand B, because
5107 * - there is no side effect if an ecmd doesn't require an
5108 * operand B but we set the register to some value anyway;
5109 * - this is not invoked in any critical path, so the extra MMIO
5110 * write raises no performance concerns.
5111 */
5112 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5113 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5114
5115 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5116 !(res & DMA_ECMD_ECRSP_IP), res);
5117
5118 if (res & DMA_ECMD_ECRSP_IP) {
5119 ret = -ETIMEDOUT;
5120 goto err;
5121 }
5122
5123 ret = ecmd_get_status_code(res);
5124 err:
5125 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5126
5127 return ret;
5128 }
5129