xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 1c9f8dff62d85ce00b0e99f774a84bd783af7cac)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
58 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60 
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN		(1)
63 
64 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
65 
66 /* page table handling */
67 #define LEVEL_STRIDE		(9)
68 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69 
70 static inline int agaw_to_level(int agaw)
71 {
72 	return agaw + 2;
73 }
74 
75 static inline int agaw_to_width(int agaw)
76 {
77 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79 
80 static inline int width_to_agaw(int width)
81 {
82 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84 
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 	return (level - 1) * LEVEL_STRIDE;
88 }
89 
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94 
95 static inline u64 level_mask(int level)
96 {
97 	return -1ULL << level_to_offset_bits(level);
98 }
99 
100 static inline u64 level_size(int level)
101 {
102 	return 1ULL << level_to_offset_bits(level);
103 }
104 
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 	return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109 
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114 
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
118 {
119 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long page_to_dma_pfn(struct page *pg)
122 {
123 	return mm_to_dma_pfn(page_to_pfn(pg));
124 }
125 static inline unsigned long virt_to_dma_pfn(void *p)
126 {
127 	return page_to_dma_pfn(virt_to_page(p));
128 }
129 
130 static void __init check_tylersburg_isoch(void);
131 static int rwbf_quirk;
132 
133 /*
134  * set to 1 to panic kernel if can't successfully enable VT-d
135  * (used when kernel is launched w/ TXT)
136  */
137 static int force_on = 0;
138 static int intel_iommu_tboot_noforce;
139 static int no_platform_optin;
140 
141 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
142 
143 /*
144  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
145  * if marked present.
146  */
147 static phys_addr_t root_entry_lctp(struct root_entry *re)
148 {
149 	if (!(re->lo & 1))
150 		return 0;
151 
152 	return re->lo & VTD_PAGE_MASK;
153 }
154 
155 /*
156  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
157  * if marked present.
158  */
159 static phys_addr_t root_entry_uctp(struct root_entry *re)
160 {
161 	if (!(re->hi & 1))
162 		return 0;
163 
164 	return re->hi & VTD_PAGE_MASK;
165 }
166 
167 static inline void context_set_present(struct context_entry *context)
168 {
169 	context->lo |= 1;
170 }
171 
172 static inline void context_set_fault_enable(struct context_entry *context)
173 {
174 	context->lo &= (((u64)-1) << 2) | 1;
175 }
176 
177 static inline void context_set_translation_type(struct context_entry *context,
178 						unsigned long value)
179 {
180 	context->lo &= (((u64)-1) << 4) | 3;
181 	context->lo |= (value & 3) << 2;
182 }
183 
184 static inline void context_set_address_root(struct context_entry *context,
185 					    unsigned long value)
186 {
187 	context->lo &= ~VTD_PAGE_MASK;
188 	context->lo |= value & VTD_PAGE_MASK;
189 }
190 
191 static inline void context_set_address_width(struct context_entry *context,
192 					     unsigned long value)
193 {
194 	context->hi |= value & 7;
195 }
196 
197 static inline void context_set_domain_id(struct context_entry *context,
198 					 unsigned long value)
199 {
200 	context->hi |= (value & ((1 << 16) - 1)) << 8;
201 }
202 
203 static inline void context_set_pasid(struct context_entry *context)
204 {
205 	context->lo |= CONTEXT_PASIDE;
206 }
207 
208 static inline int context_domain_id(struct context_entry *c)
209 {
210 	return((c->hi >> 8) & 0xffff);
211 }
212 
213 static inline void context_clear_entry(struct context_entry *context)
214 {
215 	context->lo = 0;
216 	context->hi = 0;
217 }
218 
219 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
220 {
221 	if (!iommu->copied_tables)
222 		return false;
223 
224 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
225 }
226 
227 static inline void
228 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
229 {
230 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
231 }
232 
233 static inline void
234 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
235 {
236 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
237 }
238 
239 /*
240  * This domain is a statically identity mapping domain.
241  *	1. This domain creats a static 1:1 mapping to all usable memory.
242  * 	2. It maps to each iommu if successful.
243  *	3. Each iommu mapps to this domain if successful.
244  */
245 static struct dmar_domain *si_domain;
246 static int hw_pass_through = 1;
247 
248 struct dmar_rmrr_unit {
249 	struct list_head list;		/* list of rmrr units	*/
250 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
251 	u64	base_address;		/* reserved base address*/
252 	u64	end_address;		/* reserved end address */
253 	struct dmar_dev_scope *devices;	/* target devices */
254 	int	devices_cnt;		/* target device count */
255 };
256 
257 struct dmar_atsr_unit {
258 	struct list_head list;		/* list of ATSR units */
259 	struct acpi_dmar_header *hdr;	/* ACPI header */
260 	struct dmar_dev_scope *devices;	/* target devices */
261 	int devices_cnt;		/* target device count */
262 	u8 include_all:1;		/* include all ports */
263 };
264 
265 struct dmar_satc_unit {
266 	struct list_head list;		/* list of SATC units */
267 	struct acpi_dmar_header *hdr;	/* ACPI header */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	struct intel_iommu *iommu;	/* the corresponding iommu */
270 	int devices_cnt;		/* target device count */
271 	u8 atc_required:1;		/* ATS is required */
272 };
273 
274 static LIST_HEAD(dmar_atsr_units);
275 static LIST_HEAD(dmar_rmrr_units);
276 static LIST_HEAD(dmar_satc_units);
277 
278 #define for_each_rmrr_units(rmrr) \
279 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
280 
281 static void device_block_translation(struct device *dev);
282 static void intel_iommu_domain_free(struct iommu_domain *domain);
283 
284 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
285 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
286 
287 int intel_iommu_enabled = 0;
288 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
289 
290 static int dmar_map_gfx = 1;
291 static int intel_iommu_superpage = 1;
292 static int iommu_identity_mapping;
293 static int iommu_skip_te_disable;
294 
295 #define IDENTMAP_GFX		2
296 #define IDENTMAP_AZALIA		4
297 
298 const struct iommu_ops intel_iommu_ops;
299 
300 static bool translation_pre_enabled(struct intel_iommu *iommu)
301 {
302 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
303 }
304 
305 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
306 {
307 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
308 }
309 
310 static void init_translation_status(struct intel_iommu *iommu)
311 {
312 	u32 gsts;
313 
314 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
315 	if (gsts & DMA_GSTS_TES)
316 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
317 }
318 
319 static int __init intel_iommu_setup(char *str)
320 {
321 	if (!str)
322 		return -EINVAL;
323 
324 	while (*str) {
325 		if (!strncmp(str, "on", 2)) {
326 			dmar_disabled = 0;
327 			pr_info("IOMMU enabled\n");
328 		} else if (!strncmp(str, "off", 3)) {
329 			dmar_disabled = 1;
330 			no_platform_optin = 1;
331 			pr_info("IOMMU disabled\n");
332 		} else if (!strncmp(str, "igfx_off", 8)) {
333 			dmar_map_gfx = 0;
334 			pr_info("Disable GFX device mapping\n");
335 		} else if (!strncmp(str, "forcedac", 8)) {
336 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
337 			iommu_dma_forcedac = true;
338 		} else if (!strncmp(str, "strict", 6)) {
339 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
340 			iommu_set_dma_strict();
341 		} else if (!strncmp(str, "sp_off", 6)) {
342 			pr_info("Disable supported super page\n");
343 			intel_iommu_superpage = 0;
344 		} else if (!strncmp(str, "sm_on", 5)) {
345 			pr_info("Enable scalable mode if hardware supports\n");
346 			intel_iommu_sm = 1;
347 		} else if (!strncmp(str, "sm_off", 6)) {
348 			pr_info("Scalable mode is disallowed\n");
349 			intel_iommu_sm = 0;
350 		} else if (!strncmp(str, "tboot_noforce", 13)) {
351 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
352 			intel_iommu_tboot_noforce = 1;
353 		} else {
354 			pr_notice("Unknown option - '%s'\n", str);
355 		}
356 
357 		str += strcspn(str, ",");
358 		while (*str == ',')
359 			str++;
360 	}
361 
362 	return 1;
363 }
364 __setup("intel_iommu=", intel_iommu_setup);
365 
366 void *alloc_pgtable_page(int node, gfp_t gfp)
367 {
368 	struct page *page;
369 	void *vaddr = NULL;
370 
371 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
372 	if (page)
373 		vaddr = page_address(page);
374 	return vaddr;
375 }
376 
377 void free_pgtable_page(void *vaddr)
378 {
379 	free_page((unsigned long)vaddr);
380 }
381 
382 static inline int domain_type_is_si(struct dmar_domain *domain)
383 {
384 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
385 }
386 
387 static inline int domain_pfn_supported(struct dmar_domain *domain,
388 				       unsigned long pfn)
389 {
390 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
391 
392 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
393 }
394 
395 /*
396  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
397  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
398  * the returned SAGAW.
399  */
400 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
401 {
402 	unsigned long fl_sagaw, sl_sagaw;
403 
404 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
405 	sl_sagaw = cap_sagaw(iommu->cap);
406 
407 	/* Second level only. */
408 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
409 		return sl_sagaw;
410 
411 	/* First level only. */
412 	if (!ecap_slts(iommu->ecap))
413 		return fl_sagaw;
414 
415 	return fl_sagaw & sl_sagaw;
416 }
417 
418 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
419 {
420 	unsigned long sagaw;
421 	int agaw;
422 
423 	sagaw = __iommu_calculate_sagaw(iommu);
424 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
425 		if (test_bit(agaw, &sagaw))
426 			break;
427 	}
428 
429 	return agaw;
430 }
431 
432 /*
433  * Calculate max SAGAW for each iommu.
434  */
435 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
436 {
437 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
438 }
439 
440 /*
441  * calculate agaw for each iommu.
442  * "SAGAW" may be different across iommus, use a default agaw, and
443  * get a supported less agaw for iommus that don't support the default agaw.
444  */
445 int iommu_calculate_agaw(struct intel_iommu *iommu)
446 {
447 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
448 }
449 
450 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
451 {
452 	return sm_supported(iommu) ?
453 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
454 }
455 
456 static void domain_update_iommu_coherency(struct dmar_domain *domain)
457 {
458 	struct iommu_domain_info *info;
459 	struct dmar_drhd_unit *drhd;
460 	struct intel_iommu *iommu;
461 	bool found = false;
462 	unsigned long i;
463 
464 	domain->iommu_coherency = true;
465 	xa_for_each(&domain->iommu_array, i, info) {
466 		found = true;
467 		if (!iommu_paging_structure_coherency(info->iommu)) {
468 			domain->iommu_coherency = false;
469 			break;
470 		}
471 	}
472 	if (found)
473 		return;
474 
475 	/* No hardware attached; use lowest common denominator */
476 	rcu_read_lock();
477 	for_each_active_iommu(iommu, drhd) {
478 		if (!iommu_paging_structure_coherency(iommu)) {
479 			domain->iommu_coherency = false;
480 			break;
481 		}
482 	}
483 	rcu_read_unlock();
484 }
485 
486 static int domain_update_iommu_superpage(struct dmar_domain *domain,
487 					 struct intel_iommu *skip)
488 {
489 	struct dmar_drhd_unit *drhd;
490 	struct intel_iommu *iommu;
491 	int mask = 0x3;
492 
493 	if (!intel_iommu_superpage)
494 		return 0;
495 
496 	/* set iommu_superpage to the smallest common denominator */
497 	rcu_read_lock();
498 	for_each_active_iommu(iommu, drhd) {
499 		if (iommu != skip) {
500 			if (domain && domain->use_first_level) {
501 				if (!cap_fl1gp_support(iommu->cap))
502 					mask = 0x1;
503 			} else {
504 				mask &= cap_super_page_val(iommu->cap);
505 			}
506 
507 			if (!mask)
508 				break;
509 		}
510 	}
511 	rcu_read_unlock();
512 
513 	return fls(mask);
514 }
515 
516 static int domain_update_device_node(struct dmar_domain *domain)
517 {
518 	struct device_domain_info *info;
519 	int nid = NUMA_NO_NODE;
520 	unsigned long flags;
521 
522 	spin_lock_irqsave(&domain->lock, flags);
523 	list_for_each_entry(info, &domain->devices, link) {
524 		/*
525 		 * There could possibly be multiple device numa nodes as devices
526 		 * within the same domain may sit behind different IOMMUs. There
527 		 * isn't perfect answer in such situation, so we select first
528 		 * come first served policy.
529 		 */
530 		nid = dev_to_node(info->dev);
531 		if (nid != NUMA_NO_NODE)
532 			break;
533 	}
534 	spin_unlock_irqrestore(&domain->lock, flags);
535 
536 	return nid;
537 }
538 
539 static void domain_update_iotlb(struct dmar_domain *domain);
540 
541 /* Return the super pagesize bitmap if supported. */
542 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
543 {
544 	unsigned long bitmap = 0;
545 
546 	/*
547 	 * 1-level super page supports page size of 2MiB, 2-level super page
548 	 * supports page size of both 2MiB and 1GiB.
549 	 */
550 	if (domain->iommu_superpage == 1)
551 		bitmap |= SZ_2M;
552 	else if (domain->iommu_superpage == 2)
553 		bitmap |= SZ_2M | SZ_1G;
554 
555 	return bitmap;
556 }
557 
558 /* Some capabilities may be different across iommus */
559 static void domain_update_iommu_cap(struct dmar_domain *domain)
560 {
561 	domain_update_iommu_coherency(domain);
562 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
563 
564 	/*
565 	 * If RHSA is missing, we should default to the device numa domain
566 	 * as fall back.
567 	 */
568 	if (domain->nid == NUMA_NO_NODE)
569 		domain->nid = domain_update_device_node(domain);
570 
571 	/*
572 	 * First-level translation restricts the input-address to a
573 	 * canonical address (i.e., address bits 63:N have the same
574 	 * value as address bit [N-1], where N is 48-bits with 4-level
575 	 * paging and 57-bits with 5-level paging). Hence, skip bit
576 	 * [N-1].
577 	 */
578 	if (domain->use_first_level)
579 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
580 	else
581 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
582 
583 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
584 	domain_update_iotlb(domain);
585 }
586 
587 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
588 					 u8 devfn, int alloc)
589 {
590 	struct root_entry *root = &iommu->root_entry[bus];
591 	struct context_entry *context;
592 	u64 *entry;
593 
594 	/*
595 	 * Except that the caller requested to allocate a new entry,
596 	 * returning a copied context entry makes no sense.
597 	 */
598 	if (!alloc && context_copied(iommu, bus, devfn))
599 		return NULL;
600 
601 	entry = &root->lo;
602 	if (sm_supported(iommu)) {
603 		if (devfn >= 0x80) {
604 			devfn -= 0x80;
605 			entry = &root->hi;
606 		}
607 		devfn *= 2;
608 	}
609 	if (*entry & 1)
610 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
611 	else {
612 		unsigned long phy_addr;
613 		if (!alloc)
614 			return NULL;
615 
616 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
617 		if (!context)
618 			return NULL;
619 
620 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621 		phy_addr = virt_to_phys((void *)context);
622 		*entry = phy_addr | 1;
623 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
624 	}
625 	return &context[devfn];
626 }
627 
628 /**
629  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
630  *				 sub-hierarchy of a candidate PCI-PCI bridge
631  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
632  * @bridge: the candidate PCI-PCI bridge
633  *
634  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
635  */
636 static bool
637 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
638 {
639 	struct pci_dev *pdev, *pbridge;
640 
641 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
642 		return false;
643 
644 	pdev = to_pci_dev(dev);
645 	pbridge = to_pci_dev(bridge);
646 
647 	if (pbridge->subordinate &&
648 	    pbridge->subordinate->number <= pdev->bus->number &&
649 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
650 		return true;
651 
652 	return false;
653 }
654 
655 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
656 {
657 	struct dmar_drhd_unit *drhd;
658 	u32 vtbar;
659 	int rc;
660 
661 	/* We know that this device on this chipset has its own IOMMU.
662 	 * If we find it under a different IOMMU, then the BIOS is lying
663 	 * to us. Hope that the IOMMU for this device is actually
664 	 * disabled, and it needs no translation...
665 	 */
666 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
667 	if (rc) {
668 		/* "can't" happen */
669 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
670 		return false;
671 	}
672 	vtbar &= 0xffff0000;
673 
674 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
675 	drhd = dmar_find_matched_drhd_unit(pdev);
676 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
677 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
678 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
679 		return true;
680 	}
681 
682 	return false;
683 }
684 
685 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
686 {
687 	if (!iommu || iommu->drhd->ignored)
688 		return true;
689 
690 	if (dev_is_pci(dev)) {
691 		struct pci_dev *pdev = to_pci_dev(dev);
692 
693 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
694 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
695 		    quirk_ioat_snb_local_iommu(pdev))
696 			return true;
697 	}
698 
699 	return false;
700 }
701 
702 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
703 {
704 	struct dmar_drhd_unit *drhd = NULL;
705 	struct pci_dev *pdev = NULL;
706 	struct intel_iommu *iommu;
707 	struct device *tmp;
708 	u16 segment = 0;
709 	int i;
710 
711 	if (!dev)
712 		return NULL;
713 
714 	if (dev_is_pci(dev)) {
715 		struct pci_dev *pf_pdev;
716 
717 		pdev = pci_real_dma_dev(to_pci_dev(dev));
718 
719 		/* VFs aren't listed in scope tables; we need to look up
720 		 * the PF instead to find the IOMMU. */
721 		pf_pdev = pci_physfn(pdev);
722 		dev = &pf_pdev->dev;
723 		segment = pci_domain_nr(pdev->bus);
724 	} else if (has_acpi_companion(dev))
725 		dev = &ACPI_COMPANION(dev)->dev;
726 
727 	rcu_read_lock();
728 	for_each_iommu(iommu, drhd) {
729 		if (pdev && segment != drhd->segment)
730 			continue;
731 
732 		for_each_active_dev_scope(drhd->devices,
733 					  drhd->devices_cnt, i, tmp) {
734 			if (tmp == dev) {
735 				/* For a VF use its original BDF# not that of the PF
736 				 * which we used for the IOMMU lookup. Strictly speaking
737 				 * we could do this for all PCI devices; we only need to
738 				 * get the BDF# from the scope table for ACPI matches. */
739 				if (pdev && pdev->is_virtfn)
740 					goto got_pdev;
741 
742 				if (bus && devfn) {
743 					*bus = drhd->devices[i].bus;
744 					*devfn = drhd->devices[i].devfn;
745 				}
746 				goto out;
747 			}
748 
749 			if (is_downstream_to_pci_bridge(dev, tmp))
750 				goto got_pdev;
751 		}
752 
753 		if (pdev && drhd->include_all) {
754 got_pdev:
755 			if (bus && devfn) {
756 				*bus = pdev->bus->number;
757 				*devfn = pdev->devfn;
758 			}
759 			goto out;
760 		}
761 	}
762 	iommu = NULL;
763 out:
764 	if (iommu_is_dummy(iommu, dev))
765 		iommu = NULL;
766 
767 	rcu_read_unlock();
768 
769 	return iommu;
770 }
771 
772 static void domain_flush_cache(struct dmar_domain *domain,
773 			       void *addr, int size)
774 {
775 	if (!domain->iommu_coherency)
776 		clflush_cache_range(addr, size);
777 }
778 
779 static void free_context_table(struct intel_iommu *iommu)
780 {
781 	struct context_entry *context;
782 	int i;
783 
784 	if (!iommu->root_entry)
785 		return;
786 
787 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
788 		context = iommu_context_addr(iommu, i, 0, 0);
789 		if (context)
790 			free_pgtable_page(context);
791 
792 		if (!sm_supported(iommu))
793 			continue;
794 
795 		context = iommu_context_addr(iommu, i, 0x80, 0);
796 		if (context)
797 			free_pgtable_page(context);
798 	}
799 
800 	free_pgtable_page(iommu->root_entry);
801 	iommu->root_entry = NULL;
802 }
803 
804 #ifdef CONFIG_DMAR_DEBUG
805 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
806 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
807 {
808 	struct dma_pte *pte;
809 	int offset;
810 
811 	while (1) {
812 		offset = pfn_level_offset(pfn, level);
813 		pte = &parent[offset];
814 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
815 			pr_info("PTE not present at level %d\n", level);
816 			break;
817 		}
818 
819 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
820 
821 		if (level == 1)
822 			break;
823 
824 		parent = phys_to_virt(dma_pte_addr(pte));
825 		level--;
826 	}
827 }
828 
829 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
830 			  unsigned long long addr, u32 pasid)
831 {
832 	struct pasid_dir_entry *dir, *pde;
833 	struct pasid_entry *entries, *pte;
834 	struct context_entry *ctx_entry;
835 	struct root_entry *rt_entry;
836 	int i, dir_index, index, level;
837 	u8 devfn = source_id & 0xff;
838 	u8 bus = source_id >> 8;
839 	struct dma_pte *pgtable;
840 
841 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
842 
843 	/* root entry dump */
844 	rt_entry = &iommu->root_entry[bus];
845 	if (!rt_entry) {
846 		pr_info("root table entry is not present\n");
847 		return;
848 	}
849 
850 	if (sm_supported(iommu))
851 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
852 			rt_entry->hi, rt_entry->lo);
853 	else
854 		pr_info("root entry: 0x%016llx", rt_entry->lo);
855 
856 	/* context entry dump */
857 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
858 	if (!ctx_entry) {
859 		pr_info("context table entry is not present\n");
860 		return;
861 	}
862 
863 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
864 		ctx_entry->hi, ctx_entry->lo);
865 
866 	/* legacy mode does not require PASID entries */
867 	if (!sm_supported(iommu)) {
868 		level = agaw_to_level(ctx_entry->hi & 7);
869 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
870 		goto pgtable_walk;
871 	}
872 
873 	/* get the pointer to pasid directory entry */
874 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
875 	if (!dir) {
876 		pr_info("pasid directory entry is not present\n");
877 		return;
878 	}
879 	/* For request-without-pasid, get the pasid from context entry */
880 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
881 		pasid = PASID_RID2PASID;
882 
883 	dir_index = pasid >> PASID_PDE_SHIFT;
884 	pde = &dir[dir_index];
885 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
886 
887 	/* get the pointer to the pasid table entry */
888 	entries = get_pasid_table_from_pde(pde);
889 	if (!entries) {
890 		pr_info("pasid table entry is not present\n");
891 		return;
892 	}
893 	index = pasid & PASID_PTE_MASK;
894 	pte = &entries[index];
895 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
896 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
897 
898 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
899 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
900 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
901 	} else {
902 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
903 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
904 	}
905 
906 pgtable_walk:
907 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
908 }
909 #endif
910 
911 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
912 				      unsigned long pfn, int *target_level,
913 				      gfp_t gfp)
914 {
915 	struct dma_pte *parent, *pte;
916 	int level = agaw_to_level(domain->agaw);
917 	int offset;
918 
919 	if (!domain_pfn_supported(domain, pfn))
920 		/* Address beyond IOMMU's addressing capabilities. */
921 		return NULL;
922 
923 	parent = domain->pgd;
924 
925 	while (1) {
926 		void *tmp_page;
927 
928 		offset = pfn_level_offset(pfn, level);
929 		pte = &parent[offset];
930 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 			break;
932 		if (level == *target_level)
933 			break;
934 
935 		if (!dma_pte_present(pte)) {
936 			uint64_t pteval;
937 
938 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
939 
940 			if (!tmp_page)
941 				return NULL;
942 
943 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 			if (domain->use_first_level)
946 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947 
948 			if (cmpxchg64(&pte->val, 0ULL, pteval))
949 				/* Someone else set it while we were thinking; use theirs. */
950 				free_pgtable_page(tmp_page);
951 			else
952 				domain_flush_cache(domain, pte, sizeof(*pte));
953 		}
954 		if (level == 1)
955 			break;
956 
957 		parent = phys_to_virt(dma_pte_addr(pte));
958 		level--;
959 	}
960 
961 	if (!*target_level)
962 		*target_level = level;
963 
964 	return pte;
965 }
966 
967 /* return address's pte at specific level */
968 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 					 unsigned long pfn,
970 					 int level, int *large_page)
971 {
972 	struct dma_pte *parent, *pte;
973 	int total = agaw_to_level(domain->agaw);
974 	int offset;
975 
976 	parent = domain->pgd;
977 	while (level <= total) {
978 		offset = pfn_level_offset(pfn, total);
979 		pte = &parent[offset];
980 		if (level == total)
981 			return pte;
982 
983 		if (!dma_pte_present(pte)) {
984 			*large_page = total;
985 			break;
986 		}
987 
988 		if (dma_pte_superpage(pte)) {
989 			*large_page = total;
990 			return pte;
991 		}
992 
993 		parent = phys_to_virt(dma_pte_addr(pte));
994 		total--;
995 	}
996 	return NULL;
997 }
998 
999 /* clear last level pte, a tlb flush should be followed */
1000 static void dma_pte_clear_range(struct dmar_domain *domain,
1001 				unsigned long start_pfn,
1002 				unsigned long last_pfn)
1003 {
1004 	unsigned int large_page;
1005 	struct dma_pte *first_pte, *pte;
1006 
1007 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1008 	    WARN_ON(start_pfn > last_pfn))
1009 		return;
1010 
1011 	/* we don't need lock here; nobody else touches the iova range */
1012 	do {
1013 		large_page = 1;
1014 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 		if (!pte) {
1016 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 			continue;
1018 		}
1019 		do {
1020 			dma_clear_pte(pte);
1021 			start_pfn += lvl_to_nr_pages(large_page);
1022 			pte++;
1023 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024 
1025 		domain_flush_cache(domain, first_pte,
1026 				   (void *)pte - (void *)first_pte);
1027 
1028 	} while (start_pfn && start_pfn <= last_pfn);
1029 }
1030 
1031 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 			       int retain_level, struct dma_pte *pte,
1033 			       unsigned long pfn, unsigned long start_pfn,
1034 			       unsigned long last_pfn)
1035 {
1036 	pfn = max(start_pfn, pfn);
1037 	pte = &pte[pfn_level_offset(pfn, level)];
1038 
1039 	do {
1040 		unsigned long level_pfn;
1041 		struct dma_pte *level_pte;
1042 
1043 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 			goto next;
1045 
1046 		level_pfn = pfn & level_mask(level);
1047 		level_pte = phys_to_virt(dma_pte_addr(pte));
1048 
1049 		if (level > 2) {
1050 			dma_pte_free_level(domain, level - 1, retain_level,
1051 					   level_pte, level_pfn, start_pfn,
1052 					   last_pfn);
1053 		}
1054 
1055 		/*
1056 		 * Free the page table if we're below the level we want to
1057 		 * retain and the range covers the entire table.
1058 		 */
1059 		if (level < retain_level && !(start_pfn > level_pfn ||
1060 		      last_pfn < level_pfn + level_size(level) - 1)) {
1061 			dma_clear_pte(pte);
1062 			domain_flush_cache(domain, pte, sizeof(*pte));
1063 			free_pgtable_page(level_pte);
1064 		}
1065 next:
1066 		pfn += level_size(level);
1067 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068 }
1069 
1070 /*
1071  * clear last level (leaf) ptes and free page table pages below the
1072  * level we wish to keep intact.
1073  */
1074 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 				   unsigned long start_pfn,
1076 				   unsigned long last_pfn,
1077 				   int retain_level)
1078 {
1079 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1080 
1081 	/* We don't need lock here; nobody else touches the iova range */
1082 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 			   domain->pgd, 0, start_pfn, last_pfn);
1084 
1085 	/* free pgd */
1086 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 		free_pgtable_page(domain->pgd);
1088 		domain->pgd = NULL;
1089 	}
1090 }
1091 
1092 /* When a page at a given level is being unlinked from its parent, we don't
1093    need to *modify* it at all. All we need to do is make a list of all the
1094    pages which can be freed just as soon as we've flushed the IOTLB and we
1095    know the hardware page-walk will no longer touch them.
1096    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097    be freed. */
1098 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 				    int level, struct dma_pte *pte,
1100 				    struct list_head *freelist)
1101 {
1102 	struct page *pg;
1103 
1104 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 	list_add_tail(&pg->lru, freelist);
1106 
1107 	if (level == 1)
1108 		return;
1109 
1110 	pte = page_address(pg);
1111 	do {
1112 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 		pte++;
1115 	} while (!first_pte_in_page(pte));
1116 }
1117 
1118 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 				struct dma_pte *pte, unsigned long pfn,
1120 				unsigned long start_pfn, unsigned long last_pfn,
1121 				struct list_head *freelist)
1122 {
1123 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124 
1125 	pfn = max(start_pfn, pfn);
1126 	pte = &pte[pfn_level_offset(pfn, level)];
1127 
1128 	do {
1129 		unsigned long level_pfn = pfn & level_mask(level);
1130 
1131 		if (!dma_pte_present(pte))
1132 			goto next;
1133 
1134 		/* If range covers entire pagetable, free it */
1135 		if (start_pfn <= level_pfn &&
1136 		    last_pfn >= level_pfn + level_size(level) - 1) {
1137 			/* These suborbinate page tables are going away entirely. Don't
1138 			   bother to clear them; we're just going to *free* them. */
1139 			if (level > 1 && !dma_pte_superpage(pte))
1140 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141 
1142 			dma_clear_pte(pte);
1143 			if (!first_pte)
1144 				first_pte = pte;
1145 			last_pte = pte;
1146 		} else if (level > 1) {
1147 			/* Recurse down into a level that isn't *entirely* obsolete */
1148 			dma_pte_clear_level(domain, level - 1,
1149 					    phys_to_virt(dma_pte_addr(pte)),
1150 					    level_pfn, start_pfn, last_pfn,
1151 					    freelist);
1152 		}
1153 next:
1154 		pfn = level_pfn + level_size(level);
1155 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156 
1157 	if (first_pte)
1158 		domain_flush_cache(domain, first_pte,
1159 				   (void *)++last_pte - (void *)first_pte);
1160 }
1161 
1162 /* We can't just free the pages because the IOMMU may still be walking
1163    the page tables, and may have cached the intermediate levels. The
1164    pages can only be freed after the IOTLB flush has been done. */
1165 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 			 unsigned long last_pfn, struct list_head *freelist)
1167 {
1168 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1169 	    WARN_ON(start_pfn > last_pfn))
1170 		return;
1171 
1172 	/* we don't need lock here; nobody else touches the iova range */
1173 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1175 
1176 	/* free pgd */
1177 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 		struct page *pgd_page = virt_to_page(domain->pgd);
1179 		list_add_tail(&pgd_page->lru, freelist);
1180 		domain->pgd = NULL;
1181 	}
1182 }
1183 
1184 /* iommu handling */
1185 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186 {
1187 	struct root_entry *root;
1188 
1189 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1190 	if (!root) {
1191 		pr_err("Allocating root entry for %s failed\n",
1192 			iommu->name);
1193 		return -ENOMEM;
1194 	}
1195 
1196 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 	iommu->root_entry = root;
1198 
1199 	return 0;
1200 }
1201 
1202 static void iommu_set_root_entry(struct intel_iommu *iommu)
1203 {
1204 	u64 addr;
1205 	u32 sts;
1206 	unsigned long flag;
1207 
1208 	addr = virt_to_phys(iommu->root_entry);
1209 	if (sm_supported(iommu))
1210 		addr |= DMA_RTADDR_SMT;
1211 
1212 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214 
1215 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216 
1217 	/* Make sure hardware complete it */
1218 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 		      readl, (sts & DMA_GSTS_RTPS), sts);
1220 
1221 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222 
1223 	/*
1224 	 * Hardware invalidates all DMA remapping hardware translation
1225 	 * caches as part of SRTP flow.
1226 	 */
1227 	if (cap_esrtps(iommu->cap))
1228 		return;
1229 
1230 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 	if (sm_supported(iommu))
1232 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234 }
1235 
1236 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237 {
1238 	u32 val;
1239 	unsigned long flag;
1240 
1241 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 		return;
1243 
1244 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246 
1247 	/* Make sure hardware complete it */
1248 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1250 
1251 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252 }
1253 
1254 /* return value determine if we need a write buffer flush */
1255 static void __iommu_flush_context(struct intel_iommu *iommu,
1256 				  u16 did, u16 source_id, u8 function_mask,
1257 				  u64 type)
1258 {
1259 	u64 val = 0;
1260 	unsigned long flag;
1261 
1262 	switch (type) {
1263 	case DMA_CCMD_GLOBAL_INVL:
1264 		val = DMA_CCMD_GLOBAL_INVL;
1265 		break;
1266 	case DMA_CCMD_DOMAIN_INVL:
1267 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 		break;
1269 	case DMA_CCMD_DEVICE_INVL:
1270 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 		break;
1273 	default:
1274 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1275 			iommu->name, type);
1276 		return;
1277 	}
1278 	val |= DMA_CCMD_ICC;
1279 
1280 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1281 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1282 
1283 	/* Make sure hardware complete it */
1284 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1285 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1286 
1287 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1288 }
1289 
1290 /* return value determine if we need a write buffer flush */
1291 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1292 				u64 addr, unsigned int size_order, u64 type)
1293 {
1294 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1295 	u64 val = 0, val_iva = 0;
1296 	unsigned long flag;
1297 
1298 	switch (type) {
1299 	case DMA_TLB_GLOBAL_FLUSH:
1300 		/* global flush doesn't need set IVA_REG */
1301 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1302 		break;
1303 	case DMA_TLB_DSI_FLUSH:
1304 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1305 		break;
1306 	case DMA_TLB_PSI_FLUSH:
1307 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1308 		/* IH bit is passed in as part of address */
1309 		val_iva = size_order | addr;
1310 		break;
1311 	default:
1312 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1313 			iommu->name, type);
1314 		return;
1315 	}
1316 
1317 	if (cap_write_drain(iommu->cap))
1318 		val |= DMA_TLB_WRITE_DRAIN;
1319 
1320 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1321 	/* Note: Only uses first TLB reg currently */
1322 	if (val_iva)
1323 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1324 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1325 
1326 	/* Make sure hardware complete it */
1327 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1328 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1329 
1330 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1331 
1332 	/* check IOTLB invalidation granularity */
1333 	if (DMA_TLB_IAIG(val) == 0)
1334 		pr_err("Flush IOTLB failed\n");
1335 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1336 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1337 			(unsigned long long)DMA_TLB_IIRG(type),
1338 			(unsigned long long)DMA_TLB_IAIG(val));
1339 }
1340 
1341 static struct device_domain_info *
1342 domain_lookup_dev_info(struct dmar_domain *domain,
1343 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1344 {
1345 	struct device_domain_info *info;
1346 	unsigned long flags;
1347 
1348 	spin_lock_irqsave(&domain->lock, flags);
1349 	list_for_each_entry(info, &domain->devices, link) {
1350 		if (info->iommu == iommu && info->bus == bus &&
1351 		    info->devfn == devfn) {
1352 			spin_unlock_irqrestore(&domain->lock, flags);
1353 			return info;
1354 		}
1355 	}
1356 	spin_unlock_irqrestore(&domain->lock, flags);
1357 
1358 	return NULL;
1359 }
1360 
1361 static void domain_update_iotlb(struct dmar_domain *domain)
1362 {
1363 	struct device_domain_info *info;
1364 	bool has_iotlb_device = false;
1365 	unsigned long flags;
1366 
1367 	spin_lock_irqsave(&domain->lock, flags);
1368 	list_for_each_entry(info, &domain->devices, link) {
1369 		if (info->ats_enabled) {
1370 			has_iotlb_device = true;
1371 			break;
1372 		}
1373 	}
1374 	domain->has_iotlb_device = has_iotlb_device;
1375 	spin_unlock_irqrestore(&domain->lock, flags);
1376 }
1377 
1378 /*
1379  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1380  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1381  * check because it applies only to the built-in QAT devices and it doesn't
1382  * grant additional privileges.
1383  */
1384 #define BUGGY_QAT_DEVID_MASK 0x4940
1385 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1386 {
1387 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1388 		return false;
1389 
1390 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1391 		return false;
1392 
1393 	return true;
1394 }
1395 
1396 static void iommu_enable_pci_caps(struct device_domain_info *info)
1397 {
1398 	struct pci_dev *pdev;
1399 
1400 	if (!dev_is_pci(info->dev))
1401 		return;
1402 
1403 	pdev = to_pci_dev(info->dev);
1404 
1405 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1406 	   the device if you enable PASID support after ATS support is
1407 	   undefined. So always enable PASID support on devices which
1408 	   have it, even if we can't yet know if we're ever going to
1409 	   use it. */
1410 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1411 		info->pasid_enabled = 1;
1412 
1413 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1414 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1415 		info->ats_enabled = 1;
1416 		domain_update_iotlb(info->domain);
1417 	}
1418 }
1419 
1420 static void iommu_disable_pci_caps(struct device_domain_info *info)
1421 {
1422 	struct pci_dev *pdev;
1423 
1424 	if (!dev_is_pci(info->dev))
1425 		return;
1426 
1427 	pdev = to_pci_dev(info->dev);
1428 
1429 	if (info->ats_enabled) {
1430 		pci_disable_ats(pdev);
1431 		info->ats_enabled = 0;
1432 		domain_update_iotlb(info->domain);
1433 	}
1434 
1435 	if (info->pasid_enabled) {
1436 		pci_disable_pasid(pdev);
1437 		info->pasid_enabled = 0;
1438 	}
1439 }
1440 
1441 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1442 				    u64 addr, unsigned int mask)
1443 {
1444 	u16 sid, qdep;
1445 
1446 	if (!info || !info->ats_enabled)
1447 		return;
1448 
1449 	sid = info->bus << 8 | info->devfn;
1450 	qdep = info->ats_qdep;
1451 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1452 			   qdep, addr, mask);
1453 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1454 }
1455 
1456 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1457 				  u64 addr, unsigned mask)
1458 {
1459 	struct device_domain_info *info;
1460 	unsigned long flags;
1461 
1462 	if (!domain->has_iotlb_device)
1463 		return;
1464 
1465 	spin_lock_irqsave(&domain->lock, flags);
1466 	list_for_each_entry(info, &domain->devices, link)
1467 		__iommu_flush_dev_iotlb(info, addr, mask);
1468 	spin_unlock_irqrestore(&domain->lock, flags);
1469 }
1470 
1471 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1472 				  struct dmar_domain *domain,
1473 				  unsigned long pfn, unsigned int pages,
1474 				  int ih, int map)
1475 {
1476 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1477 	unsigned int mask = ilog2(aligned_pages);
1478 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1479 	u16 did = domain_id_iommu(domain, iommu);
1480 
1481 	if (WARN_ON(!pages))
1482 		return;
1483 
1484 	if (ih)
1485 		ih = 1 << 6;
1486 
1487 	if (domain->use_first_level) {
1488 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1489 	} else {
1490 		unsigned long bitmask = aligned_pages - 1;
1491 
1492 		/*
1493 		 * PSI masks the low order bits of the base address. If the
1494 		 * address isn't aligned to the mask, then compute a mask value
1495 		 * needed to ensure the target range is flushed.
1496 		 */
1497 		if (unlikely(bitmask & pfn)) {
1498 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1499 
1500 			/*
1501 			 * Since end_pfn <= pfn + bitmask, the only way bits
1502 			 * higher than bitmask can differ in pfn and end_pfn is
1503 			 * by carrying. This means after masking out bitmask,
1504 			 * high bits starting with the first set bit in
1505 			 * shared_bits are all equal in both pfn and end_pfn.
1506 			 */
1507 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1508 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1509 		}
1510 
1511 		/*
1512 		 * Fallback to domain selective flush if no PSI support or
1513 		 * the size is too big.
1514 		 */
1515 		if (!cap_pgsel_inv(iommu->cap) ||
1516 		    mask > cap_max_amask_val(iommu->cap))
1517 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1518 							DMA_TLB_DSI_FLUSH);
1519 		else
1520 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1521 							DMA_TLB_PSI_FLUSH);
1522 	}
1523 
1524 	/*
1525 	 * In caching mode, changes of pages from non-present to present require
1526 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1527 	 */
1528 	if (!cap_caching_mode(iommu->cap) || !map)
1529 		iommu_flush_dev_iotlb(domain, addr, mask);
1530 }
1531 
1532 /* Notification for newly created mappings */
1533 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1534 					struct dmar_domain *domain,
1535 					unsigned long pfn, unsigned int pages)
1536 {
1537 	/*
1538 	 * It's a non-present to present mapping. Only flush if caching mode
1539 	 * and second level.
1540 	 */
1541 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1542 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1543 	else
1544 		iommu_flush_write_buffer(iommu);
1545 }
1546 
1547 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1548 {
1549 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1550 	struct iommu_domain_info *info;
1551 	unsigned long idx;
1552 
1553 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1554 		struct intel_iommu *iommu = info->iommu;
1555 		u16 did = domain_id_iommu(dmar_domain, iommu);
1556 
1557 		if (dmar_domain->use_first_level)
1558 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1559 		else
1560 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1561 						 DMA_TLB_DSI_FLUSH);
1562 
1563 		if (!cap_caching_mode(iommu->cap))
1564 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1565 	}
1566 }
1567 
1568 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1569 {
1570 	u32 pmen;
1571 	unsigned long flags;
1572 
1573 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1574 		return;
1575 
1576 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1578 	pmen &= ~DMA_PMEN_EPM;
1579 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1580 
1581 	/* wait for the protected region status bit to clear */
1582 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1583 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1584 
1585 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1586 }
1587 
1588 static void iommu_enable_translation(struct intel_iommu *iommu)
1589 {
1590 	u32 sts;
1591 	unsigned long flags;
1592 
1593 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1594 	iommu->gcmd |= DMA_GCMD_TE;
1595 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1596 
1597 	/* Make sure hardware complete it */
1598 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1599 		      readl, (sts & DMA_GSTS_TES), sts);
1600 
1601 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1602 }
1603 
1604 static void iommu_disable_translation(struct intel_iommu *iommu)
1605 {
1606 	u32 sts;
1607 	unsigned long flag;
1608 
1609 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1610 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1611 		return;
1612 
1613 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1614 	iommu->gcmd &= ~DMA_GCMD_TE;
1615 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1616 
1617 	/* Make sure hardware complete it */
1618 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1619 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1620 
1621 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1622 }
1623 
1624 static int iommu_init_domains(struct intel_iommu *iommu)
1625 {
1626 	u32 ndomains;
1627 
1628 	ndomains = cap_ndoms(iommu->cap);
1629 	pr_debug("%s: Number of Domains supported <%d>\n",
1630 		 iommu->name, ndomains);
1631 
1632 	spin_lock_init(&iommu->lock);
1633 
1634 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1635 	if (!iommu->domain_ids)
1636 		return -ENOMEM;
1637 
1638 	/*
1639 	 * If Caching mode is set, then invalid translations are tagged
1640 	 * with domain-id 0, hence we need to pre-allocate it. We also
1641 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1642 	 * make sure it is not used for a real domain.
1643 	 */
1644 	set_bit(0, iommu->domain_ids);
1645 
1646 	/*
1647 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1648 	 * entry for first-level or pass-through translation modes should
1649 	 * be programmed with a domain id different from those used for
1650 	 * second-level or nested translation. We reserve a domain id for
1651 	 * this purpose.
1652 	 */
1653 	if (sm_supported(iommu))
1654 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1655 
1656 	return 0;
1657 }
1658 
1659 static void disable_dmar_iommu(struct intel_iommu *iommu)
1660 {
1661 	if (!iommu->domain_ids)
1662 		return;
1663 
1664 	/*
1665 	 * All iommu domains must have been detached from the devices,
1666 	 * hence there should be no domain IDs in use.
1667 	 */
1668 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1669 		    > NUM_RESERVED_DID))
1670 		return;
1671 
1672 	if (iommu->gcmd & DMA_GCMD_TE)
1673 		iommu_disable_translation(iommu);
1674 }
1675 
1676 static void free_dmar_iommu(struct intel_iommu *iommu)
1677 {
1678 	if (iommu->domain_ids) {
1679 		bitmap_free(iommu->domain_ids);
1680 		iommu->domain_ids = NULL;
1681 	}
1682 
1683 	if (iommu->copied_tables) {
1684 		bitmap_free(iommu->copied_tables);
1685 		iommu->copied_tables = NULL;
1686 	}
1687 
1688 	/* free context mapping */
1689 	free_context_table(iommu);
1690 
1691 #ifdef CONFIG_INTEL_IOMMU_SVM
1692 	if (pasid_supported(iommu)) {
1693 		if (ecap_prs(iommu->ecap))
1694 			intel_svm_finish_prq(iommu);
1695 	}
1696 #endif
1697 }
1698 
1699 /*
1700  * Check and return whether first level is used by default for
1701  * DMA translation.
1702  */
1703 static bool first_level_by_default(unsigned int type)
1704 {
1705 	/* Only SL is available in legacy mode */
1706 	if (!scalable_mode_support())
1707 		return false;
1708 
1709 	/* Only level (either FL or SL) is available, just use it */
1710 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1711 		return intel_cap_flts_sanity();
1712 
1713 	/* Both levels are available, decide it based on domain type */
1714 	return type != IOMMU_DOMAIN_UNMANAGED;
1715 }
1716 
1717 static struct dmar_domain *alloc_domain(unsigned int type)
1718 {
1719 	struct dmar_domain *domain;
1720 
1721 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1722 	if (!domain)
1723 		return NULL;
1724 
1725 	domain->nid = NUMA_NO_NODE;
1726 	if (first_level_by_default(type))
1727 		domain->use_first_level = true;
1728 	domain->has_iotlb_device = false;
1729 	INIT_LIST_HEAD(&domain->devices);
1730 	spin_lock_init(&domain->lock);
1731 	xa_init(&domain->iommu_array);
1732 
1733 	return domain;
1734 }
1735 
1736 static int domain_attach_iommu(struct dmar_domain *domain,
1737 			       struct intel_iommu *iommu)
1738 {
1739 	struct iommu_domain_info *info, *curr;
1740 	unsigned long ndomains;
1741 	int num, ret = -ENOSPC;
1742 
1743 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1744 	if (!info)
1745 		return -ENOMEM;
1746 
1747 	spin_lock(&iommu->lock);
1748 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1749 	if (curr) {
1750 		curr->refcnt++;
1751 		spin_unlock(&iommu->lock);
1752 		kfree(info);
1753 		return 0;
1754 	}
1755 
1756 	ndomains = cap_ndoms(iommu->cap);
1757 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1758 	if (num >= ndomains) {
1759 		pr_err("%s: No free domain ids\n", iommu->name);
1760 		goto err_unlock;
1761 	}
1762 
1763 	set_bit(num, iommu->domain_ids);
1764 	info->refcnt	= 1;
1765 	info->did	= num;
1766 	info->iommu	= iommu;
1767 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1768 			  NULL, info, GFP_ATOMIC);
1769 	if (curr) {
1770 		ret = xa_err(curr) ? : -EBUSY;
1771 		goto err_clear;
1772 	}
1773 	domain_update_iommu_cap(domain);
1774 
1775 	spin_unlock(&iommu->lock);
1776 	return 0;
1777 
1778 err_clear:
1779 	clear_bit(info->did, iommu->domain_ids);
1780 err_unlock:
1781 	spin_unlock(&iommu->lock);
1782 	kfree(info);
1783 	return ret;
1784 }
1785 
1786 static void domain_detach_iommu(struct dmar_domain *domain,
1787 				struct intel_iommu *iommu)
1788 {
1789 	struct iommu_domain_info *info;
1790 
1791 	spin_lock(&iommu->lock);
1792 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1793 	if (--info->refcnt == 0) {
1794 		clear_bit(info->did, iommu->domain_ids);
1795 		xa_erase(&domain->iommu_array, iommu->seq_id);
1796 		domain->nid = NUMA_NO_NODE;
1797 		domain_update_iommu_cap(domain);
1798 		kfree(info);
1799 	}
1800 	spin_unlock(&iommu->lock);
1801 }
1802 
1803 static inline int guestwidth_to_adjustwidth(int gaw)
1804 {
1805 	int agaw;
1806 	int r = (gaw - 12) % 9;
1807 
1808 	if (r == 0)
1809 		agaw = gaw;
1810 	else
1811 		agaw = gaw + 9 - r;
1812 	if (agaw > 64)
1813 		agaw = 64;
1814 	return agaw;
1815 }
1816 
1817 static void domain_exit(struct dmar_domain *domain)
1818 {
1819 	if (domain->pgd) {
1820 		LIST_HEAD(freelist);
1821 
1822 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1823 		put_pages_list(&freelist);
1824 	}
1825 
1826 	if (WARN_ON(!list_empty(&domain->devices)))
1827 		return;
1828 
1829 	kfree(domain);
1830 }
1831 
1832 /*
1833  * Get the PASID directory size for scalable mode context entry.
1834  * Value of X in the PDTS field of a scalable mode context entry
1835  * indicates PASID directory with 2^(X + 7) entries.
1836  */
1837 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1838 {
1839 	unsigned long pds, max_pde;
1840 
1841 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1842 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1843 	if (pds < 7)
1844 		return 0;
1845 
1846 	return pds - 7;
1847 }
1848 
1849 /*
1850  * Set the RID_PASID field of a scalable mode context entry. The
1851  * IOMMU hardware will use the PASID value set in this field for
1852  * DMA translations of DMA requests without PASID.
1853  */
1854 static inline void
1855 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1856 {
1857 	context->hi |= pasid & ((1 << 20) - 1);
1858 }
1859 
1860 /*
1861  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1862  * entry.
1863  */
1864 static inline void context_set_sm_dte(struct context_entry *context)
1865 {
1866 	context->lo |= BIT_ULL(2);
1867 }
1868 
1869 /*
1870  * Set the PRE(Page Request Enable) field of a scalable mode context
1871  * entry.
1872  */
1873 static inline void context_set_sm_pre(struct context_entry *context)
1874 {
1875 	context->lo |= BIT_ULL(4);
1876 }
1877 
1878 /* Convert value to context PASID directory size field coding. */
1879 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1880 
1881 static int domain_context_mapping_one(struct dmar_domain *domain,
1882 				      struct intel_iommu *iommu,
1883 				      struct pasid_table *table,
1884 				      u8 bus, u8 devfn)
1885 {
1886 	struct device_domain_info *info =
1887 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1888 	u16 did = domain_id_iommu(domain, iommu);
1889 	int translation = CONTEXT_TT_MULTI_LEVEL;
1890 	struct context_entry *context;
1891 	int ret;
1892 
1893 	if (hw_pass_through && domain_type_is_si(domain))
1894 		translation = CONTEXT_TT_PASS_THROUGH;
1895 
1896 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1897 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1898 
1899 	spin_lock(&iommu->lock);
1900 	ret = -ENOMEM;
1901 	context = iommu_context_addr(iommu, bus, devfn, 1);
1902 	if (!context)
1903 		goto out_unlock;
1904 
1905 	ret = 0;
1906 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1907 		goto out_unlock;
1908 
1909 	/*
1910 	 * For kdump cases, old valid entries may be cached due to the
1911 	 * in-flight DMA and copied pgtable, but there is no unmapping
1912 	 * behaviour for them, thus we need an explicit cache flush for
1913 	 * the newly-mapped device. For kdump, at this point, the device
1914 	 * is supposed to finish reset at its driver probe stage, so no
1915 	 * in-flight DMA will exist, and we don't need to worry anymore
1916 	 * hereafter.
1917 	 */
1918 	if (context_copied(iommu, bus, devfn)) {
1919 		u16 did_old = context_domain_id(context);
1920 
1921 		if (did_old < cap_ndoms(iommu->cap)) {
1922 			iommu->flush.flush_context(iommu, did_old,
1923 						   (((u16)bus) << 8) | devfn,
1924 						   DMA_CCMD_MASK_NOBIT,
1925 						   DMA_CCMD_DEVICE_INVL);
1926 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1927 						 DMA_TLB_DSI_FLUSH);
1928 		}
1929 
1930 		clear_context_copied(iommu, bus, devfn);
1931 	}
1932 
1933 	context_clear_entry(context);
1934 
1935 	if (sm_supported(iommu)) {
1936 		unsigned long pds;
1937 
1938 		/* Setup the PASID DIR pointer: */
1939 		pds = context_get_sm_pds(table);
1940 		context->lo = (u64)virt_to_phys(table->table) |
1941 				context_pdts(pds);
1942 
1943 		/* Setup the RID_PASID field: */
1944 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1945 
1946 		/*
1947 		 * Setup the Device-TLB enable bit and Page request
1948 		 * Enable bit:
1949 		 */
1950 		if (info && info->ats_supported)
1951 			context_set_sm_dte(context);
1952 		if (info && info->pri_supported)
1953 			context_set_sm_pre(context);
1954 		if (info && info->pasid_supported)
1955 			context_set_pasid(context);
1956 	} else {
1957 		struct dma_pte *pgd = domain->pgd;
1958 		int agaw;
1959 
1960 		context_set_domain_id(context, did);
1961 
1962 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1963 			/*
1964 			 * Skip top levels of page tables for iommu which has
1965 			 * less agaw than default. Unnecessary for PT mode.
1966 			 */
1967 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1968 				ret = -ENOMEM;
1969 				pgd = phys_to_virt(dma_pte_addr(pgd));
1970 				if (!dma_pte_present(pgd))
1971 					goto out_unlock;
1972 			}
1973 
1974 			if (info && info->ats_supported)
1975 				translation = CONTEXT_TT_DEV_IOTLB;
1976 			else
1977 				translation = CONTEXT_TT_MULTI_LEVEL;
1978 
1979 			context_set_address_root(context, virt_to_phys(pgd));
1980 			context_set_address_width(context, agaw);
1981 		} else {
1982 			/*
1983 			 * In pass through mode, AW must be programmed to
1984 			 * indicate the largest AGAW value supported by
1985 			 * hardware. And ASR is ignored by hardware.
1986 			 */
1987 			context_set_address_width(context, iommu->msagaw);
1988 		}
1989 
1990 		context_set_translation_type(context, translation);
1991 	}
1992 
1993 	context_set_fault_enable(context);
1994 	context_set_present(context);
1995 	if (!ecap_coherent(iommu->ecap))
1996 		clflush_cache_range(context, sizeof(*context));
1997 
1998 	/*
1999 	 * It's a non-present to present mapping. If hardware doesn't cache
2000 	 * non-present entry we only need to flush the write-buffer. If the
2001 	 * _does_ cache non-present entries, then it does so in the special
2002 	 * domain #0, which we have to flush:
2003 	 */
2004 	if (cap_caching_mode(iommu->cap)) {
2005 		iommu->flush.flush_context(iommu, 0,
2006 					   (((u16)bus) << 8) | devfn,
2007 					   DMA_CCMD_MASK_NOBIT,
2008 					   DMA_CCMD_DEVICE_INVL);
2009 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2010 	} else {
2011 		iommu_flush_write_buffer(iommu);
2012 	}
2013 
2014 	ret = 0;
2015 
2016 out_unlock:
2017 	spin_unlock(&iommu->lock);
2018 
2019 	return ret;
2020 }
2021 
2022 struct domain_context_mapping_data {
2023 	struct dmar_domain *domain;
2024 	struct intel_iommu *iommu;
2025 	struct pasid_table *table;
2026 };
2027 
2028 static int domain_context_mapping_cb(struct pci_dev *pdev,
2029 				     u16 alias, void *opaque)
2030 {
2031 	struct domain_context_mapping_data *data = opaque;
2032 
2033 	return domain_context_mapping_one(data->domain, data->iommu,
2034 					  data->table, PCI_BUS_NUM(alias),
2035 					  alias & 0xff);
2036 }
2037 
2038 static int
2039 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2040 {
2041 	struct domain_context_mapping_data data;
2042 	struct pasid_table *table;
2043 	struct intel_iommu *iommu;
2044 	u8 bus, devfn;
2045 
2046 	iommu = device_to_iommu(dev, &bus, &devfn);
2047 	if (!iommu)
2048 		return -ENODEV;
2049 
2050 	table = intel_pasid_get_table(dev);
2051 
2052 	if (!dev_is_pci(dev))
2053 		return domain_context_mapping_one(domain, iommu, table,
2054 						  bus, devfn);
2055 
2056 	data.domain = domain;
2057 	data.iommu = iommu;
2058 	data.table = table;
2059 
2060 	return pci_for_each_dma_alias(to_pci_dev(dev),
2061 				      &domain_context_mapping_cb, &data);
2062 }
2063 
2064 /* Returns a number of VTD pages, but aligned to MM page size */
2065 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2066 					    size_t size)
2067 {
2068 	host_addr &= ~PAGE_MASK;
2069 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2070 }
2071 
2072 /* Return largest possible superpage level for a given mapping */
2073 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2074 					  unsigned long iov_pfn,
2075 					  unsigned long phy_pfn,
2076 					  unsigned long pages)
2077 {
2078 	int support, level = 1;
2079 	unsigned long pfnmerge;
2080 
2081 	support = domain->iommu_superpage;
2082 
2083 	/* To use a large page, the virtual *and* physical addresses
2084 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2085 	   of them will mean we have to use smaller pages. So just
2086 	   merge them and check both at once. */
2087 	pfnmerge = iov_pfn | phy_pfn;
2088 
2089 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2090 		pages >>= VTD_STRIDE_SHIFT;
2091 		if (!pages)
2092 			break;
2093 		pfnmerge >>= VTD_STRIDE_SHIFT;
2094 		level++;
2095 		support--;
2096 	}
2097 	return level;
2098 }
2099 
2100 /*
2101  * Ensure that old small page tables are removed to make room for superpage(s).
2102  * We're going to add new large pages, so make sure we don't remove their parent
2103  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2104  */
2105 static void switch_to_super_page(struct dmar_domain *domain,
2106 				 unsigned long start_pfn,
2107 				 unsigned long end_pfn, int level)
2108 {
2109 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2110 	struct iommu_domain_info *info;
2111 	struct dma_pte *pte = NULL;
2112 	unsigned long i;
2113 
2114 	while (start_pfn <= end_pfn) {
2115 		if (!pte)
2116 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2117 					     GFP_ATOMIC);
2118 
2119 		if (dma_pte_present(pte)) {
2120 			dma_pte_free_pagetable(domain, start_pfn,
2121 					       start_pfn + lvl_pages - 1,
2122 					       level + 1);
2123 
2124 			xa_for_each(&domain->iommu_array, i, info)
2125 				iommu_flush_iotlb_psi(info->iommu, domain,
2126 						      start_pfn, lvl_pages,
2127 						      0, 0);
2128 		}
2129 
2130 		pte++;
2131 		start_pfn += lvl_pages;
2132 		if (first_pte_in_page(pte))
2133 			pte = NULL;
2134 	}
2135 }
2136 
2137 static int
2138 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2139 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2140 		 gfp_t gfp)
2141 {
2142 	struct dma_pte *first_pte = NULL, *pte = NULL;
2143 	unsigned int largepage_lvl = 0;
2144 	unsigned long lvl_pages = 0;
2145 	phys_addr_t pteval;
2146 	u64 attr;
2147 
2148 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2149 		return -EINVAL;
2150 
2151 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2152 		return -EINVAL;
2153 
2154 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2155 	attr |= DMA_FL_PTE_PRESENT;
2156 	if (domain->use_first_level) {
2157 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2158 		if (prot & DMA_PTE_WRITE)
2159 			attr |= DMA_FL_PTE_DIRTY;
2160 	}
2161 
2162 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2163 
2164 	while (nr_pages > 0) {
2165 		uint64_t tmp;
2166 
2167 		if (!pte) {
2168 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2169 					phys_pfn, nr_pages);
2170 
2171 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2172 					     gfp);
2173 			if (!pte)
2174 				return -ENOMEM;
2175 			first_pte = pte;
2176 
2177 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2178 
2179 			/* It is large page*/
2180 			if (largepage_lvl > 1) {
2181 				unsigned long end_pfn;
2182 				unsigned long pages_to_remove;
2183 
2184 				pteval |= DMA_PTE_LARGE_PAGE;
2185 				pages_to_remove = min_t(unsigned long, nr_pages,
2186 							nr_pte_to_next_page(pte) * lvl_pages);
2187 				end_pfn = iov_pfn + pages_to_remove - 1;
2188 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2189 			} else {
2190 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2191 			}
2192 
2193 		}
2194 		/* We don't need lock here, nobody else
2195 		 * touches the iova range
2196 		 */
2197 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2198 		if (tmp) {
2199 			static int dumps = 5;
2200 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2201 				iov_pfn, tmp, (unsigned long long)pteval);
2202 			if (dumps) {
2203 				dumps--;
2204 				debug_dma_dump_mappings(NULL);
2205 			}
2206 			WARN_ON(1);
2207 		}
2208 
2209 		nr_pages -= lvl_pages;
2210 		iov_pfn += lvl_pages;
2211 		phys_pfn += lvl_pages;
2212 		pteval += lvl_pages * VTD_PAGE_SIZE;
2213 
2214 		/* If the next PTE would be the first in a new page, then we
2215 		 * need to flush the cache on the entries we've just written.
2216 		 * And then we'll need to recalculate 'pte', so clear it and
2217 		 * let it get set again in the if (!pte) block above.
2218 		 *
2219 		 * If we're done (!nr_pages) we need to flush the cache too.
2220 		 *
2221 		 * Also if we've been setting superpages, we may need to
2222 		 * recalculate 'pte' and switch back to smaller pages for the
2223 		 * end of the mapping, if the trailing size is not enough to
2224 		 * use another superpage (i.e. nr_pages < lvl_pages).
2225 		 */
2226 		pte++;
2227 		if (!nr_pages || first_pte_in_page(pte) ||
2228 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2229 			domain_flush_cache(domain, first_pte,
2230 					   (void *)pte - (void *)first_pte);
2231 			pte = NULL;
2232 		}
2233 	}
2234 
2235 	return 0;
2236 }
2237 
2238 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2239 {
2240 	struct intel_iommu *iommu = info->iommu;
2241 	struct context_entry *context;
2242 	u16 did_old;
2243 
2244 	if (!iommu)
2245 		return;
2246 
2247 	spin_lock(&iommu->lock);
2248 	context = iommu_context_addr(iommu, bus, devfn, 0);
2249 	if (!context) {
2250 		spin_unlock(&iommu->lock);
2251 		return;
2252 	}
2253 
2254 	if (sm_supported(iommu)) {
2255 		if (hw_pass_through && domain_type_is_si(info->domain))
2256 			did_old = FLPT_DEFAULT_DID;
2257 		else
2258 			did_old = domain_id_iommu(info->domain, iommu);
2259 	} else {
2260 		did_old = context_domain_id(context);
2261 	}
2262 
2263 	context_clear_entry(context);
2264 	__iommu_flush_cache(iommu, context, sizeof(*context));
2265 	spin_unlock(&iommu->lock);
2266 	iommu->flush.flush_context(iommu,
2267 				   did_old,
2268 				   (((u16)bus) << 8) | devfn,
2269 				   DMA_CCMD_MASK_NOBIT,
2270 				   DMA_CCMD_DEVICE_INVL);
2271 
2272 	if (sm_supported(iommu))
2273 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2274 
2275 	iommu->flush.flush_iotlb(iommu,
2276 				 did_old,
2277 				 0,
2278 				 0,
2279 				 DMA_TLB_DSI_FLUSH);
2280 
2281 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2282 }
2283 
2284 static int domain_setup_first_level(struct intel_iommu *iommu,
2285 				    struct dmar_domain *domain,
2286 				    struct device *dev,
2287 				    u32 pasid)
2288 {
2289 	struct dma_pte *pgd = domain->pgd;
2290 	int agaw, level;
2291 	int flags = 0;
2292 
2293 	/*
2294 	 * Skip top levels of page tables for iommu which has
2295 	 * less agaw than default. Unnecessary for PT mode.
2296 	 */
2297 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2298 		pgd = phys_to_virt(dma_pte_addr(pgd));
2299 		if (!dma_pte_present(pgd))
2300 			return -ENOMEM;
2301 	}
2302 
2303 	level = agaw_to_level(agaw);
2304 	if (level != 4 && level != 5)
2305 		return -EINVAL;
2306 
2307 	if (level == 5)
2308 		flags |= PASID_FLAG_FL5LP;
2309 
2310 	if (domain->force_snooping)
2311 		flags |= PASID_FLAG_PAGE_SNOOP;
2312 
2313 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2314 					     domain_id_iommu(domain, iommu),
2315 					     flags);
2316 }
2317 
2318 static bool dev_is_real_dma_subdevice(struct device *dev)
2319 {
2320 	return dev && dev_is_pci(dev) &&
2321 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2322 }
2323 
2324 static int iommu_domain_identity_map(struct dmar_domain *domain,
2325 				     unsigned long first_vpfn,
2326 				     unsigned long last_vpfn)
2327 {
2328 	/*
2329 	 * RMRR range might have overlap with physical memory range,
2330 	 * clear it first
2331 	 */
2332 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2333 
2334 	return __domain_mapping(domain, first_vpfn,
2335 				first_vpfn, last_vpfn - first_vpfn + 1,
2336 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2337 }
2338 
2339 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2340 
2341 static int __init si_domain_init(int hw)
2342 {
2343 	struct dmar_rmrr_unit *rmrr;
2344 	struct device *dev;
2345 	int i, nid, ret;
2346 
2347 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2348 	if (!si_domain)
2349 		return -EFAULT;
2350 
2351 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2352 		domain_exit(si_domain);
2353 		si_domain = NULL;
2354 		return -EFAULT;
2355 	}
2356 
2357 	if (hw)
2358 		return 0;
2359 
2360 	for_each_online_node(nid) {
2361 		unsigned long start_pfn, end_pfn;
2362 		int i;
2363 
2364 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2365 			ret = iommu_domain_identity_map(si_domain,
2366 					mm_to_dma_pfn(start_pfn),
2367 					mm_to_dma_pfn(end_pfn));
2368 			if (ret)
2369 				return ret;
2370 		}
2371 	}
2372 
2373 	/*
2374 	 * Identity map the RMRRs so that devices with RMRRs could also use
2375 	 * the si_domain.
2376 	 */
2377 	for_each_rmrr_units(rmrr) {
2378 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2379 					  i, dev) {
2380 			unsigned long long start = rmrr->base_address;
2381 			unsigned long long end = rmrr->end_address;
2382 
2383 			if (WARN_ON(end < start ||
2384 				    end >> agaw_to_width(si_domain->agaw)))
2385 				continue;
2386 
2387 			ret = iommu_domain_identity_map(si_domain,
2388 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2389 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2390 			if (ret)
2391 				return ret;
2392 		}
2393 	}
2394 
2395 	return 0;
2396 }
2397 
2398 static int dmar_domain_attach_device(struct dmar_domain *domain,
2399 				     struct device *dev)
2400 {
2401 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2402 	struct intel_iommu *iommu;
2403 	unsigned long flags;
2404 	u8 bus, devfn;
2405 	int ret;
2406 
2407 	iommu = device_to_iommu(dev, &bus, &devfn);
2408 	if (!iommu)
2409 		return -ENODEV;
2410 
2411 	ret = domain_attach_iommu(domain, iommu);
2412 	if (ret)
2413 		return ret;
2414 	info->domain = domain;
2415 	spin_lock_irqsave(&domain->lock, flags);
2416 	list_add(&info->link, &domain->devices);
2417 	spin_unlock_irqrestore(&domain->lock, flags);
2418 
2419 	/* PASID table is mandatory for a PCI device in scalable mode. */
2420 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2421 		/* Setup the PASID entry for requests without PASID: */
2422 		if (hw_pass_through && domain_type_is_si(domain))
2423 			ret = intel_pasid_setup_pass_through(iommu, domain,
2424 					dev, PASID_RID2PASID);
2425 		else if (domain->use_first_level)
2426 			ret = domain_setup_first_level(iommu, domain, dev,
2427 					PASID_RID2PASID);
2428 		else
2429 			ret = intel_pasid_setup_second_level(iommu, domain,
2430 					dev, PASID_RID2PASID);
2431 		if (ret) {
2432 			dev_err(dev, "Setup RID2PASID failed\n");
2433 			device_block_translation(dev);
2434 			return ret;
2435 		}
2436 	}
2437 
2438 	ret = domain_context_mapping(domain, dev);
2439 	if (ret) {
2440 		dev_err(dev, "Domain context map failed\n");
2441 		device_block_translation(dev);
2442 		return ret;
2443 	}
2444 
2445 	iommu_enable_pci_caps(info);
2446 
2447 	return 0;
2448 }
2449 
2450 static bool device_has_rmrr(struct device *dev)
2451 {
2452 	struct dmar_rmrr_unit *rmrr;
2453 	struct device *tmp;
2454 	int i;
2455 
2456 	rcu_read_lock();
2457 	for_each_rmrr_units(rmrr) {
2458 		/*
2459 		 * Return TRUE if this RMRR contains the device that
2460 		 * is passed in.
2461 		 */
2462 		for_each_active_dev_scope(rmrr->devices,
2463 					  rmrr->devices_cnt, i, tmp)
2464 			if (tmp == dev ||
2465 			    is_downstream_to_pci_bridge(dev, tmp)) {
2466 				rcu_read_unlock();
2467 				return true;
2468 			}
2469 	}
2470 	rcu_read_unlock();
2471 	return false;
2472 }
2473 
2474 /**
2475  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2476  * is relaxable (ie. is allowed to be not enforced under some conditions)
2477  * @dev: device handle
2478  *
2479  * We assume that PCI USB devices with RMRRs have them largely
2480  * for historical reasons and that the RMRR space is not actively used post
2481  * boot.  This exclusion may change if vendors begin to abuse it.
2482  *
2483  * The same exception is made for graphics devices, with the requirement that
2484  * any use of the RMRR regions will be torn down before assigning the device
2485  * to a guest.
2486  *
2487  * Return: true if the RMRR is relaxable, false otherwise
2488  */
2489 static bool device_rmrr_is_relaxable(struct device *dev)
2490 {
2491 	struct pci_dev *pdev;
2492 
2493 	if (!dev_is_pci(dev))
2494 		return false;
2495 
2496 	pdev = to_pci_dev(dev);
2497 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2498 		return true;
2499 	else
2500 		return false;
2501 }
2502 
2503 /*
2504  * There are a couple cases where we need to restrict the functionality of
2505  * devices associated with RMRRs.  The first is when evaluating a device for
2506  * identity mapping because problems exist when devices are moved in and out
2507  * of domains and their respective RMRR information is lost.  This means that
2508  * a device with associated RMRRs will never be in a "passthrough" domain.
2509  * The second is use of the device through the IOMMU API.  This interface
2510  * expects to have full control of the IOVA space for the device.  We cannot
2511  * satisfy both the requirement that RMRR access is maintained and have an
2512  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2513  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2514  * We therefore prevent devices associated with an RMRR from participating in
2515  * the IOMMU API, which eliminates them from device assignment.
2516  *
2517  * In both cases, devices which have relaxable RMRRs are not concerned by this
2518  * restriction. See device_rmrr_is_relaxable comment.
2519  */
2520 static bool device_is_rmrr_locked(struct device *dev)
2521 {
2522 	if (!device_has_rmrr(dev))
2523 		return false;
2524 
2525 	if (device_rmrr_is_relaxable(dev))
2526 		return false;
2527 
2528 	return true;
2529 }
2530 
2531 /*
2532  * Return the required default domain type for a specific device.
2533  *
2534  * @dev: the device in query
2535  * @startup: true if this is during early boot
2536  *
2537  * Returns:
2538  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2539  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2540  *  - 0: both identity and dynamic domains work for this device
2541  */
2542 static int device_def_domain_type(struct device *dev)
2543 {
2544 	if (dev_is_pci(dev)) {
2545 		struct pci_dev *pdev = to_pci_dev(dev);
2546 
2547 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2548 			return IOMMU_DOMAIN_IDENTITY;
2549 
2550 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2551 			return IOMMU_DOMAIN_IDENTITY;
2552 	}
2553 
2554 	return 0;
2555 }
2556 
2557 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2558 {
2559 	/*
2560 	 * Start from the sane iommu hardware state.
2561 	 * If the queued invalidation is already initialized by us
2562 	 * (for example, while enabling interrupt-remapping) then
2563 	 * we got the things already rolling from a sane state.
2564 	 */
2565 	if (!iommu->qi) {
2566 		/*
2567 		 * Clear any previous faults.
2568 		 */
2569 		dmar_fault(-1, iommu);
2570 		/*
2571 		 * Disable queued invalidation if supported and already enabled
2572 		 * before OS handover.
2573 		 */
2574 		dmar_disable_qi(iommu);
2575 	}
2576 
2577 	if (dmar_enable_qi(iommu)) {
2578 		/*
2579 		 * Queued Invalidate not enabled, use Register Based Invalidate
2580 		 */
2581 		iommu->flush.flush_context = __iommu_flush_context;
2582 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2583 		pr_info("%s: Using Register based invalidation\n",
2584 			iommu->name);
2585 	} else {
2586 		iommu->flush.flush_context = qi_flush_context;
2587 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2588 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2589 	}
2590 }
2591 
2592 static int copy_context_table(struct intel_iommu *iommu,
2593 			      struct root_entry *old_re,
2594 			      struct context_entry **tbl,
2595 			      int bus, bool ext)
2596 {
2597 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2598 	struct context_entry *new_ce = NULL, ce;
2599 	struct context_entry *old_ce = NULL;
2600 	struct root_entry re;
2601 	phys_addr_t old_ce_phys;
2602 
2603 	tbl_idx = ext ? bus * 2 : bus;
2604 	memcpy(&re, old_re, sizeof(re));
2605 
2606 	for (devfn = 0; devfn < 256; devfn++) {
2607 		/* First calculate the correct index */
2608 		idx = (ext ? devfn * 2 : devfn) % 256;
2609 
2610 		if (idx == 0) {
2611 			/* First save what we may have and clean up */
2612 			if (new_ce) {
2613 				tbl[tbl_idx] = new_ce;
2614 				__iommu_flush_cache(iommu, new_ce,
2615 						    VTD_PAGE_SIZE);
2616 				pos = 1;
2617 			}
2618 
2619 			if (old_ce)
2620 				memunmap(old_ce);
2621 
2622 			ret = 0;
2623 			if (devfn < 0x80)
2624 				old_ce_phys = root_entry_lctp(&re);
2625 			else
2626 				old_ce_phys = root_entry_uctp(&re);
2627 
2628 			if (!old_ce_phys) {
2629 				if (ext && devfn == 0) {
2630 					/* No LCTP, try UCTP */
2631 					devfn = 0x7f;
2632 					continue;
2633 				} else {
2634 					goto out;
2635 				}
2636 			}
2637 
2638 			ret = -ENOMEM;
2639 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2640 					MEMREMAP_WB);
2641 			if (!old_ce)
2642 				goto out;
2643 
2644 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2645 			if (!new_ce)
2646 				goto out_unmap;
2647 
2648 			ret = 0;
2649 		}
2650 
2651 		/* Now copy the context entry */
2652 		memcpy(&ce, old_ce + idx, sizeof(ce));
2653 
2654 		if (!context_present(&ce))
2655 			continue;
2656 
2657 		did = context_domain_id(&ce);
2658 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2659 			set_bit(did, iommu->domain_ids);
2660 
2661 		set_context_copied(iommu, bus, devfn);
2662 		new_ce[idx] = ce;
2663 	}
2664 
2665 	tbl[tbl_idx + pos] = new_ce;
2666 
2667 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2668 
2669 out_unmap:
2670 	memunmap(old_ce);
2671 
2672 out:
2673 	return ret;
2674 }
2675 
2676 static int copy_translation_tables(struct intel_iommu *iommu)
2677 {
2678 	struct context_entry **ctxt_tbls;
2679 	struct root_entry *old_rt;
2680 	phys_addr_t old_rt_phys;
2681 	int ctxt_table_entries;
2682 	u64 rtaddr_reg;
2683 	int bus, ret;
2684 	bool new_ext, ext;
2685 
2686 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2687 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2688 	new_ext    = !!sm_supported(iommu);
2689 
2690 	/*
2691 	 * The RTT bit can only be changed when translation is disabled,
2692 	 * but disabling translation means to open a window for data
2693 	 * corruption. So bail out and don't copy anything if we would
2694 	 * have to change the bit.
2695 	 */
2696 	if (new_ext != ext)
2697 		return -EINVAL;
2698 
2699 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2700 	if (!iommu->copied_tables)
2701 		return -ENOMEM;
2702 
2703 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2704 	if (!old_rt_phys)
2705 		return -EINVAL;
2706 
2707 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2708 	if (!old_rt)
2709 		return -ENOMEM;
2710 
2711 	/* This is too big for the stack - allocate it from slab */
2712 	ctxt_table_entries = ext ? 512 : 256;
2713 	ret = -ENOMEM;
2714 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2715 	if (!ctxt_tbls)
2716 		goto out_unmap;
2717 
2718 	for (bus = 0; bus < 256; bus++) {
2719 		ret = copy_context_table(iommu, &old_rt[bus],
2720 					 ctxt_tbls, bus, ext);
2721 		if (ret) {
2722 			pr_err("%s: Failed to copy context table for bus %d\n",
2723 				iommu->name, bus);
2724 			continue;
2725 		}
2726 	}
2727 
2728 	spin_lock(&iommu->lock);
2729 
2730 	/* Context tables are copied, now write them to the root_entry table */
2731 	for (bus = 0; bus < 256; bus++) {
2732 		int idx = ext ? bus * 2 : bus;
2733 		u64 val;
2734 
2735 		if (ctxt_tbls[idx]) {
2736 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2737 			iommu->root_entry[bus].lo = val;
2738 		}
2739 
2740 		if (!ext || !ctxt_tbls[idx + 1])
2741 			continue;
2742 
2743 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2744 		iommu->root_entry[bus].hi = val;
2745 	}
2746 
2747 	spin_unlock(&iommu->lock);
2748 
2749 	kfree(ctxt_tbls);
2750 
2751 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2752 
2753 	ret = 0;
2754 
2755 out_unmap:
2756 	memunmap(old_rt);
2757 
2758 	return ret;
2759 }
2760 
2761 static int __init init_dmars(void)
2762 {
2763 	struct dmar_drhd_unit *drhd;
2764 	struct intel_iommu *iommu;
2765 	int ret;
2766 
2767 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2768 	if (ret)
2769 		goto free_iommu;
2770 
2771 	for_each_iommu(iommu, drhd) {
2772 		if (drhd->ignored) {
2773 			iommu_disable_translation(iommu);
2774 			continue;
2775 		}
2776 
2777 		/*
2778 		 * Find the max pasid size of all IOMMU's in the system.
2779 		 * We need to ensure the system pasid table is no bigger
2780 		 * than the smallest supported.
2781 		 */
2782 		if (pasid_supported(iommu)) {
2783 			u32 temp = 2 << ecap_pss(iommu->ecap);
2784 
2785 			intel_pasid_max_id = min_t(u32, temp,
2786 						   intel_pasid_max_id);
2787 		}
2788 
2789 		intel_iommu_init_qi(iommu);
2790 
2791 		ret = iommu_init_domains(iommu);
2792 		if (ret)
2793 			goto free_iommu;
2794 
2795 		init_translation_status(iommu);
2796 
2797 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2798 			iommu_disable_translation(iommu);
2799 			clear_translation_pre_enabled(iommu);
2800 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2801 				iommu->name);
2802 		}
2803 
2804 		/*
2805 		 * TBD:
2806 		 * we could share the same root & context tables
2807 		 * among all IOMMU's. Need to Split it later.
2808 		 */
2809 		ret = iommu_alloc_root_entry(iommu);
2810 		if (ret)
2811 			goto free_iommu;
2812 
2813 		if (translation_pre_enabled(iommu)) {
2814 			pr_info("Translation already enabled - trying to copy translation structures\n");
2815 
2816 			ret = copy_translation_tables(iommu);
2817 			if (ret) {
2818 				/*
2819 				 * We found the IOMMU with translation
2820 				 * enabled - but failed to copy over the
2821 				 * old root-entry table. Try to proceed
2822 				 * by disabling translation now and
2823 				 * allocating a clean root-entry table.
2824 				 * This might cause DMAR faults, but
2825 				 * probably the dump will still succeed.
2826 				 */
2827 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2828 				       iommu->name);
2829 				iommu_disable_translation(iommu);
2830 				clear_translation_pre_enabled(iommu);
2831 			} else {
2832 				pr_info("Copied translation tables from previous kernel for %s\n",
2833 					iommu->name);
2834 			}
2835 		}
2836 
2837 		if (!ecap_pass_through(iommu->ecap))
2838 			hw_pass_through = 0;
2839 		intel_svm_check(iommu);
2840 	}
2841 
2842 	/*
2843 	 * Now that qi is enabled on all iommus, set the root entry and flush
2844 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2845 	 * flush_context function will loop forever and the boot hangs.
2846 	 */
2847 	for_each_active_iommu(iommu, drhd) {
2848 		iommu_flush_write_buffer(iommu);
2849 		iommu_set_root_entry(iommu);
2850 	}
2851 
2852 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2853 	dmar_map_gfx = 0;
2854 #endif
2855 
2856 	if (!dmar_map_gfx)
2857 		iommu_identity_mapping |= IDENTMAP_GFX;
2858 
2859 	check_tylersburg_isoch();
2860 
2861 	ret = si_domain_init(hw_pass_through);
2862 	if (ret)
2863 		goto free_iommu;
2864 
2865 	/*
2866 	 * for each drhd
2867 	 *   enable fault log
2868 	 *   global invalidate context cache
2869 	 *   global invalidate iotlb
2870 	 *   enable translation
2871 	 */
2872 	for_each_iommu(iommu, drhd) {
2873 		if (drhd->ignored) {
2874 			/*
2875 			 * we always have to disable PMRs or DMA may fail on
2876 			 * this device
2877 			 */
2878 			if (force_on)
2879 				iommu_disable_protect_mem_regions(iommu);
2880 			continue;
2881 		}
2882 
2883 		iommu_flush_write_buffer(iommu);
2884 
2885 #ifdef CONFIG_INTEL_IOMMU_SVM
2886 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2887 			/*
2888 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2889 			 * could cause possible lock race condition.
2890 			 */
2891 			up_write(&dmar_global_lock);
2892 			ret = intel_svm_enable_prq(iommu);
2893 			down_write(&dmar_global_lock);
2894 			if (ret)
2895 				goto free_iommu;
2896 		}
2897 #endif
2898 		ret = dmar_set_interrupt(iommu);
2899 		if (ret)
2900 			goto free_iommu;
2901 	}
2902 
2903 	return 0;
2904 
2905 free_iommu:
2906 	for_each_active_iommu(iommu, drhd) {
2907 		disable_dmar_iommu(iommu);
2908 		free_dmar_iommu(iommu);
2909 	}
2910 	if (si_domain) {
2911 		domain_exit(si_domain);
2912 		si_domain = NULL;
2913 	}
2914 
2915 	return ret;
2916 }
2917 
2918 static void __init init_no_remapping_devices(void)
2919 {
2920 	struct dmar_drhd_unit *drhd;
2921 	struct device *dev;
2922 	int i;
2923 
2924 	for_each_drhd_unit(drhd) {
2925 		if (!drhd->include_all) {
2926 			for_each_active_dev_scope(drhd->devices,
2927 						  drhd->devices_cnt, i, dev)
2928 				break;
2929 			/* ignore DMAR unit if no devices exist */
2930 			if (i == drhd->devices_cnt)
2931 				drhd->ignored = 1;
2932 		}
2933 	}
2934 
2935 	for_each_active_drhd_unit(drhd) {
2936 		if (drhd->include_all)
2937 			continue;
2938 
2939 		for_each_active_dev_scope(drhd->devices,
2940 					  drhd->devices_cnt, i, dev)
2941 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2942 				break;
2943 		if (i < drhd->devices_cnt)
2944 			continue;
2945 
2946 		/* This IOMMU has *only* gfx devices. Either bypass it or
2947 		   set the gfx_mapped flag, as appropriate */
2948 		drhd->gfx_dedicated = 1;
2949 		if (!dmar_map_gfx)
2950 			drhd->ignored = 1;
2951 	}
2952 }
2953 
2954 #ifdef CONFIG_SUSPEND
2955 static int init_iommu_hw(void)
2956 {
2957 	struct dmar_drhd_unit *drhd;
2958 	struct intel_iommu *iommu = NULL;
2959 	int ret;
2960 
2961 	for_each_active_iommu(iommu, drhd) {
2962 		if (iommu->qi) {
2963 			ret = dmar_reenable_qi(iommu);
2964 			if (ret)
2965 				return ret;
2966 		}
2967 	}
2968 
2969 	for_each_iommu(iommu, drhd) {
2970 		if (drhd->ignored) {
2971 			/*
2972 			 * we always have to disable PMRs or DMA may fail on
2973 			 * this device
2974 			 */
2975 			if (force_on)
2976 				iommu_disable_protect_mem_regions(iommu);
2977 			continue;
2978 		}
2979 
2980 		iommu_flush_write_buffer(iommu);
2981 		iommu_set_root_entry(iommu);
2982 		iommu_enable_translation(iommu);
2983 		iommu_disable_protect_mem_regions(iommu);
2984 	}
2985 
2986 	return 0;
2987 }
2988 
2989 static void iommu_flush_all(void)
2990 {
2991 	struct dmar_drhd_unit *drhd;
2992 	struct intel_iommu *iommu;
2993 
2994 	for_each_active_iommu(iommu, drhd) {
2995 		iommu->flush.flush_context(iommu, 0, 0, 0,
2996 					   DMA_CCMD_GLOBAL_INVL);
2997 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2998 					 DMA_TLB_GLOBAL_FLUSH);
2999 	}
3000 }
3001 
3002 static int iommu_suspend(void)
3003 {
3004 	struct dmar_drhd_unit *drhd;
3005 	struct intel_iommu *iommu = NULL;
3006 	unsigned long flag;
3007 
3008 	for_each_active_iommu(iommu, drhd) {
3009 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3010 					     GFP_KERNEL);
3011 		if (!iommu->iommu_state)
3012 			goto nomem;
3013 	}
3014 
3015 	iommu_flush_all();
3016 
3017 	for_each_active_iommu(iommu, drhd) {
3018 		iommu_disable_translation(iommu);
3019 
3020 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3021 
3022 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3023 			readl(iommu->reg + DMAR_FECTL_REG);
3024 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3025 			readl(iommu->reg + DMAR_FEDATA_REG);
3026 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3027 			readl(iommu->reg + DMAR_FEADDR_REG);
3028 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3029 			readl(iommu->reg + DMAR_FEUADDR_REG);
3030 
3031 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3032 	}
3033 	return 0;
3034 
3035 nomem:
3036 	for_each_active_iommu(iommu, drhd)
3037 		kfree(iommu->iommu_state);
3038 
3039 	return -ENOMEM;
3040 }
3041 
3042 static void iommu_resume(void)
3043 {
3044 	struct dmar_drhd_unit *drhd;
3045 	struct intel_iommu *iommu = NULL;
3046 	unsigned long flag;
3047 
3048 	if (init_iommu_hw()) {
3049 		if (force_on)
3050 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3051 		else
3052 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3053 		return;
3054 	}
3055 
3056 	for_each_active_iommu(iommu, drhd) {
3057 
3058 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3059 
3060 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3061 			iommu->reg + DMAR_FECTL_REG);
3062 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3063 			iommu->reg + DMAR_FEDATA_REG);
3064 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3065 			iommu->reg + DMAR_FEADDR_REG);
3066 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3067 			iommu->reg + DMAR_FEUADDR_REG);
3068 
3069 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3070 	}
3071 
3072 	for_each_active_iommu(iommu, drhd)
3073 		kfree(iommu->iommu_state);
3074 }
3075 
3076 static struct syscore_ops iommu_syscore_ops = {
3077 	.resume		= iommu_resume,
3078 	.suspend	= iommu_suspend,
3079 };
3080 
3081 static void __init init_iommu_pm_ops(void)
3082 {
3083 	register_syscore_ops(&iommu_syscore_ops);
3084 }
3085 
3086 #else
3087 static inline void init_iommu_pm_ops(void) {}
3088 #endif	/* CONFIG_PM */
3089 
3090 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3091 {
3092 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3093 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3094 	    rmrr->end_address <= rmrr->base_address ||
3095 	    arch_rmrr_sanity_check(rmrr))
3096 		return -EINVAL;
3097 
3098 	return 0;
3099 }
3100 
3101 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3102 {
3103 	struct acpi_dmar_reserved_memory *rmrr;
3104 	struct dmar_rmrr_unit *rmrru;
3105 
3106 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3107 	if (rmrr_sanity_check(rmrr)) {
3108 		pr_warn(FW_BUG
3109 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3110 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3111 			   rmrr->base_address, rmrr->end_address,
3112 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3113 			   dmi_get_system_info(DMI_BIOS_VERSION),
3114 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3115 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3116 	}
3117 
3118 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3119 	if (!rmrru)
3120 		goto out;
3121 
3122 	rmrru->hdr = header;
3123 
3124 	rmrru->base_address = rmrr->base_address;
3125 	rmrru->end_address = rmrr->end_address;
3126 
3127 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3128 				((void *)rmrr) + rmrr->header.length,
3129 				&rmrru->devices_cnt);
3130 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3131 		goto free_rmrru;
3132 
3133 	list_add(&rmrru->list, &dmar_rmrr_units);
3134 
3135 	return 0;
3136 free_rmrru:
3137 	kfree(rmrru);
3138 out:
3139 	return -ENOMEM;
3140 }
3141 
3142 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3143 {
3144 	struct dmar_atsr_unit *atsru;
3145 	struct acpi_dmar_atsr *tmp;
3146 
3147 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3148 				dmar_rcu_check()) {
3149 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3150 		if (atsr->segment != tmp->segment)
3151 			continue;
3152 		if (atsr->header.length != tmp->header.length)
3153 			continue;
3154 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3155 			return atsru;
3156 	}
3157 
3158 	return NULL;
3159 }
3160 
3161 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3162 {
3163 	struct acpi_dmar_atsr *atsr;
3164 	struct dmar_atsr_unit *atsru;
3165 
3166 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3167 		return 0;
3168 
3169 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3170 	atsru = dmar_find_atsr(atsr);
3171 	if (atsru)
3172 		return 0;
3173 
3174 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3175 	if (!atsru)
3176 		return -ENOMEM;
3177 
3178 	/*
3179 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3180 	 * copy the memory content because the memory buffer will be freed
3181 	 * on return.
3182 	 */
3183 	atsru->hdr = (void *)(atsru + 1);
3184 	memcpy(atsru->hdr, hdr, hdr->length);
3185 	atsru->include_all = atsr->flags & 0x1;
3186 	if (!atsru->include_all) {
3187 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3188 				(void *)atsr + atsr->header.length,
3189 				&atsru->devices_cnt);
3190 		if (atsru->devices_cnt && atsru->devices == NULL) {
3191 			kfree(atsru);
3192 			return -ENOMEM;
3193 		}
3194 	}
3195 
3196 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3197 
3198 	return 0;
3199 }
3200 
3201 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3202 {
3203 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3204 	kfree(atsru);
3205 }
3206 
3207 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3208 {
3209 	struct acpi_dmar_atsr *atsr;
3210 	struct dmar_atsr_unit *atsru;
3211 
3212 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3213 	atsru = dmar_find_atsr(atsr);
3214 	if (atsru) {
3215 		list_del_rcu(&atsru->list);
3216 		synchronize_rcu();
3217 		intel_iommu_free_atsr(atsru);
3218 	}
3219 
3220 	return 0;
3221 }
3222 
3223 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3224 {
3225 	int i;
3226 	struct device *dev;
3227 	struct acpi_dmar_atsr *atsr;
3228 	struct dmar_atsr_unit *atsru;
3229 
3230 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3231 	atsru = dmar_find_atsr(atsr);
3232 	if (!atsru)
3233 		return 0;
3234 
3235 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3236 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3237 					  i, dev)
3238 			return -EBUSY;
3239 	}
3240 
3241 	return 0;
3242 }
3243 
3244 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3245 {
3246 	struct dmar_satc_unit *satcu;
3247 	struct acpi_dmar_satc *tmp;
3248 
3249 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3250 				dmar_rcu_check()) {
3251 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3252 		if (satc->segment != tmp->segment)
3253 			continue;
3254 		if (satc->header.length != tmp->header.length)
3255 			continue;
3256 		if (memcmp(satc, tmp, satc->header.length) == 0)
3257 			return satcu;
3258 	}
3259 
3260 	return NULL;
3261 }
3262 
3263 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3264 {
3265 	struct acpi_dmar_satc *satc;
3266 	struct dmar_satc_unit *satcu;
3267 
3268 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3269 		return 0;
3270 
3271 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3272 	satcu = dmar_find_satc(satc);
3273 	if (satcu)
3274 		return 0;
3275 
3276 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3277 	if (!satcu)
3278 		return -ENOMEM;
3279 
3280 	satcu->hdr = (void *)(satcu + 1);
3281 	memcpy(satcu->hdr, hdr, hdr->length);
3282 	satcu->atc_required = satc->flags & 0x1;
3283 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3284 					      (void *)satc + satc->header.length,
3285 					      &satcu->devices_cnt);
3286 	if (satcu->devices_cnt && !satcu->devices) {
3287 		kfree(satcu);
3288 		return -ENOMEM;
3289 	}
3290 	list_add_rcu(&satcu->list, &dmar_satc_units);
3291 
3292 	return 0;
3293 }
3294 
3295 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3296 {
3297 	int sp, ret;
3298 	struct intel_iommu *iommu = dmaru->iommu;
3299 
3300 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3301 	if (ret)
3302 		goto out;
3303 
3304 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3305 		pr_warn("%s: Doesn't support hardware pass through.\n",
3306 			iommu->name);
3307 		return -ENXIO;
3308 	}
3309 
3310 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3311 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3312 		pr_warn("%s: Doesn't support large page.\n",
3313 			iommu->name);
3314 		return -ENXIO;
3315 	}
3316 
3317 	/*
3318 	 * Disable translation if already enabled prior to OS handover.
3319 	 */
3320 	if (iommu->gcmd & DMA_GCMD_TE)
3321 		iommu_disable_translation(iommu);
3322 
3323 	ret = iommu_init_domains(iommu);
3324 	if (ret == 0)
3325 		ret = iommu_alloc_root_entry(iommu);
3326 	if (ret)
3327 		goto out;
3328 
3329 	intel_svm_check(iommu);
3330 
3331 	if (dmaru->ignored) {
3332 		/*
3333 		 * we always have to disable PMRs or DMA may fail on this device
3334 		 */
3335 		if (force_on)
3336 			iommu_disable_protect_mem_regions(iommu);
3337 		return 0;
3338 	}
3339 
3340 	intel_iommu_init_qi(iommu);
3341 	iommu_flush_write_buffer(iommu);
3342 
3343 #ifdef CONFIG_INTEL_IOMMU_SVM
3344 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3345 		ret = intel_svm_enable_prq(iommu);
3346 		if (ret)
3347 			goto disable_iommu;
3348 	}
3349 #endif
3350 	ret = dmar_set_interrupt(iommu);
3351 	if (ret)
3352 		goto disable_iommu;
3353 
3354 	iommu_set_root_entry(iommu);
3355 	iommu_enable_translation(iommu);
3356 
3357 	iommu_disable_protect_mem_regions(iommu);
3358 	return 0;
3359 
3360 disable_iommu:
3361 	disable_dmar_iommu(iommu);
3362 out:
3363 	free_dmar_iommu(iommu);
3364 	return ret;
3365 }
3366 
3367 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3368 {
3369 	int ret = 0;
3370 	struct intel_iommu *iommu = dmaru->iommu;
3371 
3372 	if (!intel_iommu_enabled)
3373 		return 0;
3374 	if (iommu == NULL)
3375 		return -EINVAL;
3376 
3377 	if (insert) {
3378 		ret = intel_iommu_add(dmaru);
3379 	} else {
3380 		disable_dmar_iommu(iommu);
3381 		free_dmar_iommu(iommu);
3382 	}
3383 
3384 	return ret;
3385 }
3386 
3387 static void intel_iommu_free_dmars(void)
3388 {
3389 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3390 	struct dmar_atsr_unit *atsru, *atsr_n;
3391 	struct dmar_satc_unit *satcu, *satc_n;
3392 
3393 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3394 		list_del(&rmrru->list);
3395 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3396 		kfree(rmrru);
3397 	}
3398 
3399 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3400 		list_del(&atsru->list);
3401 		intel_iommu_free_atsr(atsru);
3402 	}
3403 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3404 		list_del(&satcu->list);
3405 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3406 		kfree(satcu);
3407 	}
3408 }
3409 
3410 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3411 {
3412 	struct dmar_satc_unit *satcu;
3413 	struct acpi_dmar_satc *satc;
3414 	struct device *tmp;
3415 	int i;
3416 
3417 	dev = pci_physfn(dev);
3418 	rcu_read_lock();
3419 
3420 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3421 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3422 		if (satc->segment != pci_domain_nr(dev->bus))
3423 			continue;
3424 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3425 			if (to_pci_dev(tmp) == dev)
3426 				goto out;
3427 	}
3428 	satcu = NULL;
3429 out:
3430 	rcu_read_unlock();
3431 	return satcu;
3432 }
3433 
3434 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3435 {
3436 	int i, ret = 1;
3437 	struct pci_bus *bus;
3438 	struct pci_dev *bridge = NULL;
3439 	struct device *tmp;
3440 	struct acpi_dmar_atsr *atsr;
3441 	struct dmar_atsr_unit *atsru;
3442 	struct dmar_satc_unit *satcu;
3443 
3444 	dev = pci_physfn(dev);
3445 	satcu = dmar_find_matched_satc_unit(dev);
3446 	if (satcu)
3447 		/*
3448 		 * This device supports ATS as it is in SATC table.
3449 		 * When IOMMU is in legacy mode, enabling ATS is done
3450 		 * automatically by HW for the device that requires
3451 		 * ATS, hence OS should not enable this device ATS
3452 		 * to avoid duplicated TLB invalidation.
3453 		 */
3454 		return !(satcu->atc_required && !sm_supported(iommu));
3455 
3456 	for (bus = dev->bus; bus; bus = bus->parent) {
3457 		bridge = bus->self;
3458 		/* If it's an integrated device, allow ATS */
3459 		if (!bridge)
3460 			return 1;
3461 		/* Connected via non-PCIe: no ATS */
3462 		if (!pci_is_pcie(bridge) ||
3463 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3464 			return 0;
3465 		/* If we found the root port, look it up in the ATSR */
3466 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3467 			break;
3468 	}
3469 
3470 	rcu_read_lock();
3471 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3472 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3473 		if (atsr->segment != pci_domain_nr(dev->bus))
3474 			continue;
3475 
3476 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3477 			if (tmp == &bridge->dev)
3478 				goto out;
3479 
3480 		if (atsru->include_all)
3481 			goto out;
3482 	}
3483 	ret = 0;
3484 out:
3485 	rcu_read_unlock();
3486 
3487 	return ret;
3488 }
3489 
3490 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3491 {
3492 	int ret;
3493 	struct dmar_rmrr_unit *rmrru;
3494 	struct dmar_atsr_unit *atsru;
3495 	struct dmar_satc_unit *satcu;
3496 	struct acpi_dmar_atsr *atsr;
3497 	struct acpi_dmar_reserved_memory *rmrr;
3498 	struct acpi_dmar_satc *satc;
3499 
3500 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3501 		return 0;
3502 
3503 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3504 		rmrr = container_of(rmrru->hdr,
3505 				    struct acpi_dmar_reserved_memory, header);
3506 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3507 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3508 				((void *)rmrr) + rmrr->header.length,
3509 				rmrr->segment, rmrru->devices,
3510 				rmrru->devices_cnt);
3511 			if (ret < 0)
3512 				return ret;
3513 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514 			dmar_remove_dev_scope(info, rmrr->segment,
3515 				rmrru->devices, rmrru->devices_cnt);
3516 		}
3517 	}
3518 
3519 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3520 		if (atsru->include_all)
3521 			continue;
3522 
3523 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3524 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3525 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3526 					(void *)atsr + atsr->header.length,
3527 					atsr->segment, atsru->devices,
3528 					atsru->devices_cnt);
3529 			if (ret > 0)
3530 				break;
3531 			else if (ret < 0)
3532 				return ret;
3533 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3534 			if (dmar_remove_dev_scope(info, atsr->segment,
3535 					atsru->devices, atsru->devices_cnt))
3536 				break;
3537 		}
3538 	}
3539 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3540 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3541 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3542 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3543 					(void *)satc + satc->header.length,
3544 					satc->segment, satcu->devices,
3545 					satcu->devices_cnt);
3546 			if (ret > 0)
3547 				break;
3548 			else if (ret < 0)
3549 				return ret;
3550 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3551 			if (dmar_remove_dev_scope(info, satc->segment,
3552 					satcu->devices, satcu->devices_cnt))
3553 				break;
3554 		}
3555 	}
3556 
3557 	return 0;
3558 }
3559 
3560 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3561 				       unsigned long val, void *v)
3562 {
3563 	struct memory_notify *mhp = v;
3564 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3565 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3566 			mhp->nr_pages - 1);
3567 
3568 	switch (val) {
3569 	case MEM_GOING_ONLINE:
3570 		if (iommu_domain_identity_map(si_domain,
3571 					      start_vpfn, last_vpfn)) {
3572 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3573 				start_vpfn, last_vpfn);
3574 			return NOTIFY_BAD;
3575 		}
3576 		break;
3577 
3578 	case MEM_OFFLINE:
3579 	case MEM_CANCEL_ONLINE:
3580 		{
3581 			struct dmar_drhd_unit *drhd;
3582 			struct intel_iommu *iommu;
3583 			LIST_HEAD(freelist);
3584 
3585 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3586 
3587 			rcu_read_lock();
3588 			for_each_active_iommu(iommu, drhd)
3589 				iommu_flush_iotlb_psi(iommu, si_domain,
3590 					start_vpfn, mhp->nr_pages,
3591 					list_empty(&freelist), 0);
3592 			rcu_read_unlock();
3593 			put_pages_list(&freelist);
3594 		}
3595 		break;
3596 	}
3597 
3598 	return NOTIFY_OK;
3599 }
3600 
3601 static struct notifier_block intel_iommu_memory_nb = {
3602 	.notifier_call = intel_iommu_memory_notifier,
3603 	.priority = 0
3604 };
3605 
3606 static void intel_disable_iommus(void)
3607 {
3608 	struct intel_iommu *iommu = NULL;
3609 	struct dmar_drhd_unit *drhd;
3610 
3611 	for_each_iommu(iommu, drhd)
3612 		iommu_disable_translation(iommu);
3613 }
3614 
3615 void intel_iommu_shutdown(void)
3616 {
3617 	struct dmar_drhd_unit *drhd;
3618 	struct intel_iommu *iommu = NULL;
3619 
3620 	if (no_iommu || dmar_disabled)
3621 		return;
3622 
3623 	down_write(&dmar_global_lock);
3624 
3625 	/* Disable PMRs explicitly here. */
3626 	for_each_iommu(iommu, drhd)
3627 		iommu_disable_protect_mem_regions(iommu);
3628 
3629 	/* Make sure the IOMMUs are switched off */
3630 	intel_disable_iommus();
3631 
3632 	up_write(&dmar_global_lock);
3633 }
3634 
3635 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3636 {
3637 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3638 
3639 	return container_of(iommu_dev, struct intel_iommu, iommu);
3640 }
3641 
3642 static ssize_t version_show(struct device *dev,
3643 			    struct device_attribute *attr, char *buf)
3644 {
3645 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3646 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3647 	return sysfs_emit(buf, "%d:%d\n",
3648 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3649 }
3650 static DEVICE_ATTR_RO(version);
3651 
3652 static ssize_t address_show(struct device *dev,
3653 			    struct device_attribute *attr, char *buf)
3654 {
3655 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3656 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3657 }
3658 static DEVICE_ATTR_RO(address);
3659 
3660 static ssize_t cap_show(struct device *dev,
3661 			struct device_attribute *attr, char *buf)
3662 {
3663 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3664 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3665 }
3666 static DEVICE_ATTR_RO(cap);
3667 
3668 static ssize_t ecap_show(struct device *dev,
3669 			 struct device_attribute *attr, char *buf)
3670 {
3671 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3672 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3673 }
3674 static DEVICE_ATTR_RO(ecap);
3675 
3676 static ssize_t domains_supported_show(struct device *dev,
3677 				      struct device_attribute *attr, char *buf)
3678 {
3679 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3680 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3681 }
3682 static DEVICE_ATTR_RO(domains_supported);
3683 
3684 static ssize_t domains_used_show(struct device *dev,
3685 				 struct device_attribute *attr, char *buf)
3686 {
3687 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3688 	return sysfs_emit(buf, "%d\n",
3689 			  bitmap_weight(iommu->domain_ids,
3690 					cap_ndoms(iommu->cap)));
3691 }
3692 static DEVICE_ATTR_RO(domains_used);
3693 
3694 static struct attribute *intel_iommu_attrs[] = {
3695 	&dev_attr_version.attr,
3696 	&dev_attr_address.attr,
3697 	&dev_attr_cap.attr,
3698 	&dev_attr_ecap.attr,
3699 	&dev_attr_domains_supported.attr,
3700 	&dev_attr_domains_used.attr,
3701 	NULL,
3702 };
3703 
3704 static struct attribute_group intel_iommu_group = {
3705 	.name = "intel-iommu",
3706 	.attrs = intel_iommu_attrs,
3707 };
3708 
3709 const struct attribute_group *intel_iommu_groups[] = {
3710 	&intel_iommu_group,
3711 	NULL,
3712 };
3713 
3714 static inline bool has_external_pci(void)
3715 {
3716 	struct pci_dev *pdev = NULL;
3717 
3718 	for_each_pci_dev(pdev)
3719 		if (pdev->external_facing) {
3720 			pci_dev_put(pdev);
3721 			return true;
3722 		}
3723 
3724 	return false;
3725 }
3726 
3727 static int __init platform_optin_force_iommu(void)
3728 {
3729 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3730 		return 0;
3731 
3732 	if (no_iommu || dmar_disabled)
3733 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3734 
3735 	/*
3736 	 * If Intel-IOMMU is disabled by default, we will apply identity
3737 	 * map for all devices except those marked as being untrusted.
3738 	 */
3739 	if (dmar_disabled)
3740 		iommu_set_default_passthrough(false);
3741 
3742 	dmar_disabled = 0;
3743 	no_iommu = 0;
3744 
3745 	return 1;
3746 }
3747 
3748 static int __init probe_acpi_namespace_devices(void)
3749 {
3750 	struct dmar_drhd_unit *drhd;
3751 	/* To avoid a -Wunused-but-set-variable warning. */
3752 	struct intel_iommu *iommu __maybe_unused;
3753 	struct device *dev;
3754 	int i, ret = 0;
3755 
3756 	for_each_active_iommu(iommu, drhd) {
3757 		for_each_active_dev_scope(drhd->devices,
3758 					  drhd->devices_cnt, i, dev) {
3759 			struct acpi_device_physical_node *pn;
3760 			struct iommu_group *group;
3761 			struct acpi_device *adev;
3762 
3763 			if (dev->bus != &acpi_bus_type)
3764 				continue;
3765 
3766 			adev = to_acpi_device(dev);
3767 			mutex_lock(&adev->physical_node_lock);
3768 			list_for_each_entry(pn,
3769 					    &adev->physical_node_list, node) {
3770 				group = iommu_group_get(pn->dev);
3771 				if (group) {
3772 					iommu_group_put(group);
3773 					continue;
3774 				}
3775 
3776 				ret = iommu_probe_device(pn->dev);
3777 				if (ret)
3778 					break;
3779 			}
3780 			mutex_unlock(&adev->physical_node_lock);
3781 
3782 			if (ret)
3783 				return ret;
3784 		}
3785 	}
3786 
3787 	return 0;
3788 }
3789 
3790 static __init int tboot_force_iommu(void)
3791 {
3792 	if (!tboot_enabled())
3793 		return 0;
3794 
3795 	if (no_iommu || dmar_disabled)
3796 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3797 
3798 	dmar_disabled = 0;
3799 	no_iommu = 0;
3800 
3801 	return 1;
3802 }
3803 
3804 int __init intel_iommu_init(void)
3805 {
3806 	int ret = -ENODEV;
3807 	struct dmar_drhd_unit *drhd;
3808 	struct intel_iommu *iommu;
3809 
3810 	/*
3811 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3812 	 * opt in, so enforce that.
3813 	 */
3814 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3815 		    platform_optin_force_iommu();
3816 
3817 	down_write(&dmar_global_lock);
3818 	if (dmar_table_init()) {
3819 		if (force_on)
3820 			panic("tboot: Failed to initialize DMAR table\n");
3821 		goto out_free_dmar;
3822 	}
3823 
3824 	if (dmar_dev_scope_init() < 0) {
3825 		if (force_on)
3826 			panic("tboot: Failed to initialize DMAR device scope\n");
3827 		goto out_free_dmar;
3828 	}
3829 
3830 	up_write(&dmar_global_lock);
3831 
3832 	/*
3833 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3834 	 * complain later when we register it under the lock.
3835 	 */
3836 	dmar_register_bus_notifier();
3837 
3838 	down_write(&dmar_global_lock);
3839 
3840 	if (!no_iommu)
3841 		intel_iommu_debugfs_init();
3842 
3843 	if (no_iommu || dmar_disabled) {
3844 		/*
3845 		 * We exit the function here to ensure IOMMU's remapping and
3846 		 * mempool aren't setup, which means that the IOMMU's PMRs
3847 		 * won't be disabled via the call to init_dmars(). So disable
3848 		 * it explicitly here. The PMRs were setup by tboot prior to
3849 		 * calling SENTER, but the kernel is expected to reset/tear
3850 		 * down the PMRs.
3851 		 */
3852 		if (intel_iommu_tboot_noforce) {
3853 			for_each_iommu(iommu, drhd)
3854 				iommu_disable_protect_mem_regions(iommu);
3855 		}
3856 
3857 		/*
3858 		 * Make sure the IOMMUs are switched off, even when we
3859 		 * boot into a kexec kernel and the previous kernel left
3860 		 * them enabled
3861 		 */
3862 		intel_disable_iommus();
3863 		goto out_free_dmar;
3864 	}
3865 
3866 	if (list_empty(&dmar_rmrr_units))
3867 		pr_info("No RMRR found\n");
3868 
3869 	if (list_empty(&dmar_atsr_units))
3870 		pr_info("No ATSR found\n");
3871 
3872 	if (list_empty(&dmar_satc_units))
3873 		pr_info("No SATC found\n");
3874 
3875 	init_no_remapping_devices();
3876 
3877 	ret = init_dmars();
3878 	if (ret) {
3879 		if (force_on)
3880 			panic("tboot: Failed to initialize DMARs\n");
3881 		pr_err("Initialization failed\n");
3882 		goto out_free_dmar;
3883 	}
3884 	up_write(&dmar_global_lock);
3885 
3886 	init_iommu_pm_ops();
3887 
3888 	down_read(&dmar_global_lock);
3889 	for_each_active_iommu(iommu, drhd) {
3890 		/*
3891 		 * The flush queue implementation does not perform
3892 		 * page-selective invalidations that are required for efficient
3893 		 * TLB flushes in virtual environments.  The benefit of batching
3894 		 * is likely to be much lower than the overhead of synchronizing
3895 		 * the virtual and physical IOMMU page-tables.
3896 		 */
3897 		if (cap_caching_mode(iommu->cap) &&
3898 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3899 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3900 			iommu_set_dma_strict();
3901 		}
3902 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3903 				       intel_iommu_groups,
3904 				       "%s", iommu->name);
3905 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3906 
3907 		iommu_pmu_register(iommu);
3908 	}
3909 	up_read(&dmar_global_lock);
3910 
3911 	if (si_domain && !hw_pass_through)
3912 		register_memory_notifier(&intel_iommu_memory_nb);
3913 
3914 	down_read(&dmar_global_lock);
3915 	if (probe_acpi_namespace_devices())
3916 		pr_warn("ACPI name space devices didn't probe correctly\n");
3917 
3918 	/* Finally, we enable the DMA remapping hardware. */
3919 	for_each_iommu(iommu, drhd) {
3920 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3921 			iommu_enable_translation(iommu);
3922 
3923 		iommu_disable_protect_mem_regions(iommu);
3924 	}
3925 	up_read(&dmar_global_lock);
3926 
3927 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3928 
3929 	intel_iommu_enabled = 1;
3930 
3931 	return 0;
3932 
3933 out_free_dmar:
3934 	intel_iommu_free_dmars();
3935 	up_write(&dmar_global_lock);
3936 	return ret;
3937 }
3938 
3939 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3940 {
3941 	struct device_domain_info *info = opaque;
3942 
3943 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3944 	return 0;
3945 }
3946 
3947 /*
3948  * NB - intel-iommu lacks any sort of reference counting for the users of
3949  * dependent devices.  If multiple endpoints have intersecting dependent
3950  * devices, unbinding the driver from any one of them will possibly leave
3951  * the others unable to operate.
3952  */
3953 static void domain_context_clear(struct device_domain_info *info)
3954 {
3955 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3956 		return;
3957 
3958 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3959 			       &domain_context_clear_one_cb, info);
3960 }
3961 
3962 static void dmar_remove_one_dev_info(struct device *dev)
3963 {
3964 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3965 	struct dmar_domain *domain = info->domain;
3966 	struct intel_iommu *iommu = info->iommu;
3967 	unsigned long flags;
3968 
3969 	if (!dev_is_real_dma_subdevice(info->dev)) {
3970 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3971 			intel_pasid_tear_down_entry(iommu, info->dev,
3972 					PASID_RID2PASID, false);
3973 
3974 		iommu_disable_pci_caps(info);
3975 		domain_context_clear(info);
3976 	}
3977 
3978 	spin_lock_irqsave(&domain->lock, flags);
3979 	list_del(&info->link);
3980 	spin_unlock_irqrestore(&domain->lock, flags);
3981 
3982 	domain_detach_iommu(domain, iommu);
3983 	info->domain = NULL;
3984 }
3985 
3986 /*
3987  * Clear the page table pointer in context or pasid table entries so that
3988  * all DMA requests without PASID from the device are blocked. If the page
3989  * table has been set, clean up the data structures.
3990  */
3991 static void device_block_translation(struct device *dev)
3992 {
3993 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3994 	struct intel_iommu *iommu = info->iommu;
3995 	unsigned long flags;
3996 
3997 	iommu_disable_pci_caps(info);
3998 	if (!dev_is_real_dma_subdevice(dev)) {
3999 		if (sm_supported(iommu))
4000 			intel_pasid_tear_down_entry(iommu, dev,
4001 						    PASID_RID2PASID, false);
4002 		else
4003 			domain_context_clear(info);
4004 	}
4005 
4006 	if (!info->domain)
4007 		return;
4008 
4009 	spin_lock_irqsave(&info->domain->lock, flags);
4010 	list_del(&info->link);
4011 	spin_unlock_irqrestore(&info->domain->lock, flags);
4012 
4013 	domain_detach_iommu(info->domain, iommu);
4014 	info->domain = NULL;
4015 }
4016 
4017 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4018 {
4019 	int adjust_width;
4020 
4021 	/* calculate AGAW */
4022 	domain->gaw = guest_width;
4023 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4024 	domain->agaw = width_to_agaw(adjust_width);
4025 
4026 	domain->iommu_coherency = false;
4027 	domain->iommu_superpage = 0;
4028 	domain->max_addr = 0;
4029 
4030 	/* always allocate the top pgd */
4031 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4032 	if (!domain->pgd)
4033 		return -ENOMEM;
4034 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4035 	return 0;
4036 }
4037 
4038 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4039 				      struct device *dev)
4040 {
4041 	device_block_translation(dev);
4042 	return 0;
4043 }
4044 
4045 static struct iommu_domain blocking_domain = {
4046 	.ops = &(const struct iommu_domain_ops) {
4047 		.attach_dev	= blocking_domain_attach_dev,
4048 		.free		= intel_iommu_domain_free
4049 	}
4050 };
4051 
4052 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4053 {
4054 	struct dmar_domain *dmar_domain;
4055 	struct iommu_domain *domain;
4056 
4057 	switch (type) {
4058 	case IOMMU_DOMAIN_BLOCKED:
4059 		return &blocking_domain;
4060 	case IOMMU_DOMAIN_DMA:
4061 	case IOMMU_DOMAIN_UNMANAGED:
4062 		dmar_domain = alloc_domain(type);
4063 		if (!dmar_domain) {
4064 			pr_err("Can't allocate dmar_domain\n");
4065 			return NULL;
4066 		}
4067 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4068 			pr_err("Domain initialization failed\n");
4069 			domain_exit(dmar_domain);
4070 			return NULL;
4071 		}
4072 
4073 		domain = &dmar_domain->domain;
4074 		domain->geometry.aperture_start = 0;
4075 		domain->geometry.aperture_end   =
4076 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4077 		domain->geometry.force_aperture = true;
4078 
4079 		return domain;
4080 	case IOMMU_DOMAIN_IDENTITY:
4081 		return &si_domain->domain;
4082 	case IOMMU_DOMAIN_SVA:
4083 		return intel_svm_domain_alloc();
4084 	default:
4085 		return NULL;
4086 	}
4087 
4088 	return NULL;
4089 }
4090 
4091 static void intel_iommu_domain_free(struct iommu_domain *domain)
4092 {
4093 	if (domain != &si_domain->domain && domain != &blocking_domain)
4094 		domain_exit(to_dmar_domain(domain));
4095 }
4096 
4097 static int prepare_domain_attach_device(struct iommu_domain *domain,
4098 					struct device *dev)
4099 {
4100 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4101 	struct intel_iommu *iommu;
4102 	int addr_width;
4103 
4104 	iommu = device_to_iommu(dev, NULL, NULL);
4105 	if (!iommu)
4106 		return -ENODEV;
4107 
4108 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4109 		return -EINVAL;
4110 
4111 	/* check if this iommu agaw is sufficient for max mapped address */
4112 	addr_width = agaw_to_width(iommu->agaw);
4113 	if (addr_width > cap_mgaw(iommu->cap))
4114 		addr_width = cap_mgaw(iommu->cap);
4115 
4116 	if (dmar_domain->max_addr > (1LL << addr_width))
4117 		return -EINVAL;
4118 	dmar_domain->gaw = addr_width;
4119 
4120 	/*
4121 	 * Knock out extra levels of page tables if necessary
4122 	 */
4123 	while (iommu->agaw < dmar_domain->agaw) {
4124 		struct dma_pte *pte;
4125 
4126 		pte = dmar_domain->pgd;
4127 		if (dma_pte_present(pte)) {
4128 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4129 			free_pgtable_page(pte);
4130 		}
4131 		dmar_domain->agaw--;
4132 	}
4133 
4134 	return 0;
4135 }
4136 
4137 static int intel_iommu_attach_device(struct iommu_domain *domain,
4138 				     struct device *dev)
4139 {
4140 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4141 	int ret;
4142 
4143 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4144 	    device_is_rmrr_locked(dev)) {
4145 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4146 		return -EPERM;
4147 	}
4148 
4149 	if (info->domain)
4150 		device_block_translation(dev);
4151 
4152 	ret = prepare_domain_attach_device(domain, dev);
4153 	if (ret)
4154 		return ret;
4155 
4156 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4157 }
4158 
4159 static int intel_iommu_map(struct iommu_domain *domain,
4160 			   unsigned long iova, phys_addr_t hpa,
4161 			   size_t size, int iommu_prot, gfp_t gfp)
4162 {
4163 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4164 	u64 max_addr;
4165 	int prot = 0;
4166 
4167 	if (iommu_prot & IOMMU_READ)
4168 		prot |= DMA_PTE_READ;
4169 	if (iommu_prot & IOMMU_WRITE)
4170 		prot |= DMA_PTE_WRITE;
4171 	if (dmar_domain->set_pte_snp)
4172 		prot |= DMA_PTE_SNP;
4173 
4174 	max_addr = iova + size;
4175 	if (dmar_domain->max_addr < max_addr) {
4176 		u64 end;
4177 
4178 		/* check if minimum agaw is sufficient for mapped address */
4179 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4180 		if (end < max_addr) {
4181 			pr_err("%s: iommu width (%d) is not "
4182 			       "sufficient for the mapped address (%llx)\n",
4183 			       __func__, dmar_domain->gaw, max_addr);
4184 			return -EFAULT;
4185 		}
4186 		dmar_domain->max_addr = max_addr;
4187 	}
4188 	/* Round up size to next multiple of PAGE_SIZE, if it and
4189 	   the low bits of hpa would take us onto the next page */
4190 	size = aligned_nrpages(hpa, size);
4191 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4192 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4193 }
4194 
4195 static int intel_iommu_map_pages(struct iommu_domain *domain,
4196 				 unsigned long iova, phys_addr_t paddr,
4197 				 size_t pgsize, size_t pgcount,
4198 				 int prot, gfp_t gfp, size_t *mapped)
4199 {
4200 	unsigned long pgshift = __ffs(pgsize);
4201 	size_t size = pgcount << pgshift;
4202 	int ret;
4203 
4204 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4205 		return -EINVAL;
4206 
4207 	if (!IS_ALIGNED(iova | paddr, pgsize))
4208 		return -EINVAL;
4209 
4210 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4211 	if (!ret && mapped)
4212 		*mapped = size;
4213 
4214 	return ret;
4215 }
4216 
4217 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4218 				unsigned long iova, size_t size,
4219 				struct iommu_iotlb_gather *gather)
4220 {
4221 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4222 	unsigned long start_pfn, last_pfn;
4223 	int level = 0;
4224 
4225 	/* Cope with horrid API which requires us to unmap more than the
4226 	   size argument if it happens to be a large-page mapping. */
4227 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4228 				     &level, GFP_ATOMIC)))
4229 		return 0;
4230 
4231 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4232 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4233 
4234 	start_pfn = iova >> VTD_PAGE_SHIFT;
4235 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4236 
4237 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4238 
4239 	if (dmar_domain->max_addr == iova + size)
4240 		dmar_domain->max_addr = iova;
4241 
4242 	/*
4243 	 * We do not use page-selective IOTLB invalidation in flush queue,
4244 	 * so there is no need to track page and sync iotlb.
4245 	 */
4246 	if (!iommu_iotlb_gather_queued(gather))
4247 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4248 
4249 	return size;
4250 }
4251 
4252 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4253 				      unsigned long iova,
4254 				      size_t pgsize, size_t pgcount,
4255 				      struct iommu_iotlb_gather *gather)
4256 {
4257 	unsigned long pgshift = __ffs(pgsize);
4258 	size_t size = pgcount << pgshift;
4259 
4260 	return intel_iommu_unmap(domain, iova, size, gather);
4261 }
4262 
4263 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4264 				 struct iommu_iotlb_gather *gather)
4265 {
4266 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4267 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4268 	size_t size = gather->end - gather->start;
4269 	struct iommu_domain_info *info;
4270 	unsigned long start_pfn;
4271 	unsigned long nrpages;
4272 	unsigned long i;
4273 
4274 	nrpages = aligned_nrpages(gather->start, size);
4275 	start_pfn = mm_to_dma_pfn(iova_pfn);
4276 
4277 	xa_for_each(&dmar_domain->iommu_array, i, info)
4278 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4279 				      start_pfn, nrpages,
4280 				      list_empty(&gather->freelist), 0);
4281 
4282 	put_pages_list(&gather->freelist);
4283 }
4284 
4285 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4286 					    dma_addr_t iova)
4287 {
4288 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4289 	struct dma_pte *pte;
4290 	int level = 0;
4291 	u64 phys = 0;
4292 
4293 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4294 			     GFP_ATOMIC);
4295 	if (pte && dma_pte_present(pte))
4296 		phys = dma_pte_addr(pte) +
4297 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4298 						VTD_PAGE_SHIFT) - 1));
4299 
4300 	return phys;
4301 }
4302 
4303 static bool domain_support_force_snooping(struct dmar_domain *domain)
4304 {
4305 	struct device_domain_info *info;
4306 	bool support = true;
4307 
4308 	assert_spin_locked(&domain->lock);
4309 	list_for_each_entry(info, &domain->devices, link) {
4310 		if (!ecap_sc_support(info->iommu->ecap)) {
4311 			support = false;
4312 			break;
4313 		}
4314 	}
4315 
4316 	return support;
4317 }
4318 
4319 static void domain_set_force_snooping(struct dmar_domain *domain)
4320 {
4321 	struct device_domain_info *info;
4322 
4323 	assert_spin_locked(&domain->lock);
4324 	/*
4325 	 * Second level page table supports per-PTE snoop control. The
4326 	 * iommu_map() interface will handle this by setting SNP bit.
4327 	 */
4328 	if (!domain->use_first_level) {
4329 		domain->set_pte_snp = true;
4330 		return;
4331 	}
4332 
4333 	list_for_each_entry(info, &domain->devices, link)
4334 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4335 						     PASID_RID2PASID);
4336 }
4337 
4338 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4339 {
4340 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4341 	unsigned long flags;
4342 
4343 	if (dmar_domain->force_snooping)
4344 		return true;
4345 
4346 	spin_lock_irqsave(&dmar_domain->lock, flags);
4347 	if (!domain_support_force_snooping(dmar_domain)) {
4348 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4349 		return false;
4350 	}
4351 
4352 	domain_set_force_snooping(dmar_domain);
4353 	dmar_domain->force_snooping = true;
4354 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4355 
4356 	return true;
4357 }
4358 
4359 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4360 {
4361 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4362 
4363 	switch (cap) {
4364 	case IOMMU_CAP_CACHE_COHERENCY:
4365 	case IOMMU_CAP_DEFERRED_FLUSH:
4366 		return true;
4367 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4368 		return dmar_platform_optin();
4369 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4370 		return ecap_sc_support(info->iommu->ecap);
4371 	default:
4372 		return false;
4373 	}
4374 }
4375 
4376 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4377 {
4378 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4379 	struct device_domain_info *info;
4380 	struct intel_iommu *iommu;
4381 	u8 bus, devfn;
4382 	int ret;
4383 
4384 	iommu = device_to_iommu(dev, &bus, &devfn);
4385 	if (!iommu || !iommu->iommu.ops)
4386 		return ERR_PTR(-ENODEV);
4387 
4388 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4389 	if (!info)
4390 		return ERR_PTR(-ENOMEM);
4391 
4392 	if (dev_is_real_dma_subdevice(dev)) {
4393 		info->bus = pdev->bus->number;
4394 		info->devfn = pdev->devfn;
4395 		info->segment = pci_domain_nr(pdev->bus);
4396 	} else {
4397 		info->bus = bus;
4398 		info->devfn = devfn;
4399 		info->segment = iommu->segment;
4400 	}
4401 
4402 	info->dev = dev;
4403 	info->iommu = iommu;
4404 	if (dev_is_pci(dev)) {
4405 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4406 		    pci_ats_supported(pdev) &&
4407 		    dmar_ats_supported(pdev, iommu)) {
4408 			info->ats_supported = 1;
4409 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4410 
4411 			/*
4412 			 * For IOMMU that supports device IOTLB throttling
4413 			 * (DIT), we assign PFSID to the invalidation desc
4414 			 * of a VF such that IOMMU HW can gauge queue depth
4415 			 * at PF level. If DIT is not set, PFSID will be
4416 			 * treated as reserved, which should be set to 0.
4417 			 */
4418 			if (ecap_dit(iommu->ecap))
4419 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4420 			info->ats_qdep = pci_ats_queue_depth(pdev);
4421 		}
4422 		if (sm_supported(iommu)) {
4423 			if (pasid_supported(iommu)) {
4424 				int features = pci_pasid_features(pdev);
4425 
4426 				if (features >= 0)
4427 					info->pasid_supported = features | 1;
4428 			}
4429 
4430 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4431 			    pci_pri_supported(pdev))
4432 				info->pri_supported = 1;
4433 		}
4434 	}
4435 
4436 	dev_iommu_priv_set(dev, info);
4437 
4438 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4439 		ret = intel_pasid_alloc_table(dev);
4440 		if (ret) {
4441 			dev_err(dev, "PASID table allocation failed\n");
4442 			dev_iommu_priv_set(dev, NULL);
4443 			kfree(info);
4444 			return ERR_PTR(ret);
4445 		}
4446 	}
4447 
4448 	return &iommu->iommu;
4449 }
4450 
4451 static void intel_iommu_release_device(struct device *dev)
4452 {
4453 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4454 
4455 	dmar_remove_one_dev_info(dev);
4456 	intel_pasid_free_table(dev);
4457 	dev_iommu_priv_set(dev, NULL);
4458 	kfree(info);
4459 	set_dma_ops(dev, NULL);
4460 }
4461 
4462 static void intel_iommu_probe_finalize(struct device *dev)
4463 {
4464 	set_dma_ops(dev, NULL);
4465 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4466 }
4467 
4468 static void intel_iommu_get_resv_regions(struct device *device,
4469 					 struct list_head *head)
4470 {
4471 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4472 	struct iommu_resv_region *reg;
4473 	struct dmar_rmrr_unit *rmrr;
4474 	struct device *i_dev;
4475 	int i;
4476 
4477 	rcu_read_lock();
4478 	for_each_rmrr_units(rmrr) {
4479 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4480 					  i, i_dev) {
4481 			struct iommu_resv_region *resv;
4482 			enum iommu_resv_type type;
4483 			size_t length;
4484 
4485 			if (i_dev != device &&
4486 			    !is_downstream_to_pci_bridge(device, i_dev))
4487 				continue;
4488 
4489 			length = rmrr->end_address - rmrr->base_address + 1;
4490 
4491 			type = device_rmrr_is_relaxable(device) ?
4492 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4493 
4494 			resv = iommu_alloc_resv_region(rmrr->base_address,
4495 						       length, prot, type,
4496 						       GFP_ATOMIC);
4497 			if (!resv)
4498 				break;
4499 
4500 			list_add_tail(&resv->list, head);
4501 		}
4502 	}
4503 	rcu_read_unlock();
4504 
4505 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4506 	if (dev_is_pci(device)) {
4507 		struct pci_dev *pdev = to_pci_dev(device);
4508 
4509 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4510 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4511 					IOMMU_RESV_DIRECT_RELAXABLE,
4512 					GFP_KERNEL);
4513 			if (reg)
4514 				list_add_tail(&reg->list, head);
4515 		}
4516 	}
4517 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4518 
4519 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4520 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4521 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4522 	if (!reg)
4523 		return;
4524 	list_add_tail(&reg->list, head);
4525 }
4526 
4527 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4528 {
4529 	if (dev_is_pci(dev))
4530 		return pci_device_group(dev);
4531 	return generic_device_group(dev);
4532 }
4533 
4534 static int intel_iommu_enable_sva(struct device *dev)
4535 {
4536 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4537 	struct intel_iommu *iommu;
4538 
4539 	if (!info || dmar_disabled)
4540 		return -EINVAL;
4541 
4542 	iommu = info->iommu;
4543 	if (!iommu)
4544 		return -EINVAL;
4545 
4546 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4547 		return -ENODEV;
4548 
4549 	if (!info->pasid_enabled || !info->ats_enabled)
4550 		return -EINVAL;
4551 
4552 	/*
4553 	 * Devices having device-specific I/O fault handling should not
4554 	 * support PCI/PRI. The IOMMU side has no means to check the
4555 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4556 	 * default that if the device driver enables SVA on a non-PRI
4557 	 * device, it will handle IOPF in its own way.
4558 	 */
4559 	if (!info->pri_supported)
4560 		return 0;
4561 
4562 	/* Devices supporting PRI should have it enabled. */
4563 	if (!info->pri_enabled)
4564 		return -EINVAL;
4565 
4566 	return 0;
4567 }
4568 
4569 static int intel_iommu_enable_iopf(struct device *dev)
4570 {
4571 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4572 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4573 	struct intel_iommu *iommu;
4574 	int ret;
4575 
4576 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4577 		return -ENODEV;
4578 
4579 	if (info->pri_enabled)
4580 		return -EBUSY;
4581 
4582 	iommu = info->iommu;
4583 	if (!iommu)
4584 		return -EINVAL;
4585 
4586 	/* PASID is required in PRG Response Message. */
4587 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4588 		return -EINVAL;
4589 
4590 	ret = pci_reset_pri(pdev);
4591 	if (ret)
4592 		return ret;
4593 
4594 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4595 	if (ret)
4596 		return ret;
4597 
4598 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4599 	if (ret)
4600 		goto iopf_remove_device;
4601 
4602 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4603 	if (ret)
4604 		goto iopf_unregister_handler;
4605 	info->pri_enabled = 1;
4606 
4607 	return 0;
4608 
4609 iopf_unregister_handler:
4610 	iommu_unregister_device_fault_handler(dev);
4611 iopf_remove_device:
4612 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4613 
4614 	return ret;
4615 }
4616 
4617 static int intel_iommu_disable_iopf(struct device *dev)
4618 {
4619 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4620 	struct intel_iommu *iommu = info->iommu;
4621 
4622 	if (!info->pri_enabled)
4623 		return -EINVAL;
4624 
4625 	/*
4626 	 * PCIe spec states that by clearing PRI enable bit, the Page
4627 	 * Request Interface will not issue new page requests, but has
4628 	 * outstanding page requests that have been transmitted or are
4629 	 * queued for transmission. This is supposed to be called after
4630 	 * the device driver has stopped DMA, all PASIDs have been
4631 	 * unbound and the outstanding PRQs have been drained.
4632 	 */
4633 	pci_disable_pri(to_pci_dev(dev));
4634 	info->pri_enabled = 0;
4635 
4636 	/*
4637 	 * With PRI disabled and outstanding PRQs drained, unregistering
4638 	 * fault handler and removing device from iopf queue should never
4639 	 * fail.
4640 	 */
4641 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4642 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4643 
4644 	return 0;
4645 }
4646 
4647 static int
4648 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4649 {
4650 	switch (feat) {
4651 	case IOMMU_DEV_FEAT_IOPF:
4652 		return intel_iommu_enable_iopf(dev);
4653 
4654 	case IOMMU_DEV_FEAT_SVA:
4655 		return intel_iommu_enable_sva(dev);
4656 
4657 	default:
4658 		return -ENODEV;
4659 	}
4660 }
4661 
4662 static int
4663 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4664 {
4665 	switch (feat) {
4666 	case IOMMU_DEV_FEAT_IOPF:
4667 		return intel_iommu_disable_iopf(dev);
4668 
4669 	case IOMMU_DEV_FEAT_SVA:
4670 		return 0;
4671 
4672 	default:
4673 		return -ENODEV;
4674 	}
4675 }
4676 
4677 static bool intel_iommu_is_attach_deferred(struct device *dev)
4678 {
4679 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4680 
4681 	return translation_pre_enabled(info->iommu) && !info->domain;
4682 }
4683 
4684 /*
4685  * Check that the device does not live on an external facing PCI port that is
4686  * marked as untrusted. Such devices should not be able to apply quirks and
4687  * thus not be able to bypass the IOMMU restrictions.
4688  */
4689 static bool risky_device(struct pci_dev *pdev)
4690 {
4691 	if (pdev->untrusted) {
4692 		pci_info(pdev,
4693 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4694 			 pdev->vendor, pdev->device);
4695 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4696 		return true;
4697 	}
4698 	return false;
4699 }
4700 
4701 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4702 				       unsigned long iova, size_t size)
4703 {
4704 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4705 	unsigned long pages = aligned_nrpages(iova, size);
4706 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4707 	struct iommu_domain_info *info;
4708 	unsigned long i;
4709 
4710 	xa_for_each(&dmar_domain->iommu_array, i, info)
4711 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4712 }
4713 
4714 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4715 {
4716 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4717 	struct iommu_domain *domain;
4718 
4719 	/* Domain type specific cleanup: */
4720 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4721 	if (domain) {
4722 		switch (domain->type) {
4723 		case IOMMU_DOMAIN_SVA:
4724 			intel_svm_remove_dev_pasid(dev, pasid);
4725 			break;
4726 		default:
4727 			/* should never reach here */
4728 			WARN_ON(1);
4729 			break;
4730 		}
4731 	}
4732 
4733 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4734 }
4735 
4736 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4737 {
4738 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4739 	struct intel_iommu *iommu = info->iommu;
4740 	struct iommu_hw_info_vtd *vtd;
4741 
4742 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4743 	if (!vtd)
4744 		return ERR_PTR(-ENOMEM);
4745 
4746 	vtd->cap_reg = iommu->cap;
4747 	vtd->ecap_reg = iommu->ecap;
4748 	*length = sizeof(*vtd);
4749 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4750 	return vtd;
4751 }
4752 
4753 const struct iommu_ops intel_iommu_ops = {
4754 	.capable		= intel_iommu_capable,
4755 	.hw_info		= intel_iommu_hw_info,
4756 	.domain_alloc		= intel_iommu_domain_alloc,
4757 	.probe_device		= intel_iommu_probe_device,
4758 	.probe_finalize		= intel_iommu_probe_finalize,
4759 	.release_device		= intel_iommu_release_device,
4760 	.get_resv_regions	= intel_iommu_get_resv_regions,
4761 	.device_group		= intel_iommu_device_group,
4762 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4763 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4764 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4765 	.def_domain_type	= device_def_domain_type,
4766 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4767 	.pgsize_bitmap		= SZ_4K,
4768 #ifdef CONFIG_INTEL_IOMMU_SVM
4769 	.page_response		= intel_svm_page_response,
4770 #endif
4771 	.default_domain_ops = &(const struct iommu_domain_ops) {
4772 		.attach_dev		= intel_iommu_attach_device,
4773 		.map_pages		= intel_iommu_map_pages,
4774 		.unmap_pages		= intel_iommu_unmap_pages,
4775 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4776 		.flush_iotlb_all        = intel_flush_iotlb_all,
4777 		.iotlb_sync		= intel_iommu_tlb_sync,
4778 		.iova_to_phys		= intel_iommu_iova_to_phys,
4779 		.free			= intel_iommu_domain_free,
4780 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4781 	}
4782 };
4783 
4784 static void quirk_iommu_igfx(struct pci_dev *dev)
4785 {
4786 	if (risky_device(dev))
4787 		return;
4788 
4789 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4790 	dmar_map_gfx = 0;
4791 }
4792 
4793 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4801 
4802 /* Broadwell igfx malfunctions with dmar */
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4827 
4828 static void quirk_iommu_rwbf(struct pci_dev *dev)
4829 {
4830 	if (risky_device(dev))
4831 		return;
4832 
4833 	/*
4834 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4835 	 * but needs it. Same seems to hold for the desktop versions.
4836 	 */
4837 	pci_info(dev, "Forcing write-buffer flush capability\n");
4838 	rwbf_quirk = 1;
4839 }
4840 
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4848 
4849 #define GGC 0x52
4850 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4851 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4852 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4853 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4854 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4855 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4856 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4857 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4858 
4859 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4860 {
4861 	unsigned short ggc;
4862 
4863 	if (risky_device(dev))
4864 		return;
4865 
4866 	if (pci_read_config_word(dev, GGC, &ggc))
4867 		return;
4868 
4869 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4870 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4871 		dmar_map_gfx = 0;
4872 	} else if (dmar_map_gfx) {
4873 		/* we have to ensure the gfx device is idle before we flush */
4874 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4875 		iommu_set_dma_strict();
4876 	}
4877 }
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4882 
4883 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4884 {
4885 	unsigned short ver;
4886 
4887 	if (!IS_GFX_DEVICE(dev))
4888 		return;
4889 
4890 	ver = (dev->device >> 8) & 0xff;
4891 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4892 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4893 	    ver != 0x9a && ver != 0xa7)
4894 		return;
4895 
4896 	if (risky_device(dev))
4897 		return;
4898 
4899 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4900 	iommu_skip_te_disable = 1;
4901 }
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4903 
4904 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4905    ISOCH DMAR unit for the Azalia sound device, but not give it any
4906    TLB entries, which causes it to deadlock. Check for that.  We do
4907    this in a function called from init_dmars(), instead of in a PCI
4908    quirk, because we don't want to print the obnoxious "BIOS broken"
4909    message if VT-d is actually disabled.
4910 */
4911 static void __init check_tylersburg_isoch(void)
4912 {
4913 	struct pci_dev *pdev;
4914 	uint32_t vtisochctrl;
4915 
4916 	/* If there's no Azalia in the system anyway, forget it. */
4917 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4918 	if (!pdev)
4919 		return;
4920 
4921 	if (risky_device(pdev)) {
4922 		pci_dev_put(pdev);
4923 		return;
4924 	}
4925 
4926 	pci_dev_put(pdev);
4927 
4928 	/* System Management Registers. Might be hidden, in which case
4929 	   we can't do the sanity check. But that's OK, because the
4930 	   known-broken BIOSes _don't_ actually hide it, so far. */
4931 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4932 	if (!pdev)
4933 		return;
4934 
4935 	if (risky_device(pdev)) {
4936 		pci_dev_put(pdev);
4937 		return;
4938 	}
4939 
4940 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4941 		pci_dev_put(pdev);
4942 		return;
4943 	}
4944 
4945 	pci_dev_put(pdev);
4946 
4947 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4948 	if (vtisochctrl & 1)
4949 		return;
4950 
4951 	/* Drop all bits other than the number of TLB entries */
4952 	vtisochctrl &= 0x1c;
4953 
4954 	/* If we have the recommended number of TLB entries (16), fine. */
4955 	if (vtisochctrl == 0x10)
4956 		return;
4957 
4958 	/* Zero TLB entries? You get to ride the short bus to school. */
4959 	if (!vtisochctrl) {
4960 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4961 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4962 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4963 		     dmi_get_system_info(DMI_BIOS_VERSION),
4964 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4965 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4966 		return;
4967 	}
4968 
4969 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4970 	       vtisochctrl);
4971 }
4972 
4973 /*
4974  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4975  * invalidation completion before posted writes initiated with translated address
4976  * that utilized translations matching the invalidation address range, violating
4977  * the invalidation completion ordering.
4978  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4979  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4980  * under the control of the trusted/privileged host device driver must use this
4981  * quirk.
4982  * Device TLBs are invalidated under the following six conditions:
4983  * 1. Device driver does DMA API unmap IOVA
4984  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4985  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4986  *    exit_mmap() due to crash
4987  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4988  *    VM has to free pages that were unmapped
4989  * 5. Userspace driver unmaps a DMA buffer
4990  * 6. Cache invalidation in vSVA usage (upcoming)
4991  *
4992  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4993  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4994  * invalidate TLB the same way as normal user unmap which will use this quirk.
4995  * The dTLB invalidation after PASID cache flush does not need this quirk.
4996  *
4997  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4998  */
4999 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5000 			       unsigned long address, unsigned long mask,
5001 			       u32 pasid, u16 qdep)
5002 {
5003 	u16 sid;
5004 
5005 	if (likely(!info->dtlb_extra_inval))
5006 		return;
5007 
5008 	sid = PCI_DEVID(info->bus, info->devfn);
5009 	if (pasid == PASID_RID2PASID) {
5010 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5011 				   qdep, address, mask);
5012 	} else {
5013 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5014 					 pasid, qdep, address, mask);
5015 	}
5016 }
5017 
5018 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5019 
5020 /*
5021  * Function to submit a command to the enhanced command interface. The
5022  * valid enhanced command descriptions are defined in Table 47 of the
5023  * VT-d spec. The VT-d hardware implementation may support some but not
5024  * all commands, which can be determined by checking the Enhanced
5025  * Command Capability Register.
5026  *
5027  * Return values:
5028  *  - 0: Command successful without any error;
5029  *  - Negative: software error value;
5030  *  - Nonzero positive: failure status code defined in Table 48.
5031  */
5032 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5033 {
5034 	unsigned long flags;
5035 	u64 res;
5036 	int ret;
5037 
5038 	if (!cap_ecmds(iommu->cap))
5039 		return -ENODEV;
5040 
5041 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5042 
5043 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5044 	if (res & DMA_ECMD_ECRSP_IP) {
5045 		ret = -EBUSY;
5046 		goto err;
5047 	}
5048 
5049 	/*
5050 	 * Unconditionally write the operand B, because
5051 	 * - There is no side effect if an ecmd doesn't require an
5052 	 *   operand B, but we set the register to some value.
5053 	 * - It's not invoked in any critical path. The extra MMIO
5054 	 *   write doesn't bring any performance concerns.
5055 	 */
5056 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5057 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5058 
5059 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5060 		      !(res & DMA_ECMD_ECRSP_IP), res);
5061 
5062 	if (res & DMA_ECMD_ECRSP_IP) {
5063 		ret = -ETIMEDOUT;
5064 		goto err;
5065 	}
5066 
5067 	ret = ecmd_get_status_code(res);
5068 err:
5069 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5070 
5071 	return ret;
5072 }
5073