xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 37dd6b9f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
238 /*
239  * This domain is a statically identity mapping domain.
240  *	1. This domain creats a static 1:1 mapping to all usable memory.
241  * 	2. It maps to each iommu if successful.
242  *	3. Each iommu mapps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
282 
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285 
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288 
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
293 
294 #define IDENTMAP_GFX		2
295 #define IDENTMAP_AZALIA		4
296 
297 const struct iommu_ops intel_iommu_ops;
298 
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
300 {
301 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302 }
303 
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307 }
308 
309 static void init_translation_status(struct intel_iommu *iommu)
310 {
311 	u32 gsts;
312 
313 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 	if (gsts & DMA_GSTS_TES)
315 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316 }
317 
318 static int __init intel_iommu_setup(char *str)
319 {
320 	if (!str)
321 		return -EINVAL;
322 
323 	while (*str) {
324 		if (!strncmp(str, "on", 2)) {
325 			dmar_disabled = 0;
326 			pr_info("IOMMU enabled\n");
327 		} else if (!strncmp(str, "off", 3)) {
328 			dmar_disabled = 1;
329 			no_platform_optin = 1;
330 			pr_info("IOMMU disabled\n");
331 		} else if (!strncmp(str, "igfx_off", 8)) {
332 			dmar_map_gfx = 0;
333 			pr_info("Disable GFX device mapping\n");
334 		} else if (!strncmp(str, "forcedac", 8)) {
335 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 			iommu_dma_forcedac = true;
337 		} else if (!strncmp(str, "strict", 6)) {
338 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 			iommu_set_dma_strict();
340 		} else if (!strncmp(str, "sp_off", 6)) {
341 			pr_info("Disable supported super page\n");
342 			intel_iommu_superpage = 0;
343 		} else if (!strncmp(str, "sm_on", 5)) {
344 			pr_info("Enable scalable mode if hardware supports\n");
345 			intel_iommu_sm = 1;
346 		} else if (!strncmp(str, "sm_off", 6)) {
347 			pr_info("Scalable mode is disallowed\n");
348 			intel_iommu_sm = 0;
349 		} else if (!strncmp(str, "tboot_noforce", 13)) {
350 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 			intel_iommu_tboot_noforce = 1;
352 		} else {
353 			pr_notice("Unknown option - '%s'\n", str);
354 		}
355 
356 		str += strcspn(str, ",");
357 		while (*str == ',')
358 			str++;
359 	}
360 
361 	return 1;
362 }
363 __setup("intel_iommu=", intel_iommu_setup);
364 
365 void *alloc_pgtable_page(int node, gfp_t gfp)
366 {
367 	struct page *page;
368 	void *vaddr = NULL;
369 
370 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
371 	if (page)
372 		vaddr = page_address(page);
373 	return vaddr;
374 }
375 
376 void free_pgtable_page(void *vaddr)
377 {
378 	free_page((unsigned long)vaddr);
379 }
380 
381 static inline int domain_type_is_si(struct dmar_domain *domain)
382 {
383 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384 }
385 
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
387 				       unsigned long pfn)
388 {
389 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390 
391 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392 }
393 
394 /*
395  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397  * the returned SAGAW.
398  */
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400 {
401 	unsigned long fl_sagaw, sl_sagaw;
402 
403 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 	sl_sagaw = cap_sagaw(iommu->cap);
405 
406 	/* Second level only. */
407 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 		return sl_sagaw;
409 
410 	/* First level only. */
411 	if (!ecap_slts(iommu->ecap))
412 		return fl_sagaw;
413 
414 	return fl_sagaw & sl_sagaw;
415 }
416 
417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418 {
419 	unsigned long sagaw;
420 	int agaw;
421 
422 	sagaw = __iommu_calculate_sagaw(iommu);
423 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 		if (test_bit(agaw, &sagaw))
425 			break;
426 	}
427 
428 	return agaw;
429 }
430 
431 /*
432  * Calculate max SAGAW for each iommu.
433  */
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435 {
436 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437 }
438 
439 /*
440  * calculate agaw for each iommu.
441  * "SAGAW" may be different across iommus, use a default agaw, and
442  * get a supported less agaw for iommus that don't support the default agaw.
443  */
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
445 {
446 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447 }
448 
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450 {
451 	return sm_supported(iommu) ?
452 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453 }
454 
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
456 {
457 	struct iommu_domain_info *info;
458 	struct dmar_drhd_unit *drhd;
459 	struct intel_iommu *iommu;
460 	bool found = false;
461 	unsigned long i;
462 
463 	domain->iommu_coherency = true;
464 	xa_for_each(&domain->iommu_array, i, info) {
465 		found = true;
466 		if (!iommu_paging_structure_coherency(info->iommu)) {
467 			domain->iommu_coherency = false;
468 			break;
469 		}
470 	}
471 	if (found)
472 		return;
473 
474 	/* No hardware attached; use lowest common denominator */
475 	rcu_read_lock();
476 	for_each_active_iommu(iommu, drhd) {
477 		if (!iommu_paging_structure_coherency(iommu)) {
478 			domain->iommu_coherency = false;
479 			break;
480 		}
481 	}
482 	rcu_read_unlock();
483 }
484 
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 					 struct intel_iommu *skip)
487 {
488 	struct dmar_drhd_unit *drhd;
489 	struct intel_iommu *iommu;
490 	int mask = 0x3;
491 
492 	if (!intel_iommu_superpage)
493 		return 0;
494 
495 	/* set iommu_superpage to the smallest common denominator */
496 	rcu_read_lock();
497 	for_each_active_iommu(iommu, drhd) {
498 		if (iommu != skip) {
499 			if (domain && domain->use_first_level) {
500 				if (!cap_fl1gp_support(iommu->cap))
501 					mask = 0x1;
502 			} else {
503 				mask &= cap_super_page_val(iommu->cap);
504 			}
505 
506 			if (!mask)
507 				break;
508 		}
509 	}
510 	rcu_read_unlock();
511 
512 	return fls(mask);
513 }
514 
515 static int domain_update_device_node(struct dmar_domain *domain)
516 {
517 	struct device_domain_info *info;
518 	int nid = NUMA_NO_NODE;
519 	unsigned long flags;
520 
521 	spin_lock_irqsave(&domain->lock, flags);
522 	list_for_each_entry(info, &domain->devices, link) {
523 		/*
524 		 * There could possibly be multiple device numa nodes as devices
525 		 * within the same domain may sit behind different IOMMUs. There
526 		 * isn't perfect answer in such situation, so we select first
527 		 * come first served policy.
528 		 */
529 		nid = dev_to_node(info->dev);
530 		if (nid != NUMA_NO_NODE)
531 			break;
532 	}
533 	spin_unlock_irqrestore(&domain->lock, flags);
534 
535 	return nid;
536 }
537 
538 static void domain_update_iotlb(struct dmar_domain *domain);
539 
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542 {
543 	unsigned long bitmap = 0;
544 
545 	/*
546 	 * 1-level super page supports page size of 2MiB, 2-level super page
547 	 * supports page size of both 2MiB and 1GiB.
548 	 */
549 	if (domain->iommu_superpage == 1)
550 		bitmap |= SZ_2M;
551 	else if (domain->iommu_superpage == 2)
552 		bitmap |= SZ_2M | SZ_1G;
553 
554 	return bitmap;
555 }
556 
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560 	domain_update_iommu_coherency(domain);
561 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562 
563 	/*
564 	 * If RHSA is missing, we should default to the device numa domain
565 	 * as fall back.
566 	 */
567 	if (domain->nid == NUMA_NO_NODE)
568 		domain->nid = domain_update_device_node(domain);
569 
570 	/*
571 	 * First-level translation restricts the input-address to a
572 	 * canonical address (i.e., address bits 63:N have the same
573 	 * value as address bit [N-1], where N is 48-bits with 4-level
574 	 * paging and 57-bits with 5-level paging). Hence, skip bit
575 	 * [N-1].
576 	 */
577 	if (domain->use_first_level)
578 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 	else
580 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581 
582 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 	domain_update_iotlb(domain);
584 }
585 
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 					 u8 devfn, int alloc)
588 {
589 	struct root_entry *root = &iommu->root_entry[bus];
590 	struct context_entry *context;
591 	u64 *entry;
592 
593 	/*
594 	 * Except that the caller requested to allocate a new entry,
595 	 * returning a copied context entry makes no sense.
596 	 */
597 	if (!alloc && context_copied(iommu, bus, devfn))
598 		return NULL;
599 
600 	entry = &root->lo;
601 	if (sm_supported(iommu)) {
602 		if (devfn >= 0x80) {
603 			devfn -= 0x80;
604 			entry = &root->hi;
605 		}
606 		devfn *= 2;
607 	}
608 	if (*entry & 1)
609 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 	else {
611 		unsigned long phy_addr;
612 		if (!alloc)
613 			return NULL;
614 
615 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
616 		if (!context)
617 			return NULL;
618 
619 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 		phy_addr = virt_to_phys((void *)context);
621 		*entry = phy_addr | 1;
622 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
623 	}
624 	return &context[devfn];
625 }
626 
627 /**
628  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629  *				 sub-hierarchy of a candidate PCI-PCI bridge
630  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631  * @bridge: the candidate PCI-PCI bridge
632  *
633  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634  */
635 static bool
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637 {
638 	struct pci_dev *pdev, *pbridge;
639 
640 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 		return false;
642 
643 	pdev = to_pci_dev(dev);
644 	pbridge = to_pci_dev(bridge);
645 
646 	if (pbridge->subordinate &&
647 	    pbridge->subordinate->number <= pdev->bus->number &&
648 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 		return true;
650 
651 	return false;
652 }
653 
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655 {
656 	struct dmar_drhd_unit *drhd;
657 	u32 vtbar;
658 	int rc;
659 
660 	/* We know that this device on this chipset has its own IOMMU.
661 	 * If we find it under a different IOMMU, then the BIOS is lying
662 	 * to us. Hope that the IOMMU for this device is actually
663 	 * disabled, and it needs no translation...
664 	 */
665 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 	if (rc) {
667 		/* "can't" happen */
668 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 		return false;
670 	}
671 	vtbar &= 0xffff0000;
672 
673 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
674 	drhd = dmar_find_matched_drhd_unit(pdev);
675 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 		return true;
679 	}
680 
681 	return false;
682 }
683 
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685 {
686 	if (!iommu || iommu->drhd->ignored)
687 		return true;
688 
689 	if (dev_is_pci(dev)) {
690 		struct pci_dev *pdev = to_pci_dev(dev);
691 
692 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 		    quirk_ioat_snb_local_iommu(pdev))
695 			return true;
696 	}
697 
698 	return false;
699 }
700 
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702 {
703 	struct dmar_drhd_unit *drhd = NULL;
704 	struct pci_dev *pdev = NULL;
705 	struct intel_iommu *iommu;
706 	struct device *tmp;
707 	u16 segment = 0;
708 	int i;
709 
710 	if (!dev)
711 		return NULL;
712 
713 	if (dev_is_pci(dev)) {
714 		struct pci_dev *pf_pdev;
715 
716 		pdev = pci_real_dma_dev(to_pci_dev(dev));
717 
718 		/* VFs aren't listed in scope tables; we need to look up
719 		 * the PF instead to find the IOMMU. */
720 		pf_pdev = pci_physfn(pdev);
721 		dev = &pf_pdev->dev;
722 		segment = pci_domain_nr(pdev->bus);
723 	} else if (has_acpi_companion(dev))
724 		dev = &ACPI_COMPANION(dev)->dev;
725 
726 	rcu_read_lock();
727 	for_each_iommu(iommu, drhd) {
728 		if (pdev && segment != drhd->segment)
729 			continue;
730 
731 		for_each_active_dev_scope(drhd->devices,
732 					  drhd->devices_cnt, i, tmp) {
733 			if (tmp == dev) {
734 				/* For a VF use its original BDF# not that of the PF
735 				 * which we used for the IOMMU lookup. Strictly speaking
736 				 * we could do this for all PCI devices; we only need to
737 				 * get the BDF# from the scope table for ACPI matches. */
738 				if (pdev && pdev->is_virtfn)
739 					goto got_pdev;
740 
741 				if (bus && devfn) {
742 					*bus = drhd->devices[i].bus;
743 					*devfn = drhd->devices[i].devfn;
744 				}
745 				goto out;
746 			}
747 
748 			if (is_downstream_to_pci_bridge(dev, tmp))
749 				goto got_pdev;
750 		}
751 
752 		if (pdev && drhd->include_all) {
753 got_pdev:
754 			if (bus && devfn) {
755 				*bus = pdev->bus->number;
756 				*devfn = pdev->devfn;
757 			}
758 			goto out;
759 		}
760 	}
761 	iommu = NULL;
762 out:
763 	if (iommu_is_dummy(iommu, dev))
764 		iommu = NULL;
765 
766 	rcu_read_unlock();
767 
768 	return iommu;
769 }
770 
771 static void domain_flush_cache(struct dmar_domain *domain,
772 			       void *addr, int size)
773 {
774 	if (!domain->iommu_coherency)
775 		clflush_cache_range(addr, size);
776 }
777 
778 static void free_context_table(struct intel_iommu *iommu)
779 {
780 	struct context_entry *context;
781 	int i;
782 
783 	if (!iommu->root_entry)
784 		return;
785 
786 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 		context = iommu_context_addr(iommu, i, 0, 0);
788 		if (context)
789 			free_pgtable_page(context);
790 
791 		if (!sm_supported(iommu))
792 			continue;
793 
794 		context = iommu_context_addr(iommu, i, 0x80, 0);
795 		if (context)
796 			free_pgtable_page(context);
797 	}
798 
799 	free_pgtable_page(iommu->root_entry);
800 	iommu->root_entry = NULL;
801 }
802 
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806 {
807 	struct dma_pte *pte;
808 	int offset;
809 
810 	while (1) {
811 		offset = pfn_level_offset(pfn, level);
812 		pte = &parent[offset];
813 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 			pr_info("PTE not present at level %d\n", level);
815 			break;
816 		}
817 
818 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819 
820 		if (level == 1)
821 			break;
822 
823 		parent = phys_to_virt(dma_pte_addr(pte));
824 		level--;
825 	}
826 }
827 
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 			  unsigned long long addr, u32 pasid)
830 {
831 	struct pasid_dir_entry *dir, *pde;
832 	struct pasid_entry *entries, *pte;
833 	struct context_entry *ctx_entry;
834 	struct root_entry *rt_entry;
835 	int i, dir_index, index, level;
836 	u8 devfn = source_id & 0xff;
837 	u8 bus = source_id >> 8;
838 	struct dma_pte *pgtable;
839 
840 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841 
842 	/* root entry dump */
843 	rt_entry = &iommu->root_entry[bus];
844 	if (!rt_entry) {
845 		pr_info("root table entry is not present\n");
846 		return;
847 	}
848 
849 	if (sm_supported(iommu))
850 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 			rt_entry->hi, rt_entry->lo);
852 	else
853 		pr_info("root entry: 0x%016llx", rt_entry->lo);
854 
855 	/* context entry dump */
856 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 	if (!ctx_entry) {
858 		pr_info("context table entry is not present\n");
859 		return;
860 	}
861 
862 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 		ctx_entry->hi, ctx_entry->lo);
864 
865 	/* legacy mode does not require PASID entries */
866 	if (!sm_supported(iommu)) {
867 		level = agaw_to_level(ctx_entry->hi & 7);
868 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 		goto pgtable_walk;
870 	}
871 
872 	/* get the pointer to pasid directory entry */
873 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 	if (!dir) {
875 		pr_info("pasid directory entry is not present\n");
876 		return;
877 	}
878 	/* For request-without-pasid, get the pasid from context entry */
879 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
880 		pasid = PASID_RID2PASID;
881 
882 	dir_index = pasid >> PASID_PDE_SHIFT;
883 	pde = &dir[dir_index];
884 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885 
886 	/* get the pointer to the pasid table entry */
887 	entries = get_pasid_table_from_pde(pde);
888 	if (!entries) {
889 		pr_info("pasid table entry is not present\n");
890 		return;
891 	}
892 	index = pasid & PASID_PTE_MASK;
893 	pte = &entries[index];
894 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896 
897 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 	} else {
901 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 	}
904 
905 pgtable_walk:
906 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907 }
908 #endif
909 
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 				      unsigned long pfn, int *target_level,
912 				      gfp_t gfp)
913 {
914 	struct dma_pte *parent, *pte;
915 	int level = agaw_to_level(domain->agaw);
916 	int offset;
917 
918 	if (!domain_pfn_supported(domain, pfn))
919 		/* Address beyond IOMMU's addressing capabilities. */
920 		return NULL;
921 
922 	parent = domain->pgd;
923 
924 	while (1) {
925 		void *tmp_page;
926 
927 		offset = pfn_level_offset(pfn, level);
928 		pte = &parent[offset];
929 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
930 			break;
931 		if (level == *target_level)
932 			break;
933 
934 		if (!dma_pte_present(pte)) {
935 			uint64_t pteval;
936 
937 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
938 
939 			if (!tmp_page)
940 				return NULL;
941 
942 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
943 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
944 			if (domain->use_first_level)
945 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
946 
947 			if (cmpxchg64(&pte->val, 0ULL, pteval))
948 				/* Someone else set it while we were thinking; use theirs. */
949 				free_pgtable_page(tmp_page);
950 			else
951 				domain_flush_cache(domain, pte, sizeof(*pte));
952 		}
953 		if (level == 1)
954 			break;
955 
956 		parent = phys_to_virt(dma_pte_addr(pte));
957 		level--;
958 	}
959 
960 	if (!*target_level)
961 		*target_level = level;
962 
963 	return pte;
964 }
965 
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968 					 unsigned long pfn,
969 					 int level, int *large_page)
970 {
971 	struct dma_pte *parent, *pte;
972 	int total = agaw_to_level(domain->agaw);
973 	int offset;
974 
975 	parent = domain->pgd;
976 	while (level <= total) {
977 		offset = pfn_level_offset(pfn, total);
978 		pte = &parent[offset];
979 		if (level == total)
980 			return pte;
981 
982 		if (!dma_pte_present(pte)) {
983 			*large_page = total;
984 			break;
985 		}
986 
987 		if (dma_pte_superpage(pte)) {
988 			*large_page = total;
989 			return pte;
990 		}
991 
992 		parent = phys_to_virt(dma_pte_addr(pte));
993 		total--;
994 	}
995 	return NULL;
996 }
997 
998 /* clear last level pte, a tlb flush should be followed */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 				unsigned long start_pfn,
1001 				unsigned long last_pfn)
1002 {
1003 	unsigned int large_page;
1004 	struct dma_pte *first_pte, *pte;
1005 
1006 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1007 	    WARN_ON(start_pfn > last_pfn))
1008 		return;
1009 
1010 	/* we don't need lock here; nobody else touches the iova range */
1011 	do {
1012 		large_page = 1;
1013 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014 		if (!pte) {
1015 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016 			continue;
1017 		}
1018 		do {
1019 			dma_clear_pte(pte);
1020 			start_pfn += lvl_to_nr_pages(large_page);
1021 			pte++;
1022 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023 
1024 		domain_flush_cache(domain, first_pte,
1025 				   (void *)pte - (void *)first_pte);
1026 
1027 	} while (start_pfn && start_pfn <= last_pfn);
1028 }
1029 
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 			       int retain_level, struct dma_pte *pte,
1032 			       unsigned long pfn, unsigned long start_pfn,
1033 			       unsigned long last_pfn)
1034 {
1035 	pfn = max(start_pfn, pfn);
1036 	pte = &pte[pfn_level_offset(pfn, level)];
1037 
1038 	do {
1039 		unsigned long level_pfn;
1040 		struct dma_pte *level_pte;
1041 
1042 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043 			goto next;
1044 
1045 		level_pfn = pfn & level_mask(level);
1046 		level_pte = phys_to_virt(dma_pte_addr(pte));
1047 
1048 		if (level > 2) {
1049 			dma_pte_free_level(domain, level - 1, retain_level,
1050 					   level_pte, level_pfn, start_pfn,
1051 					   last_pfn);
1052 		}
1053 
1054 		/*
1055 		 * Free the page table if we're below the level we want to
1056 		 * retain and the range covers the entire table.
1057 		 */
1058 		if (level < retain_level && !(start_pfn > level_pfn ||
1059 		      last_pfn < level_pfn + level_size(level) - 1)) {
1060 			dma_clear_pte(pte);
1061 			domain_flush_cache(domain, pte, sizeof(*pte));
1062 			free_pgtable_page(level_pte);
1063 		}
1064 next:
1065 		pfn += level_size(level);
1066 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068 
1069 /*
1070  * clear last level (leaf) ptes and free page table pages below the
1071  * level we wish to keep intact.
1072  */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 				   unsigned long start_pfn,
1075 				   unsigned long last_pfn,
1076 				   int retain_level)
1077 {
1078 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1079 
1080 	/* We don't need lock here; nobody else touches the iova range */
1081 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1082 			   domain->pgd, 0, start_pfn, last_pfn);
1083 
1084 	/* free pgd */
1085 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1086 		free_pgtable_page(domain->pgd);
1087 		domain->pgd = NULL;
1088 	}
1089 }
1090 
1091 /* When a page at a given level is being unlinked from its parent, we don't
1092    need to *modify* it at all. All we need to do is make a list of all the
1093    pages which can be freed just as soon as we've flushed the IOTLB and we
1094    know the hardware page-walk will no longer touch them.
1095    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1096    be freed. */
1097 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1098 				    int level, struct dma_pte *pte,
1099 				    struct list_head *freelist)
1100 {
1101 	struct page *pg;
1102 
1103 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1104 	list_add_tail(&pg->lru, freelist);
1105 
1106 	if (level == 1)
1107 		return;
1108 
1109 	pte = page_address(pg);
1110 	do {
1111 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1113 		pte++;
1114 	} while (!first_pte_in_page(pte));
1115 }
1116 
1117 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1118 				struct dma_pte *pte, unsigned long pfn,
1119 				unsigned long start_pfn, unsigned long last_pfn,
1120 				struct list_head *freelist)
1121 {
1122 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1123 
1124 	pfn = max(start_pfn, pfn);
1125 	pte = &pte[pfn_level_offset(pfn, level)];
1126 
1127 	do {
1128 		unsigned long level_pfn = pfn & level_mask(level);
1129 
1130 		if (!dma_pte_present(pte))
1131 			goto next;
1132 
1133 		/* If range covers entire pagetable, free it */
1134 		if (start_pfn <= level_pfn &&
1135 		    last_pfn >= level_pfn + level_size(level) - 1) {
1136 			/* These suborbinate page tables are going away entirely. Don't
1137 			   bother to clear them; we're just going to *free* them. */
1138 			if (level > 1 && !dma_pte_superpage(pte))
1139 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1140 
1141 			dma_clear_pte(pte);
1142 			if (!first_pte)
1143 				first_pte = pte;
1144 			last_pte = pte;
1145 		} else if (level > 1) {
1146 			/* Recurse down into a level that isn't *entirely* obsolete */
1147 			dma_pte_clear_level(domain, level - 1,
1148 					    phys_to_virt(dma_pte_addr(pte)),
1149 					    level_pfn, start_pfn, last_pfn,
1150 					    freelist);
1151 		}
1152 next:
1153 		pfn = level_pfn + level_size(level);
1154 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155 
1156 	if (first_pte)
1157 		domain_flush_cache(domain, first_pte,
1158 				   (void *)++last_pte - (void *)first_pte);
1159 }
1160 
1161 /* We can't just free the pages because the IOMMU may still be walking
1162    the page tables, and may have cached the intermediate levels. The
1163    pages can only be freed after the IOTLB flush has been done. */
1164 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1165 			 unsigned long last_pfn, struct list_head *freelist)
1166 {
1167 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1168 	    WARN_ON(start_pfn > last_pfn))
1169 		return;
1170 
1171 	/* we don't need lock here; nobody else touches the iova range */
1172 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1173 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1174 
1175 	/* free pgd */
1176 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177 		struct page *pgd_page = virt_to_page(domain->pgd);
1178 		list_add_tail(&pgd_page->lru, freelist);
1179 		domain->pgd = NULL;
1180 	}
1181 }
1182 
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1185 {
1186 	struct root_entry *root;
1187 
1188 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1189 	if (!root) {
1190 		pr_err("Allocating root entry for %s failed\n",
1191 			iommu->name);
1192 		return -ENOMEM;
1193 	}
1194 
1195 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1196 	iommu->root_entry = root;
1197 
1198 	return 0;
1199 }
1200 
1201 static void iommu_set_root_entry(struct intel_iommu *iommu)
1202 {
1203 	u64 addr;
1204 	u32 sts;
1205 	unsigned long flag;
1206 
1207 	addr = virt_to_phys(iommu->root_entry);
1208 	if (sm_supported(iommu))
1209 		addr |= DMA_RTADDR_SMT;
1210 
1211 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1212 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1213 
1214 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1215 
1216 	/* Make sure hardware complete it */
1217 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1218 		      readl, (sts & DMA_GSTS_RTPS), sts);
1219 
1220 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1221 
1222 	/*
1223 	 * Hardware invalidates all DMA remapping hardware translation
1224 	 * caches as part of SRTP flow.
1225 	 */
1226 	if (cap_esrtps(iommu->cap))
1227 		return;
1228 
1229 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1230 	if (sm_supported(iommu))
1231 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1232 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1233 }
1234 
1235 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1236 {
1237 	u32 val;
1238 	unsigned long flag;
1239 
1240 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1241 		return;
1242 
1243 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1244 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1245 
1246 	/* Make sure hardware complete it */
1247 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1248 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1249 
1250 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251 }
1252 
1253 /* return value determine if we need a write buffer flush */
1254 static void __iommu_flush_context(struct intel_iommu *iommu,
1255 				  u16 did, u16 source_id, u8 function_mask,
1256 				  u64 type)
1257 {
1258 	u64 val = 0;
1259 	unsigned long flag;
1260 
1261 	switch (type) {
1262 	case DMA_CCMD_GLOBAL_INVL:
1263 		val = DMA_CCMD_GLOBAL_INVL;
1264 		break;
1265 	case DMA_CCMD_DOMAIN_INVL:
1266 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1267 		break;
1268 	case DMA_CCMD_DEVICE_INVL:
1269 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1270 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1271 		break;
1272 	default:
1273 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1274 			iommu->name, type);
1275 		return;
1276 	}
1277 	val |= DMA_CCMD_ICC;
1278 
1279 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1280 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1281 
1282 	/* Make sure hardware complete it */
1283 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1284 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1285 
1286 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1287 }
1288 
1289 /* return value determine if we need a write buffer flush */
1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1291 				u64 addr, unsigned int size_order, u64 type)
1292 {
1293 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1294 	u64 val = 0, val_iva = 0;
1295 	unsigned long flag;
1296 
1297 	switch (type) {
1298 	case DMA_TLB_GLOBAL_FLUSH:
1299 		/* global flush doesn't need set IVA_REG */
1300 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1301 		break;
1302 	case DMA_TLB_DSI_FLUSH:
1303 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1304 		break;
1305 	case DMA_TLB_PSI_FLUSH:
1306 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307 		/* IH bit is passed in as part of address */
1308 		val_iva = size_order | addr;
1309 		break;
1310 	default:
1311 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1312 			iommu->name, type);
1313 		return;
1314 	}
1315 
1316 	if (cap_write_drain(iommu->cap))
1317 		val |= DMA_TLB_WRITE_DRAIN;
1318 
1319 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1320 	/* Note: Only uses first TLB reg currently */
1321 	if (val_iva)
1322 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1323 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1324 
1325 	/* Make sure hardware complete it */
1326 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1327 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1328 
1329 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1330 
1331 	/* check IOTLB invalidation granularity */
1332 	if (DMA_TLB_IAIG(val) == 0)
1333 		pr_err("Flush IOTLB failed\n");
1334 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1335 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1336 			(unsigned long long)DMA_TLB_IIRG(type),
1337 			(unsigned long long)DMA_TLB_IAIG(val));
1338 }
1339 
1340 static struct device_domain_info *
1341 domain_lookup_dev_info(struct dmar_domain *domain,
1342 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1343 {
1344 	struct device_domain_info *info;
1345 	unsigned long flags;
1346 
1347 	spin_lock_irqsave(&domain->lock, flags);
1348 	list_for_each_entry(info, &domain->devices, link) {
1349 		if (info->iommu == iommu && info->bus == bus &&
1350 		    info->devfn == devfn) {
1351 			spin_unlock_irqrestore(&domain->lock, flags);
1352 			return info;
1353 		}
1354 	}
1355 	spin_unlock_irqrestore(&domain->lock, flags);
1356 
1357 	return NULL;
1358 }
1359 
1360 static void domain_update_iotlb(struct dmar_domain *domain)
1361 {
1362 	struct device_domain_info *info;
1363 	bool has_iotlb_device = false;
1364 	unsigned long flags;
1365 
1366 	spin_lock_irqsave(&domain->lock, flags);
1367 	list_for_each_entry(info, &domain->devices, link) {
1368 		if (info->ats_enabled) {
1369 			has_iotlb_device = true;
1370 			break;
1371 		}
1372 	}
1373 	domain->has_iotlb_device = has_iotlb_device;
1374 	spin_unlock_irqrestore(&domain->lock, flags);
1375 }
1376 
1377 /*
1378  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1379  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1380  * check because it applies only to the built-in QAT devices and it doesn't
1381  * grant additional privileges.
1382  */
1383 #define BUGGY_QAT_DEVID_MASK 0x4940
1384 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1385 {
1386 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1387 		return false;
1388 
1389 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1390 		return false;
1391 
1392 	return true;
1393 }
1394 
1395 static void iommu_enable_pci_caps(struct device_domain_info *info)
1396 {
1397 	struct pci_dev *pdev;
1398 
1399 	if (!dev_is_pci(info->dev))
1400 		return;
1401 
1402 	pdev = to_pci_dev(info->dev);
1403 
1404 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1405 	   the device if you enable PASID support after ATS support is
1406 	   undefined. So always enable PASID support on devices which
1407 	   have it, even if we can't yet know if we're ever going to
1408 	   use it. */
1409 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1410 		info->pasid_enabled = 1;
1411 
1412 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1413 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1414 		info->ats_enabled = 1;
1415 		domain_update_iotlb(info->domain);
1416 	}
1417 }
1418 
1419 static void iommu_disable_pci_caps(struct device_domain_info *info)
1420 {
1421 	struct pci_dev *pdev;
1422 
1423 	if (!dev_is_pci(info->dev))
1424 		return;
1425 
1426 	pdev = to_pci_dev(info->dev);
1427 
1428 	if (info->ats_enabled) {
1429 		pci_disable_ats(pdev);
1430 		info->ats_enabled = 0;
1431 		domain_update_iotlb(info->domain);
1432 	}
1433 
1434 	if (info->pasid_enabled) {
1435 		pci_disable_pasid(pdev);
1436 		info->pasid_enabled = 0;
1437 	}
1438 }
1439 
1440 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1441 				    u64 addr, unsigned int mask)
1442 {
1443 	u16 sid, qdep;
1444 
1445 	if (!info || !info->ats_enabled)
1446 		return;
1447 
1448 	sid = info->bus << 8 | info->devfn;
1449 	qdep = info->ats_qdep;
1450 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1451 			   qdep, addr, mask);
1452 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1453 }
1454 
1455 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1456 				  u64 addr, unsigned mask)
1457 {
1458 	struct device_domain_info *info;
1459 	unsigned long flags;
1460 
1461 	if (!domain->has_iotlb_device)
1462 		return;
1463 
1464 	spin_lock_irqsave(&domain->lock, flags);
1465 	list_for_each_entry(info, &domain->devices, link)
1466 		__iommu_flush_dev_iotlb(info, addr, mask);
1467 	spin_unlock_irqrestore(&domain->lock, flags);
1468 }
1469 
1470 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1471 				  struct dmar_domain *domain,
1472 				  unsigned long pfn, unsigned int pages,
1473 				  int ih, int map)
1474 {
1475 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1476 	unsigned int mask = ilog2(aligned_pages);
1477 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1478 	u16 did = domain_id_iommu(domain, iommu);
1479 
1480 	if (WARN_ON(!pages))
1481 		return;
1482 
1483 	if (ih)
1484 		ih = 1 << 6;
1485 
1486 	if (domain->use_first_level) {
1487 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1488 	} else {
1489 		unsigned long bitmask = aligned_pages - 1;
1490 
1491 		/*
1492 		 * PSI masks the low order bits of the base address. If the
1493 		 * address isn't aligned to the mask, then compute a mask value
1494 		 * needed to ensure the target range is flushed.
1495 		 */
1496 		if (unlikely(bitmask & pfn)) {
1497 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1498 
1499 			/*
1500 			 * Since end_pfn <= pfn + bitmask, the only way bits
1501 			 * higher than bitmask can differ in pfn and end_pfn is
1502 			 * by carrying. This means after masking out bitmask,
1503 			 * high bits starting with the first set bit in
1504 			 * shared_bits are all equal in both pfn and end_pfn.
1505 			 */
1506 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1507 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1508 		}
1509 
1510 		/*
1511 		 * Fallback to domain selective flush if no PSI support or
1512 		 * the size is too big.
1513 		 */
1514 		if (!cap_pgsel_inv(iommu->cap) ||
1515 		    mask > cap_max_amask_val(iommu->cap))
1516 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1517 							DMA_TLB_DSI_FLUSH);
1518 		else
1519 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1520 							DMA_TLB_PSI_FLUSH);
1521 	}
1522 
1523 	/*
1524 	 * In caching mode, changes of pages from non-present to present require
1525 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1526 	 */
1527 	if (!cap_caching_mode(iommu->cap) || !map)
1528 		iommu_flush_dev_iotlb(domain, addr, mask);
1529 }
1530 
1531 /* Notification for newly created mappings */
1532 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1533 					struct dmar_domain *domain,
1534 					unsigned long pfn, unsigned int pages)
1535 {
1536 	/*
1537 	 * It's a non-present to present mapping. Only flush if caching mode
1538 	 * and second level.
1539 	 */
1540 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1541 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1542 	else
1543 		iommu_flush_write_buffer(iommu);
1544 }
1545 
1546 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1547 {
1548 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1549 	struct iommu_domain_info *info;
1550 	unsigned long idx;
1551 
1552 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1553 		struct intel_iommu *iommu = info->iommu;
1554 		u16 did = domain_id_iommu(dmar_domain, iommu);
1555 
1556 		if (dmar_domain->use_first_level)
1557 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1558 		else
1559 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1560 						 DMA_TLB_DSI_FLUSH);
1561 
1562 		if (!cap_caching_mode(iommu->cap))
1563 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1564 	}
1565 }
1566 
1567 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1568 {
1569 	u32 pmen;
1570 	unsigned long flags;
1571 
1572 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1573 		return;
1574 
1575 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1576 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1577 	pmen &= ~DMA_PMEN_EPM;
1578 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1579 
1580 	/* wait for the protected region status bit to clear */
1581 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1582 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1583 
1584 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585 }
1586 
1587 static void iommu_enable_translation(struct intel_iommu *iommu)
1588 {
1589 	u32 sts;
1590 	unsigned long flags;
1591 
1592 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1593 	iommu->gcmd |= DMA_GCMD_TE;
1594 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595 
1596 	/* Make sure hardware complete it */
1597 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598 		      readl, (sts & DMA_GSTS_TES), sts);
1599 
1600 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1601 }
1602 
1603 static void iommu_disable_translation(struct intel_iommu *iommu)
1604 {
1605 	u32 sts;
1606 	unsigned long flag;
1607 
1608 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1609 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1610 		return;
1611 
1612 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1613 	iommu->gcmd &= ~DMA_GCMD_TE;
1614 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1615 
1616 	/* Make sure hardware complete it */
1617 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1618 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1619 
1620 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1621 }
1622 
1623 static int iommu_init_domains(struct intel_iommu *iommu)
1624 {
1625 	u32 ndomains;
1626 
1627 	ndomains = cap_ndoms(iommu->cap);
1628 	pr_debug("%s: Number of Domains supported <%d>\n",
1629 		 iommu->name, ndomains);
1630 
1631 	spin_lock_init(&iommu->lock);
1632 
1633 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1634 	if (!iommu->domain_ids)
1635 		return -ENOMEM;
1636 
1637 	/*
1638 	 * If Caching mode is set, then invalid translations are tagged
1639 	 * with domain-id 0, hence we need to pre-allocate it. We also
1640 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1641 	 * make sure it is not used for a real domain.
1642 	 */
1643 	set_bit(0, iommu->domain_ids);
1644 
1645 	/*
1646 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1647 	 * entry for first-level or pass-through translation modes should
1648 	 * be programmed with a domain id different from those used for
1649 	 * second-level or nested translation. We reserve a domain id for
1650 	 * this purpose.
1651 	 */
1652 	if (sm_supported(iommu))
1653 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1654 
1655 	return 0;
1656 }
1657 
1658 static void disable_dmar_iommu(struct intel_iommu *iommu)
1659 {
1660 	if (!iommu->domain_ids)
1661 		return;
1662 
1663 	/*
1664 	 * All iommu domains must have been detached from the devices,
1665 	 * hence there should be no domain IDs in use.
1666 	 */
1667 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1668 		    > NUM_RESERVED_DID))
1669 		return;
1670 
1671 	if (iommu->gcmd & DMA_GCMD_TE)
1672 		iommu_disable_translation(iommu);
1673 }
1674 
1675 static void free_dmar_iommu(struct intel_iommu *iommu)
1676 {
1677 	if (iommu->domain_ids) {
1678 		bitmap_free(iommu->domain_ids);
1679 		iommu->domain_ids = NULL;
1680 	}
1681 
1682 	if (iommu->copied_tables) {
1683 		bitmap_free(iommu->copied_tables);
1684 		iommu->copied_tables = NULL;
1685 	}
1686 
1687 	/* free context mapping */
1688 	free_context_table(iommu);
1689 
1690 #ifdef CONFIG_INTEL_IOMMU_SVM
1691 	if (pasid_supported(iommu)) {
1692 		if (ecap_prs(iommu->ecap))
1693 			intel_svm_finish_prq(iommu);
1694 	}
1695 #endif
1696 }
1697 
1698 /*
1699  * Check and return whether first level is used by default for
1700  * DMA translation.
1701  */
1702 static bool first_level_by_default(unsigned int type)
1703 {
1704 	/* Only SL is available in legacy mode */
1705 	if (!scalable_mode_support())
1706 		return false;
1707 
1708 	/* Only level (either FL or SL) is available, just use it */
1709 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1710 		return intel_cap_flts_sanity();
1711 
1712 	/* Both levels are available, decide it based on domain type */
1713 	return type != IOMMU_DOMAIN_UNMANAGED;
1714 }
1715 
1716 static struct dmar_domain *alloc_domain(unsigned int type)
1717 {
1718 	struct dmar_domain *domain;
1719 
1720 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1721 	if (!domain)
1722 		return NULL;
1723 
1724 	domain->nid = NUMA_NO_NODE;
1725 	if (first_level_by_default(type))
1726 		domain->use_first_level = true;
1727 	domain->has_iotlb_device = false;
1728 	INIT_LIST_HEAD(&domain->devices);
1729 	spin_lock_init(&domain->lock);
1730 	xa_init(&domain->iommu_array);
1731 
1732 	return domain;
1733 }
1734 
1735 static int domain_attach_iommu(struct dmar_domain *domain,
1736 			       struct intel_iommu *iommu)
1737 {
1738 	struct iommu_domain_info *info, *curr;
1739 	unsigned long ndomains;
1740 	int num, ret = -ENOSPC;
1741 
1742 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1743 	if (!info)
1744 		return -ENOMEM;
1745 
1746 	spin_lock(&iommu->lock);
1747 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1748 	if (curr) {
1749 		curr->refcnt++;
1750 		spin_unlock(&iommu->lock);
1751 		kfree(info);
1752 		return 0;
1753 	}
1754 
1755 	ndomains = cap_ndoms(iommu->cap);
1756 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1757 	if (num >= ndomains) {
1758 		pr_err("%s: No free domain ids\n", iommu->name);
1759 		goto err_unlock;
1760 	}
1761 
1762 	set_bit(num, iommu->domain_ids);
1763 	info->refcnt	= 1;
1764 	info->did	= num;
1765 	info->iommu	= iommu;
1766 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1767 			  NULL, info, GFP_ATOMIC);
1768 	if (curr) {
1769 		ret = xa_err(curr) ? : -EBUSY;
1770 		goto err_clear;
1771 	}
1772 	domain_update_iommu_cap(domain);
1773 
1774 	spin_unlock(&iommu->lock);
1775 	return 0;
1776 
1777 err_clear:
1778 	clear_bit(info->did, iommu->domain_ids);
1779 err_unlock:
1780 	spin_unlock(&iommu->lock);
1781 	kfree(info);
1782 	return ret;
1783 }
1784 
1785 static void domain_detach_iommu(struct dmar_domain *domain,
1786 				struct intel_iommu *iommu)
1787 {
1788 	struct iommu_domain_info *info;
1789 
1790 	spin_lock(&iommu->lock);
1791 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1792 	if (--info->refcnt == 0) {
1793 		clear_bit(info->did, iommu->domain_ids);
1794 		xa_erase(&domain->iommu_array, iommu->seq_id);
1795 		domain->nid = NUMA_NO_NODE;
1796 		domain_update_iommu_cap(domain);
1797 		kfree(info);
1798 	}
1799 	spin_unlock(&iommu->lock);
1800 }
1801 
1802 static inline int guestwidth_to_adjustwidth(int gaw)
1803 {
1804 	int agaw;
1805 	int r = (gaw - 12) % 9;
1806 
1807 	if (r == 0)
1808 		agaw = gaw;
1809 	else
1810 		agaw = gaw + 9 - r;
1811 	if (agaw > 64)
1812 		agaw = 64;
1813 	return agaw;
1814 }
1815 
1816 static void domain_exit(struct dmar_domain *domain)
1817 {
1818 	if (domain->pgd) {
1819 		LIST_HEAD(freelist);
1820 
1821 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1822 		put_pages_list(&freelist);
1823 	}
1824 
1825 	if (WARN_ON(!list_empty(&domain->devices)))
1826 		return;
1827 
1828 	kfree(domain);
1829 }
1830 
1831 /*
1832  * Get the PASID directory size for scalable mode context entry.
1833  * Value of X in the PDTS field of a scalable mode context entry
1834  * indicates PASID directory with 2^(X + 7) entries.
1835  */
1836 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1837 {
1838 	unsigned long pds, max_pde;
1839 
1840 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1841 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1842 	if (pds < 7)
1843 		return 0;
1844 
1845 	return pds - 7;
1846 }
1847 
1848 /*
1849  * Set the RID_PASID field of a scalable mode context entry. The
1850  * IOMMU hardware will use the PASID value set in this field for
1851  * DMA translations of DMA requests without PASID.
1852  */
1853 static inline void
1854 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1855 {
1856 	context->hi |= pasid & ((1 << 20) - 1);
1857 }
1858 
1859 /*
1860  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1861  * entry.
1862  */
1863 static inline void context_set_sm_dte(struct context_entry *context)
1864 {
1865 	context->lo |= BIT_ULL(2);
1866 }
1867 
1868 /*
1869  * Set the PRE(Page Request Enable) field of a scalable mode context
1870  * entry.
1871  */
1872 static inline void context_set_sm_pre(struct context_entry *context)
1873 {
1874 	context->lo |= BIT_ULL(4);
1875 }
1876 
1877 /* Convert value to context PASID directory size field coding. */
1878 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1879 
1880 static int domain_context_mapping_one(struct dmar_domain *domain,
1881 				      struct intel_iommu *iommu,
1882 				      struct pasid_table *table,
1883 				      u8 bus, u8 devfn)
1884 {
1885 	struct device_domain_info *info =
1886 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1887 	u16 did = domain_id_iommu(domain, iommu);
1888 	int translation = CONTEXT_TT_MULTI_LEVEL;
1889 	struct context_entry *context;
1890 	int ret;
1891 
1892 	if (hw_pass_through && domain_type_is_si(domain))
1893 		translation = CONTEXT_TT_PASS_THROUGH;
1894 
1895 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1896 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1897 
1898 	spin_lock(&iommu->lock);
1899 	ret = -ENOMEM;
1900 	context = iommu_context_addr(iommu, bus, devfn, 1);
1901 	if (!context)
1902 		goto out_unlock;
1903 
1904 	ret = 0;
1905 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1906 		goto out_unlock;
1907 
1908 	/*
1909 	 * For kdump cases, old valid entries may be cached due to the
1910 	 * in-flight DMA and copied pgtable, but there is no unmapping
1911 	 * behaviour for them, thus we need an explicit cache flush for
1912 	 * the newly-mapped device. For kdump, at this point, the device
1913 	 * is supposed to finish reset at its driver probe stage, so no
1914 	 * in-flight DMA will exist, and we don't need to worry anymore
1915 	 * hereafter.
1916 	 */
1917 	if (context_copied(iommu, bus, devfn)) {
1918 		u16 did_old = context_domain_id(context);
1919 
1920 		if (did_old < cap_ndoms(iommu->cap)) {
1921 			iommu->flush.flush_context(iommu, did_old,
1922 						   (((u16)bus) << 8) | devfn,
1923 						   DMA_CCMD_MASK_NOBIT,
1924 						   DMA_CCMD_DEVICE_INVL);
1925 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1926 						 DMA_TLB_DSI_FLUSH);
1927 		}
1928 
1929 		clear_context_copied(iommu, bus, devfn);
1930 	}
1931 
1932 	context_clear_entry(context);
1933 
1934 	if (sm_supported(iommu)) {
1935 		unsigned long pds;
1936 
1937 		/* Setup the PASID DIR pointer: */
1938 		pds = context_get_sm_pds(table);
1939 		context->lo = (u64)virt_to_phys(table->table) |
1940 				context_pdts(pds);
1941 
1942 		/* Setup the RID_PASID field: */
1943 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1944 
1945 		/*
1946 		 * Setup the Device-TLB enable bit and Page request
1947 		 * Enable bit:
1948 		 */
1949 		if (info && info->ats_supported)
1950 			context_set_sm_dte(context);
1951 		if (info && info->pri_supported)
1952 			context_set_sm_pre(context);
1953 		if (info && info->pasid_supported)
1954 			context_set_pasid(context);
1955 	} else {
1956 		struct dma_pte *pgd = domain->pgd;
1957 		int agaw;
1958 
1959 		context_set_domain_id(context, did);
1960 
1961 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1962 			/*
1963 			 * Skip top levels of page tables for iommu which has
1964 			 * less agaw than default. Unnecessary for PT mode.
1965 			 */
1966 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1967 				ret = -ENOMEM;
1968 				pgd = phys_to_virt(dma_pte_addr(pgd));
1969 				if (!dma_pte_present(pgd))
1970 					goto out_unlock;
1971 			}
1972 
1973 			if (info && info->ats_supported)
1974 				translation = CONTEXT_TT_DEV_IOTLB;
1975 			else
1976 				translation = CONTEXT_TT_MULTI_LEVEL;
1977 
1978 			context_set_address_root(context, virt_to_phys(pgd));
1979 			context_set_address_width(context, agaw);
1980 		} else {
1981 			/*
1982 			 * In pass through mode, AW must be programmed to
1983 			 * indicate the largest AGAW value supported by
1984 			 * hardware. And ASR is ignored by hardware.
1985 			 */
1986 			context_set_address_width(context, iommu->msagaw);
1987 		}
1988 
1989 		context_set_translation_type(context, translation);
1990 	}
1991 
1992 	context_set_fault_enable(context);
1993 	context_set_present(context);
1994 	if (!ecap_coherent(iommu->ecap))
1995 		clflush_cache_range(context, sizeof(*context));
1996 
1997 	/*
1998 	 * It's a non-present to present mapping. If hardware doesn't cache
1999 	 * non-present entry we only need to flush the write-buffer. If the
2000 	 * _does_ cache non-present entries, then it does so in the special
2001 	 * domain #0, which we have to flush:
2002 	 */
2003 	if (cap_caching_mode(iommu->cap)) {
2004 		iommu->flush.flush_context(iommu, 0,
2005 					   (((u16)bus) << 8) | devfn,
2006 					   DMA_CCMD_MASK_NOBIT,
2007 					   DMA_CCMD_DEVICE_INVL);
2008 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2009 	} else {
2010 		iommu_flush_write_buffer(iommu);
2011 	}
2012 
2013 	ret = 0;
2014 
2015 out_unlock:
2016 	spin_unlock(&iommu->lock);
2017 
2018 	return ret;
2019 }
2020 
2021 struct domain_context_mapping_data {
2022 	struct dmar_domain *domain;
2023 	struct intel_iommu *iommu;
2024 	struct pasid_table *table;
2025 };
2026 
2027 static int domain_context_mapping_cb(struct pci_dev *pdev,
2028 				     u16 alias, void *opaque)
2029 {
2030 	struct domain_context_mapping_data *data = opaque;
2031 
2032 	return domain_context_mapping_one(data->domain, data->iommu,
2033 					  data->table, PCI_BUS_NUM(alias),
2034 					  alias & 0xff);
2035 }
2036 
2037 static int
2038 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2039 {
2040 	struct domain_context_mapping_data data;
2041 	struct pasid_table *table;
2042 	struct intel_iommu *iommu;
2043 	u8 bus, devfn;
2044 
2045 	iommu = device_to_iommu(dev, &bus, &devfn);
2046 	if (!iommu)
2047 		return -ENODEV;
2048 
2049 	table = intel_pasid_get_table(dev);
2050 
2051 	if (!dev_is_pci(dev))
2052 		return domain_context_mapping_one(domain, iommu, table,
2053 						  bus, devfn);
2054 
2055 	data.domain = domain;
2056 	data.iommu = iommu;
2057 	data.table = table;
2058 
2059 	return pci_for_each_dma_alias(to_pci_dev(dev),
2060 				      &domain_context_mapping_cb, &data);
2061 }
2062 
2063 /* Returns a number of VTD pages, but aligned to MM page size */
2064 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2065 					    size_t size)
2066 {
2067 	host_addr &= ~PAGE_MASK;
2068 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2069 }
2070 
2071 /* Return largest possible superpage level for a given mapping */
2072 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2073 					  unsigned long iov_pfn,
2074 					  unsigned long phy_pfn,
2075 					  unsigned long pages)
2076 {
2077 	int support, level = 1;
2078 	unsigned long pfnmerge;
2079 
2080 	support = domain->iommu_superpage;
2081 
2082 	/* To use a large page, the virtual *and* physical addresses
2083 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2084 	   of them will mean we have to use smaller pages. So just
2085 	   merge them and check both at once. */
2086 	pfnmerge = iov_pfn | phy_pfn;
2087 
2088 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2089 		pages >>= VTD_STRIDE_SHIFT;
2090 		if (!pages)
2091 			break;
2092 		pfnmerge >>= VTD_STRIDE_SHIFT;
2093 		level++;
2094 		support--;
2095 	}
2096 	return level;
2097 }
2098 
2099 /*
2100  * Ensure that old small page tables are removed to make room for superpage(s).
2101  * We're going to add new large pages, so make sure we don't remove their parent
2102  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2103  */
2104 static void switch_to_super_page(struct dmar_domain *domain,
2105 				 unsigned long start_pfn,
2106 				 unsigned long end_pfn, int level)
2107 {
2108 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2109 	struct iommu_domain_info *info;
2110 	struct dma_pte *pte = NULL;
2111 	unsigned long i;
2112 
2113 	while (start_pfn <= end_pfn) {
2114 		if (!pte)
2115 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2116 					     GFP_ATOMIC);
2117 
2118 		if (dma_pte_present(pte)) {
2119 			dma_pte_free_pagetable(domain, start_pfn,
2120 					       start_pfn + lvl_pages - 1,
2121 					       level + 1);
2122 
2123 			xa_for_each(&domain->iommu_array, i, info)
2124 				iommu_flush_iotlb_psi(info->iommu, domain,
2125 						      start_pfn, lvl_pages,
2126 						      0, 0);
2127 		}
2128 
2129 		pte++;
2130 		start_pfn += lvl_pages;
2131 		if (first_pte_in_page(pte))
2132 			pte = NULL;
2133 	}
2134 }
2135 
2136 static int
2137 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2138 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2139 		 gfp_t gfp)
2140 {
2141 	struct dma_pte *first_pte = NULL, *pte = NULL;
2142 	unsigned int largepage_lvl = 0;
2143 	unsigned long lvl_pages = 0;
2144 	phys_addr_t pteval;
2145 	u64 attr;
2146 
2147 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2148 		return -EINVAL;
2149 
2150 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2151 		return -EINVAL;
2152 
2153 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2154 	attr |= DMA_FL_PTE_PRESENT;
2155 	if (domain->use_first_level) {
2156 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2157 		if (prot & DMA_PTE_WRITE)
2158 			attr |= DMA_FL_PTE_DIRTY;
2159 	}
2160 
2161 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2162 
2163 	while (nr_pages > 0) {
2164 		uint64_t tmp;
2165 
2166 		if (!pte) {
2167 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2168 					phys_pfn, nr_pages);
2169 
2170 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2171 					     gfp);
2172 			if (!pte)
2173 				return -ENOMEM;
2174 			first_pte = pte;
2175 
2176 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2177 
2178 			/* It is large page*/
2179 			if (largepage_lvl > 1) {
2180 				unsigned long end_pfn;
2181 				unsigned long pages_to_remove;
2182 
2183 				pteval |= DMA_PTE_LARGE_PAGE;
2184 				pages_to_remove = min_t(unsigned long, nr_pages,
2185 							nr_pte_to_next_page(pte) * lvl_pages);
2186 				end_pfn = iov_pfn + pages_to_remove - 1;
2187 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2188 			} else {
2189 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2190 			}
2191 
2192 		}
2193 		/* We don't need lock here, nobody else
2194 		 * touches the iova range
2195 		 */
2196 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2197 		if (tmp) {
2198 			static int dumps = 5;
2199 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2200 				iov_pfn, tmp, (unsigned long long)pteval);
2201 			if (dumps) {
2202 				dumps--;
2203 				debug_dma_dump_mappings(NULL);
2204 			}
2205 			WARN_ON(1);
2206 		}
2207 
2208 		nr_pages -= lvl_pages;
2209 		iov_pfn += lvl_pages;
2210 		phys_pfn += lvl_pages;
2211 		pteval += lvl_pages * VTD_PAGE_SIZE;
2212 
2213 		/* If the next PTE would be the first in a new page, then we
2214 		 * need to flush the cache on the entries we've just written.
2215 		 * And then we'll need to recalculate 'pte', so clear it and
2216 		 * let it get set again in the if (!pte) block above.
2217 		 *
2218 		 * If we're done (!nr_pages) we need to flush the cache too.
2219 		 *
2220 		 * Also if we've been setting superpages, we may need to
2221 		 * recalculate 'pte' and switch back to smaller pages for the
2222 		 * end of the mapping, if the trailing size is not enough to
2223 		 * use another superpage (i.e. nr_pages < lvl_pages).
2224 		 */
2225 		pte++;
2226 		if (!nr_pages || first_pte_in_page(pte) ||
2227 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2228 			domain_flush_cache(domain, first_pte,
2229 					   (void *)pte - (void *)first_pte);
2230 			pte = NULL;
2231 		}
2232 	}
2233 
2234 	return 0;
2235 }
2236 
2237 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2238 {
2239 	struct intel_iommu *iommu = info->iommu;
2240 	struct context_entry *context;
2241 	u16 did_old;
2242 
2243 	if (!iommu)
2244 		return;
2245 
2246 	spin_lock(&iommu->lock);
2247 	context = iommu_context_addr(iommu, bus, devfn, 0);
2248 	if (!context) {
2249 		spin_unlock(&iommu->lock);
2250 		return;
2251 	}
2252 
2253 	if (sm_supported(iommu)) {
2254 		if (hw_pass_through && domain_type_is_si(info->domain))
2255 			did_old = FLPT_DEFAULT_DID;
2256 		else
2257 			did_old = domain_id_iommu(info->domain, iommu);
2258 	} else {
2259 		did_old = context_domain_id(context);
2260 	}
2261 
2262 	context_clear_entry(context);
2263 	__iommu_flush_cache(iommu, context, sizeof(*context));
2264 	spin_unlock(&iommu->lock);
2265 	iommu->flush.flush_context(iommu,
2266 				   did_old,
2267 				   (((u16)bus) << 8) | devfn,
2268 				   DMA_CCMD_MASK_NOBIT,
2269 				   DMA_CCMD_DEVICE_INVL);
2270 
2271 	if (sm_supported(iommu))
2272 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2273 
2274 	iommu->flush.flush_iotlb(iommu,
2275 				 did_old,
2276 				 0,
2277 				 0,
2278 				 DMA_TLB_DSI_FLUSH);
2279 
2280 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2281 }
2282 
2283 static int domain_setup_first_level(struct intel_iommu *iommu,
2284 				    struct dmar_domain *domain,
2285 				    struct device *dev,
2286 				    u32 pasid)
2287 {
2288 	struct dma_pte *pgd = domain->pgd;
2289 	int agaw, level;
2290 	int flags = 0;
2291 
2292 	/*
2293 	 * Skip top levels of page tables for iommu which has
2294 	 * less agaw than default. Unnecessary for PT mode.
2295 	 */
2296 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2297 		pgd = phys_to_virt(dma_pte_addr(pgd));
2298 		if (!dma_pte_present(pgd))
2299 			return -ENOMEM;
2300 	}
2301 
2302 	level = agaw_to_level(agaw);
2303 	if (level != 4 && level != 5)
2304 		return -EINVAL;
2305 
2306 	if (level == 5)
2307 		flags |= PASID_FLAG_FL5LP;
2308 
2309 	if (domain->force_snooping)
2310 		flags |= PASID_FLAG_PAGE_SNOOP;
2311 
2312 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2313 					     domain_id_iommu(domain, iommu),
2314 					     flags);
2315 }
2316 
2317 static bool dev_is_real_dma_subdevice(struct device *dev)
2318 {
2319 	return dev && dev_is_pci(dev) &&
2320 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2321 }
2322 
2323 static int iommu_domain_identity_map(struct dmar_domain *domain,
2324 				     unsigned long first_vpfn,
2325 				     unsigned long last_vpfn)
2326 {
2327 	/*
2328 	 * RMRR range might have overlap with physical memory range,
2329 	 * clear it first
2330 	 */
2331 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2332 
2333 	return __domain_mapping(domain, first_vpfn,
2334 				first_vpfn, last_vpfn - first_vpfn + 1,
2335 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2336 }
2337 
2338 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2339 
2340 static int __init si_domain_init(int hw)
2341 {
2342 	struct dmar_rmrr_unit *rmrr;
2343 	struct device *dev;
2344 	int i, nid, ret;
2345 
2346 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2347 	if (!si_domain)
2348 		return -EFAULT;
2349 
2350 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2351 		domain_exit(si_domain);
2352 		si_domain = NULL;
2353 		return -EFAULT;
2354 	}
2355 
2356 	if (hw)
2357 		return 0;
2358 
2359 	for_each_online_node(nid) {
2360 		unsigned long start_pfn, end_pfn;
2361 		int i;
2362 
2363 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2364 			ret = iommu_domain_identity_map(si_domain,
2365 					mm_to_dma_pfn(start_pfn),
2366 					mm_to_dma_pfn(end_pfn));
2367 			if (ret)
2368 				return ret;
2369 		}
2370 	}
2371 
2372 	/*
2373 	 * Identity map the RMRRs so that devices with RMRRs could also use
2374 	 * the si_domain.
2375 	 */
2376 	for_each_rmrr_units(rmrr) {
2377 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2378 					  i, dev) {
2379 			unsigned long long start = rmrr->base_address;
2380 			unsigned long long end = rmrr->end_address;
2381 
2382 			if (WARN_ON(end < start ||
2383 				    end >> agaw_to_width(si_domain->agaw)))
2384 				continue;
2385 
2386 			ret = iommu_domain_identity_map(si_domain,
2387 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2388 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2389 			if (ret)
2390 				return ret;
2391 		}
2392 	}
2393 
2394 	return 0;
2395 }
2396 
2397 static int dmar_domain_attach_device(struct dmar_domain *domain,
2398 				     struct device *dev)
2399 {
2400 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2401 	struct intel_iommu *iommu;
2402 	unsigned long flags;
2403 	u8 bus, devfn;
2404 	int ret;
2405 
2406 	iommu = device_to_iommu(dev, &bus, &devfn);
2407 	if (!iommu)
2408 		return -ENODEV;
2409 
2410 	ret = domain_attach_iommu(domain, iommu);
2411 	if (ret)
2412 		return ret;
2413 	info->domain = domain;
2414 	spin_lock_irqsave(&domain->lock, flags);
2415 	list_add(&info->link, &domain->devices);
2416 	spin_unlock_irqrestore(&domain->lock, flags);
2417 
2418 	/* PASID table is mandatory for a PCI device in scalable mode. */
2419 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2420 		/* Setup the PASID entry for requests without PASID: */
2421 		if (hw_pass_through && domain_type_is_si(domain))
2422 			ret = intel_pasid_setup_pass_through(iommu, domain,
2423 					dev, PASID_RID2PASID);
2424 		else if (domain->use_first_level)
2425 			ret = domain_setup_first_level(iommu, domain, dev,
2426 					PASID_RID2PASID);
2427 		else
2428 			ret = intel_pasid_setup_second_level(iommu, domain,
2429 					dev, PASID_RID2PASID);
2430 		if (ret) {
2431 			dev_err(dev, "Setup RID2PASID failed\n");
2432 			device_block_translation(dev);
2433 			return ret;
2434 		}
2435 	}
2436 
2437 	ret = domain_context_mapping(domain, dev);
2438 	if (ret) {
2439 		dev_err(dev, "Domain context map failed\n");
2440 		device_block_translation(dev);
2441 		return ret;
2442 	}
2443 
2444 	iommu_enable_pci_caps(info);
2445 
2446 	return 0;
2447 }
2448 
2449 static bool device_has_rmrr(struct device *dev)
2450 {
2451 	struct dmar_rmrr_unit *rmrr;
2452 	struct device *tmp;
2453 	int i;
2454 
2455 	rcu_read_lock();
2456 	for_each_rmrr_units(rmrr) {
2457 		/*
2458 		 * Return TRUE if this RMRR contains the device that
2459 		 * is passed in.
2460 		 */
2461 		for_each_active_dev_scope(rmrr->devices,
2462 					  rmrr->devices_cnt, i, tmp)
2463 			if (tmp == dev ||
2464 			    is_downstream_to_pci_bridge(dev, tmp)) {
2465 				rcu_read_unlock();
2466 				return true;
2467 			}
2468 	}
2469 	rcu_read_unlock();
2470 	return false;
2471 }
2472 
2473 /**
2474  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2475  * is relaxable (ie. is allowed to be not enforced under some conditions)
2476  * @dev: device handle
2477  *
2478  * We assume that PCI USB devices with RMRRs have them largely
2479  * for historical reasons and that the RMRR space is not actively used post
2480  * boot.  This exclusion may change if vendors begin to abuse it.
2481  *
2482  * The same exception is made for graphics devices, with the requirement that
2483  * any use of the RMRR regions will be torn down before assigning the device
2484  * to a guest.
2485  *
2486  * Return: true if the RMRR is relaxable, false otherwise
2487  */
2488 static bool device_rmrr_is_relaxable(struct device *dev)
2489 {
2490 	struct pci_dev *pdev;
2491 
2492 	if (!dev_is_pci(dev))
2493 		return false;
2494 
2495 	pdev = to_pci_dev(dev);
2496 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2497 		return true;
2498 	else
2499 		return false;
2500 }
2501 
2502 /*
2503  * There are a couple cases where we need to restrict the functionality of
2504  * devices associated with RMRRs.  The first is when evaluating a device for
2505  * identity mapping because problems exist when devices are moved in and out
2506  * of domains and their respective RMRR information is lost.  This means that
2507  * a device with associated RMRRs will never be in a "passthrough" domain.
2508  * The second is use of the device through the IOMMU API.  This interface
2509  * expects to have full control of the IOVA space for the device.  We cannot
2510  * satisfy both the requirement that RMRR access is maintained and have an
2511  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2512  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2513  * We therefore prevent devices associated with an RMRR from participating in
2514  * the IOMMU API, which eliminates them from device assignment.
2515  *
2516  * In both cases, devices which have relaxable RMRRs are not concerned by this
2517  * restriction. See device_rmrr_is_relaxable comment.
2518  */
2519 static bool device_is_rmrr_locked(struct device *dev)
2520 {
2521 	if (!device_has_rmrr(dev))
2522 		return false;
2523 
2524 	if (device_rmrr_is_relaxable(dev))
2525 		return false;
2526 
2527 	return true;
2528 }
2529 
2530 /*
2531  * Return the required default domain type for a specific device.
2532  *
2533  * @dev: the device in query
2534  * @startup: true if this is during early boot
2535  *
2536  * Returns:
2537  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2538  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2539  *  - 0: both identity and dynamic domains work for this device
2540  */
2541 static int device_def_domain_type(struct device *dev)
2542 {
2543 	if (dev_is_pci(dev)) {
2544 		struct pci_dev *pdev = to_pci_dev(dev);
2545 
2546 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2547 			return IOMMU_DOMAIN_IDENTITY;
2548 
2549 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2550 			return IOMMU_DOMAIN_IDENTITY;
2551 	}
2552 
2553 	return 0;
2554 }
2555 
2556 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2557 {
2558 	/*
2559 	 * Start from the sane iommu hardware state.
2560 	 * If the queued invalidation is already initialized by us
2561 	 * (for example, while enabling interrupt-remapping) then
2562 	 * we got the things already rolling from a sane state.
2563 	 */
2564 	if (!iommu->qi) {
2565 		/*
2566 		 * Clear any previous faults.
2567 		 */
2568 		dmar_fault(-1, iommu);
2569 		/*
2570 		 * Disable queued invalidation if supported and already enabled
2571 		 * before OS handover.
2572 		 */
2573 		dmar_disable_qi(iommu);
2574 	}
2575 
2576 	if (dmar_enable_qi(iommu)) {
2577 		/*
2578 		 * Queued Invalidate not enabled, use Register Based Invalidate
2579 		 */
2580 		iommu->flush.flush_context = __iommu_flush_context;
2581 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2582 		pr_info("%s: Using Register based invalidation\n",
2583 			iommu->name);
2584 	} else {
2585 		iommu->flush.flush_context = qi_flush_context;
2586 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2587 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2588 	}
2589 }
2590 
2591 static int copy_context_table(struct intel_iommu *iommu,
2592 			      struct root_entry *old_re,
2593 			      struct context_entry **tbl,
2594 			      int bus, bool ext)
2595 {
2596 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2597 	struct context_entry *new_ce = NULL, ce;
2598 	struct context_entry *old_ce = NULL;
2599 	struct root_entry re;
2600 	phys_addr_t old_ce_phys;
2601 
2602 	tbl_idx = ext ? bus * 2 : bus;
2603 	memcpy(&re, old_re, sizeof(re));
2604 
2605 	for (devfn = 0; devfn < 256; devfn++) {
2606 		/* First calculate the correct index */
2607 		idx = (ext ? devfn * 2 : devfn) % 256;
2608 
2609 		if (idx == 0) {
2610 			/* First save what we may have and clean up */
2611 			if (new_ce) {
2612 				tbl[tbl_idx] = new_ce;
2613 				__iommu_flush_cache(iommu, new_ce,
2614 						    VTD_PAGE_SIZE);
2615 				pos = 1;
2616 			}
2617 
2618 			if (old_ce)
2619 				memunmap(old_ce);
2620 
2621 			ret = 0;
2622 			if (devfn < 0x80)
2623 				old_ce_phys = root_entry_lctp(&re);
2624 			else
2625 				old_ce_phys = root_entry_uctp(&re);
2626 
2627 			if (!old_ce_phys) {
2628 				if (ext && devfn == 0) {
2629 					/* No LCTP, try UCTP */
2630 					devfn = 0x7f;
2631 					continue;
2632 				} else {
2633 					goto out;
2634 				}
2635 			}
2636 
2637 			ret = -ENOMEM;
2638 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2639 					MEMREMAP_WB);
2640 			if (!old_ce)
2641 				goto out;
2642 
2643 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2644 			if (!new_ce)
2645 				goto out_unmap;
2646 
2647 			ret = 0;
2648 		}
2649 
2650 		/* Now copy the context entry */
2651 		memcpy(&ce, old_ce + idx, sizeof(ce));
2652 
2653 		if (!context_present(&ce))
2654 			continue;
2655 
2656 		did = context_domain_id(&ce);
2657 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2658 			set_bit(did, iommu->domain_ids);
2659 
2660 		set_context_copied(iommu, bus, devfn);
2661 		new_ce[idx] = ce;
2662 	}
2663 
2664 	tbl[tbl_idx + pos] = new_ce;
2665 
2666 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2667 
2668 out_unmap:
2669 	memunmap(old_ce);
2670 
2671 out:
2672 	return ret;
2673 }
2674 
2675 static int copy_translation_tables(struct intel_iommu *iommu)
2676 {
2677 	struct context_entry **ctxt_tbls;
2678 	struct root_entry *old_rt;
2679 	phys_addr_t old_rt_phys;
2680 	int ctxt_table_entries;
2681 	u64 rtaddr_reg;
2682 	int bus, ret;
2683 	bool new_ext, ext;
2684 
2685 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2686 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2687 	new_ext    = !!sm_supported(iommu);
2688 
2689 	/*
2690 	 * The RTT bit can only be changed when translation is disabled,
2691 	 * but disabling translation means to open a window for data
2692 	 * corruption. So bail out and don't copy anything if we would
2693 	 * have to change the bit.
2694 	 */
2695 	if (new_ext != ext)
2696 		return -EINVAL;
2697 
2698 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2699 	if (!iommu->copied_tables)
2700 		return -ENOMEM;
2701 
2702 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2703 	if (!old_rt_phys)
2704 		return -EINVAL;
2705 
2706 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2707 	if (!old_rt)
2708 		return -ENOMEM;
2709 
2710 	/* This is too big for the stack - allocate it from slab */
2711 	ctxt_table_entries = ext ? 512 : 256;
2712 	ret = -ENOMEM;
2713 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2714 	if (!ctxt_tbls)
2715 		goto out_unmap;
2716 
2717 	for (bus = 0; bus < 256; bus++) {
2718 		ret = copy_context_table(iommu, &old_rt[bus],
2719 					 ctxt_tbls, bus, ext);
2720 		if (ret) {
2721 			pr_err("%s: Failed to copy context table for bus %d\n",
2722 				iommu->name, bus);
2723 			continue;
2724 		}
2725 	}
2726 
2727 	spin_lock(&iommu->lock);
2728 
2729 	/* Context tables are copied, now write them to the root_entry table */
2730 	for (bus = 0; bus < 256; bus++) {
2731 		int idx = ext ? bus * 2 : bus;
2732 		u64 val;
2733 
2734 		if (ctxt_tbls[idx]) {
2735 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2736 			iommu->root_entry[bus].lo = val;
2737 		}
2738 
2739 		if (!ext || !ctxt_tbls[idx + 1])
2740 			continue;
2741 
2742 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2743 		iommu->root_entry[bus].hi = val;
2744 	}
2745 
2746 	spin_unlock(&iommu->lock);
2747 
2748 	kfree(ctxt_tbls);
2749 
2750 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2751 
2752 	ret = 0;
2753 
2754 out_unmap:
2755 	memunmap(old_rt);
2756 
2757 	return ret;
2758 }
2759 
2760 static int __init init_dmars(void)
2761 {
2762 	struct dmar_drhd_unit *drhd;
2763 	struct intel_iommu *iommu;
2764 	int ret;
2765 
2766 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2767 	if (ret)
2768 		goto free_iommu;
2769 
2770 	for_each_iommu(iommu, drhd) {
2771 		if (drhd->ignored) {
2772 			iommu_disable_translation(iommu);
2773 			continue;
2774 		}
2775 
2776 		/*
2777 		 * Find the max pasid size of all IOMMU's in the system.
2778 		 * We need to ensure the system pasid table is no bigger
2779 		 * than the smallest supported.
2780 		 */
2781 		if (pasid_supported(iommu)) {
2782 			u32 temp = 2 << ecap_pss(iommu->ecap);
2783 
2784 			intel_pasid_max_id = min_t(u32, temp,
2785 						   intel_pasid_max_id);
2786 		}
2787 
2788 		intel_iommu_init_qi(iommu);
2789 
2790 		ret = iommu_init_domains(iommu);
2791 		if (ret)
2792 			goto free_iommu;
2793 
2794 		init_translation_status(iommu);
2795 
2796 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2797 			iommu_disable_translation(iommu);
2798 			clear_translation_pre_enabled(iommu);
2799 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2800 				iommu->name);
2801 		}
2802 
2803 		/*
2804 		 * TBD:
2805 		 * we could share the same root & context tables
2806 		 * among all IOMMU's. Need to Split it later.
2807 		 */
2808 		ret = iommu_alloc_root_entry(iommu);
2809 		if (ret)
2810 			goto free_iommu;
2811 
2812 		if (translation_pre_enabled(iommu)) {
2813 			pr_info("Translation already enabled - trying to copy translation structures\n");
2814 
2815 			ret = copy_translation_tables(iommu);
2816 			if (ret) {
2817 				/*
2818 				 * We found the IOMMU with translation
2819 				 * enabled - but failed to copy over the
2820 				 * old root-entry table. Try to proceed
2821 				 * by disabling translation now and
2822 				 * allocating a clean root-entry table.
2823 				 * This might cause DMAR faults, but
2824 				 * probably the dump will still succeed.
2825 				 */
2826 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2827 				       iommu->name);
2828 				iommu_disable_translation(iommu);
2829 				clear_translation_pre_enabled(iommu);
2830 			} else {
2831 				pr_info("Copied translation tables from previous kernel for %s\n",
2832 					iommu->name);
2833 			}
2834 		}
2835 
2836 		if (!ecap_pass_through(iommu->ecap))
2837 			hw_pass_through = 0;
2838 		intel_svm_check(iommu);
2839 	}
2840 
2841 	/*
2842 	 * Now that qi is enabled on all iommus, set the root entry and flush
2843 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2844 	 * flush_context function will loop forever and the boot hangs.
2845 	 */
2846 	for_each_active_iommu(iommu, drhd) {
2847 		iommu_flush_write_buffer(iommu);
2848 		iommu_set_root_entry(iommu);
2849 	}
2850 
2851 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2852 	dmar_map_gfx = 0;
2853 #endif
2854 
2855 	if (!dmar_map_gfx)
2856 		iommu_identity_mapping |= IDENTMAP_GFX;
2857 
2858 	check_tylersburg_isoch();
2859 
2860 	ret = si_domain_init(hw_pass_through);
2861 	if (ret)
2862 		goto free_iommu;
2863 
2864 	/*
2865 	 * for each drhd
2866 	 *   enable fault log
2867 	 *   global invalidate context cache
2868 	 *   global invalidate iotlb
2869 	 *   enable translation
2870 	 */
2871 	for_each_iommu(iommu, drhd) {
2872 		if (drhd->ignored) {
2873 			/*
2874 			 * we always have to disable PMRs or DMA may fail on
2875 			 * this device
2876 			 */
2877 			if (force_on)
2878 				iommu_disable_protect_mem_regions(iommu);
2879 			continue;
2880 		}
2881 
2882 		iommu_flush_write_buffer(iommu);
2883 
2884 #ifdef CONFIG_INTEL_IOMMU_SVM
2885 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2886 			/*
2887 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2888 			 * could cause possible lock race condition.
2889 			 */
2890 			up_write(&dmar_global_lock);
2891 			ret = intel_svm_enable_prq(iommu);
2892 			down_write(&dmar_global_lock);
2893 			if (ret)
2894 				goto free_iommu;
2895 		}
2896 #endif
2897 		ret = dmar_set_interrupt(iommu);
2898 		if (ret)
2899 			goto free_iommu;
2900 	}
2901 
2902 	return 0;
2903 
2904 free_iommu:
2905 	for_each_active_iommu(iommu, drhd) {
2906 		disable_dmar_iommu(iommu);
2907 		free_dmar_iommu(iommu);
2908 	}
2909 	if (si_domain) {
2910 		domain_exit(si_domain);
2911 		si_domain = NULL;
2912 	}
2913 
2914 	return ret;
2915 }
2916 
2917 static void __init init_no_remapping_devices(void)
2918 {
2919 	struct dmar_drhd_unit *drhd;
2920 	struct device *dev;
2921 	int i;
2922 
2923 	for_each_drhd_unit(drhd) {
2924 		if (!drhd->include_all) {
2925 			for_each_active_dev_scope(drhd->devices,
2926 						  drhd->devices_cnt, i, dev)
2927 				break;
2928 			/* ignore DMAR unit if no devices exist */
2929 			if (i == drhd->devices_cnt)
2930 				drhd->ignored = 1;
2931 		}
2932 	}
2933 
2934 	for_each_active_drhd_unit(drhd) {
2935 		if (drhd->include_all)
2936 			continue;
2937 
2938 		for_each_active_dev_scope(drhd->devices,
2939 					  drhd->devices_cnt, i, dev)
2940 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2941 				break;
2942 		if (i < drhd->devices_cnt)
2943 			continue;
2944 
2945 		/* This IOMMU has *only* gfx devices. Either bypass it or
2946 		   set the gfx_mapped flag, as appropriate */
2947 		drhd->gfx_dedicated = 1;
2948 		if (!dmar_map_gfx)
2949 			drhd->ignored = 1;
2950 	}
2951 }
2952 
2953 #ifdef CONFIG_SUSPEND
2954 static int init_iommu_hw(void)
2955 {
2956 	struct dmar_drhd_unit *drhd;
2957 	struct intel_iommu *iommu = NULL;
2958 	int ret;
2959 
2960 	for_each_active_iommu(iommu, drhd) {
2961 		if (iommu->qi) {
2962 			ret = dmar_reenable_qi(iommu);
2963 			if (ret)
2964 				return ret;
2965 		}
2966 	}
2967 
2968 	for_each_iommu(iommu, drhd) {
2969 		if (drhd->ignored) {
2970 			/*
2971 			 * we always have to disable PMRs or DMA may fail on
2972 			 * this device
2973 			 */
2974 			if (force_on)
2975 				iommu_disable_protect_mem_regions(iommu);
2976 			continue;
2977 		}
2978 
2979 		iommu_flush_write_buffer(iommu);
2980 		iommu_set_root_entry(iommu);
2981 		iommu_enable_translation(iommu);
2982 		iommu_disable_protect_mem_regions(iommu);
2983 	}
2984 
2985 	return 0;
2986 }
2987 
2988 static void iommu_flush_all(void)
2989 {
2990 	struct dmar_drhd_unit *drhd;
2991 	struct intel_iommu *iommu;
2992 
2993 	for_each_active_iommu(iommu, drhd) {
2994 		iommu->flush.flush_context(iommu, 0, 0, 0,
2995 					   DMA_CCMD_GLOBAL_INVL);
2996 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2997 					 DMA_TLB_GLOBAL_FLUSH);
2998 	}
2999 }
3000 
3001 static int iommu_suspend(void)
3002 {
3003 	struct dmar_drhd_unit *drhd;
3004 	struct intel_iommu *iommu = NULL;
3005 	unsigned long flag;
3006 
3007 	for_each_active_iommu(iommu, drhd) {
3008 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3009 					     GFP_KERNEL);
3010 		if (!iommu->iommu_state)
3011 			goto nomem;
3012 	}
3013 
3014 	iommu_flush_all();
3015 
3016 	for_each_active_iommu(iommu, drhd) {
3017 		iommu_disable_translation(iommu);
3018 
3019 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3020 
3021 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3022 			readl(iommu->reg + DMAR_FECTL_REG);
3023 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3024 			readl(iommu->reg + DMAR_FEDATA_REG);
3025 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3026 			readl(iommu->reg + DMAR_FEADDR_REG);
3027 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3028 			readl(iommu->reg + DMAR_FEUADDR_REG);
3029 
3030 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3031 	}
3032 	return 0;
3033 
3034 nomem:
3035 	for_each_active_iommu(iommu, drhd)
3036 		kfree(iommu->iommu_state);
3037 
3038 	return -ENOMEM;
3039 }
3040 
3041 static void iommu_resume(void)
3042 {
3043 	struct dmar_drhd_unit *drhd;
3044 	struct intel_iommu *iommu = NULL;
3045 	unsigned long flag;
3046 
3047 	if (init_iommu_hw()) {
3048 		if (force_on)
3049 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3050 		else
3051 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3052 		return;
3053 	}
3054 
3055 	for_each_active_iommu(iommu, drhd) {
3056 
3057 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3058 
3059 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3060 			iommu->reg + DMAR_FECTL_REG);
3061 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3062 			iommu->reg + DMAR_FEDATA_REG);
3063 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3064 			iommu->reg + DMAR_FEADDR_REG);
3065 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3066 			iommu->reg + DMAR_FEUADDR_REG);
3067 
3068 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3069 	}
3070 
3071 	for_each_active_iommu(iommu, drhd)
3072 		kfree(iommu->iommu_state);
3073 }
3074 
3075 static struct syscore_ops iommu_syscore_ops = {
3076 	.resume		= iommu_resume,
3077 	.suspend	= iommu_suspend,
3078 };
3079 
3080 static void __init init_iommu_pm_ops(void)
3081 {
3082 	register_syscore_ops(&iommu_syscore_ops);
3083 }
3084 
3085 #else
3086 static inline void init_iommu_pm_ops(void) {}
3087 #endif	/* CONFIG_PM */
3088 
3089 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3090 {
3091 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3092 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3093 	    rmrr->end_address <= rmrr->base_address ||
3094 	    arch_rmrr_sanity_check(rmrr))
3095 		return -EINVAL;
3096 
3097 	return 0;
3098 }
3099 
3100 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3101 {
3102 	struct acpi_dmar_reserved_memory *rmrr;
3103 	struct dmar_rmrr_unit *rmrru;
3104 
3105 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3106 	if (rmrr_sanity_check(rmrr)) {
3107 		pr_warn(FW_BUG
3108 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3109 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3110 			   rmrr->base_address, rmrr->end_address,
3111 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3112 			   dmi_get_system_info(DMI_BIOS_VERSION),
3113 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3114 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3115 	}
3116 
3117 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3118 	if (!rmrru)
3119 		goto out;
3120 
3121 	rmrru->hdr = header;
3122 
3123 	rmrru->base_address = rmrr->base_address;
3124 	rmrru->end_address = rmrr->end_address;
3125 
3126 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3127 				((void *)rmrr) + rmrr->header.length,
3128 				&rmrru->devices_cnt);
3129 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3130 		goto free_rmrru;
3131 
3132 	list_add(&rmrru->list, &dmar_rmrr_units);
3133 
3134 	return 0;
3135 free_rmrru:
3136 	kfree(rmrru);
3137 out:
3138 	return -ENOMEM;
3139 }
3140 
3141 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3142 {
3143 	struct dmar_atsr_unit *atsru;
3144 	struct acpi_dmar_atsr *tmp;
3145 
3146 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3147 				dmar_rcu_check()) {
3148 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3149 		if (atsr->segment != tmp->segment)
3150 			continue;
3151 		if (atsr->header.length != tmp->header.length)
3152 			continue;
3153 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3154 			return atsru;
3155 	}
3156 
3157 	return NULL;
3158 }
3159 
3160 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3161 {
3162 	struct acpi_dmar_atsr *atsr;
3163 	struct dmar_atsr_unit *atsru;
3164 
3165 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3166 		return 0;
3167 
3168 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3169 	atsru = dmar_find_atsr(atsr);
3170 	if (atsru)
3171 		return 0;
3172 
3173 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3174 	if (!atsru)
3175 		return -ENOMEM;
3176 
3177 	/*
3178 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3179 	 * copy the memory content because the memory buffer will be freed
3180 	 * on return.
3181 	 */
3182 	atsru->hdr = (void *)(atsru + 1);
3183 	memcpy(atsru->hdr, hdr, hdr->length);
3184 	atsru->include_all = atsr->flags & 0x1;
3185 	if (!atsru->include_all) {
3186 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3187 				(void *)atsr + atsr->header.length,
3188 				&atsru->devices_cnt);
3189 		if (atsru->devices_cnt && atsru->devices == NULL) {
3190 			kfree(atsru);
3191 			return -ENOMEM;
3192 		}
3193 	}
3194 
3195 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3196 
3197 	return 0;
3198 }
3199 
3200 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3201 {
3202 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3203 	kfree(atsru);
3204 }
3205 
3206 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3207 {
3208 	struct acpi_dmar_atsr *atsr;
3209 	struct dmar_atsr_unit *atsru;
3210 
3211 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3212 	atsru = dmar_find_atsr(atsr);
3213 	if (atsru) {
3214 		list_del_rcu(&atsru->list);
3215 		synchronize_rcu();
3216 		intel_iommu_free_atsr(atsru);
3217 	}
3218 
3219 	return 0;
3220 }
3221 
3222 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3223 {
3224 	int i;
3225 	struct device *dev;
3226 	struct acpi_dmar_atsr *atsr;
3227 	struct dmar_atsr_unit *atsru;
3228 
3229 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3230 	atsru = dmar_find_atsr(atsr);
3231 	if (!atsru)
3232 		return 0;
3233 
3234 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3235 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3236 					  i, dev)
3237 			return -EBUSY;
3238 	}
3239 
3240 	return 0;
3241 }
3242 
3243 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3244 {
3245 	struct dmar_satc_unit *satcu;
3246 	struct acpi_dmar_satc *tmp;
3247 
3248 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3249 				dmar_rcu_check()) {
3250 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3251 		if (satc->segment != tmp->segment)
3252 			continue;
3253 		if (satc->header.length != tmp->header.length)
3254 			continue;
3255 		if (memcmp(satc, tmp, satc->header.length) == 0)
3256 			return satcu;
3257 	}
3258 
3259 	return NULL;
3260 }
3261 
3262 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3263 {
3264 	struct acpi_dmar_satc *satc;
3265 	struct dmar_satc_unit *satcu;
3266 
3267 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3268 		return 0;
3269 
3270 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3271 	satcu = dmar_find_satc(satc);
3272 	if (satcu)
3273 		return 0;
3274 
3275 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3276 	if (!satcu)
3277 		return -ENOMEM;
3278 
3279 	satcu->hdr = (void *)(satcu + 1);
3280 	memcpy(satcu->hdr, hdr, hdr->length);
3281 	satcu->atc_required = satc->flags & 0x1;
3282 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3283 					      (void *)satc + satc->header.length,
3284 					      &satcu->devices_cnt);
3285 	if (satcu->devices_cnt && !satcu->devices) {
3286 		kfree(satcu);
3287 		return -ENOMEM;
3288 	}
3289 	list_add_rcu(&satcu->list, &dmar_satc_units);
3290 
3291 	return 0;
3292 }
3293 
3294 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3295 {
3296 	int sp, ret;
3297 	struct intel_iommu *iommu = dmaru->iommu;
3298 
3299 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3300 	if (ret)
3301 		goto out;
3302 
3303 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3304 		pr_warn("%s: Doesn't support hardware pass through.\n",
3305 			iommu->name);
3306 		return -ENXIO;
3307 	}
3308 
3309 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3310 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3311 		pr_warn("%s: Doesn't support large page.\n",
3312 			iommu->name);
3313 		return -ENXIO;
3314 	}
3315 
3316 	/*
3317 	 * Disable translation if already enabled prior to OS handover.
3318 	 */
3319 	if (iommu->gcmd & DMA_GCMD_TE)
3320 		iommu_disable_translation(iommu);
3321 
3322 	ret = iommu_init_domains(iommu);
3323 	if (ret == 0)
3324 		ret = iommu_alloc_root_entry(iommu);
3325 	if (ret)
3326 		goto out;
3327 
3328 	intel_svm_check(iommu);
3329 
3330 	if (dmaru->ignored) {
3331 		/*
3332 		 * we always have to disable PMRs or DMA may fail on this device
3333 		 */
3334 		if (force_on)
3335 			iommu_disable_protect_mem_regions(iommu);
3336 		return 0;
3337 	}
3338 
3339 	intel_iommu_init_qi(iommu);
3340 	iommu_flush_write_buffer(iommu);
3341 
3342 #ifdef CONFIG_INTEL_IOMMU_SVM
3343 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3344 		ret = intel_svm_enable_prq(iommu);
3345 		if (ret)
3346 			goto disable_iommu;
3347 	}
3348 #endif
3349 	ret = dmar_set_interrupt(iommu);
3350 	if (ret)
3351 		goto disable_iommu;
3352 
3353 	iommu_set_root_entry(iommu);
3354 	iommu_enable_translation(iommu);
3355 
3356 	iommu_disable_protect_mem_regions(iommu);
3357 	return 0;
3358 
3359 disable_iommu:
3360 	disable_dmar_iommu(iommu);
3361 out:
3362 	free_dmar_iommu(iommu);
3363 	return ret;
3364 }
3365 
3366 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3367 {
3368 	int ret = 0;
3369 	struct intel_iommu *iommu = dmaru->iommu;
3370 
3371 	if (!intel_iommu_enabled)
3372 		return 0;
3373 	if (iommu == NULL)
3374 		return -EINVAL;
3375 
3376 	if (insert) {
3377 		ret = intel_iommu_add(dmaru);
3378 	} else {
3379 		disable_dmar_iommu(iommu);
3380 		free_dmar_iommu(iommu);
3381 	}
3382 
3383 	return ret;
3384 }
3385 
3386 static void intel_iommu_free_dmars(void)
3387 {
3388 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3389 	struct dmar_atsr_unit *atsru, *atsr_n;
3390 	struct dmar_satc_unit *satcu, *satc_n;
3391 
3392 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3393 		list_del(&rmrru->list);
3394 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3395 		kfree(rmrru);
3396 	}
3397 
3398 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3399 		list_del(&atsru->list);
3400 		intel_iommu_free_atsr(atsru);
3401 	}
3402 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3403 		list_del(&satcu->list);
3404 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3405 		kfree(satcu);
3406 	}
3407 }
3408 
3409 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3410 {
3411 	struct dmar_satc_unit *satcu;
3412 	struct acpi_dmar_satc *satc;
3413 	struct device *tmp;
3414 	int i;
3415 
3416 	dev = pci_physfn(dev);
3417 	rcu_read_lock();
3418 
3419 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3420 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3421 		if (satc->segment != pci_domain_nr(dev->bus))
3422 			continue;
3423 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3424 			if (to_pci_dev(tmp) == dev)
3425 				goto out;
3426 	}
3427 	satcu = NULL;
3428 out:
3429 	rcu_read_unlock();
3430 	return satcu;
3431 }
3432 
3433 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3434 {
3435 	int i, ret = 1;
3436 	struct pci_bus *bus;
3437 	struct pci_dev *bridge = NULL;
3438 	struct device *tmp;
3439 	struct acpi_dmar_atsr *atsr;
3440 	struct dmar_atsr_unit *atsru;
3441 	struct dmar_satc_unit *satcu;
3442 
3443 	dev = pci_physfn(dev);
3444 	satcu = dmar_find_matched_satc_unit(dev);
3445 	if (satcu)
3446 		/*
3447 		 * This device supports ATS as it is in SATC table.
3448 		 * When IOMMU is in legacy mode, enabling ATS is done
3449 		 * automatically by HW for the device that requires
3450 		 * ATS, hence OS should not enable this device ATS
3451 		 * to avoid duplicated TLB invalidation.
3452 		 */
3453 		return !(satcu->atc_required && !sm_supported(iommu));
3454 
3455 	for (bus = dev->bus; bus; bus = bus->parent) {
3456 		bridge = bus->self;
3457 		/* If it's an integrated device, allow ATS */
3458 		if (!bridge)
3459 			return 1;
3460 		/* Connected via non-PCIe: no ATS */
3461 		if (!pci_is_pcie(bridge) ||
3462 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3463 			return 0;
3464 		/* If we found the root port, look it up in the ATSR */
3465 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3466 			break;
3467 	}
3468 
3469 	rcu_read_lock();
3470 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3471 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3472 		if (atsr->segment != pci_domain_nr(dev->bus))
3473 			continue;
3474 
3475 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3476 			if (tmp == &bridge->dev)
3477 				goto out;
3478 
3479 		if (atsru->include_all)
3480 			goto out;
3481 	}
3482 	ret = 0;
3483 out:
3484 	rcu_read_unlock();
3485 
3486 	return ret;
3487 }
3488 
3489 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3490 {
3491 	int ret;
3492 	struct dmar_rmrr_unit *rmrru;
3493 	struct dmar_atsr_unit *atsru;
3494 	struct dmar_satc_unit *satcu;
3495 	struct acpi_dmar_atsr *atsr;
3496 	struct acpi_dmar_reserved_memory *rmrr;
3497 	struct acpi_dmar_satc *satc;
3498 
3499 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3500 		return 0;
3501 
3502 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3503 		rmrr = container_of(rmrru->hdr,
3504 				    struct acpi_dmar_reserved_memory, header);
3505 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3506 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3507 				((void *)rmrr) + rmrr->header.length,
3508 				rmrr->segment, rmrru->devices,
3509 				rmrru->devices_cnt);
3510 			if (ret < 0)
3511 				return ret;
3512 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3513 			dmar_remove_dev_scope(info, rmrr->segment,
3514 				rmrru->devices, rmrru->devices_cnt);
3515 		}
3516 	}
3517 
3518 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3519 		if (atsru->include_all)
3520 			continue;
3521 
3522 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3523 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3524 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3525 					(void *)atsr + atsr->header.length,
3526 					atsr->segment, atsru->devices,
3527 					atsru->devices_cnt);
3528 			if (ret > 0)
3529 				break;
3530 			else if (ret < 0)
3531 				return ret;
3532 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3533 			if (dmar_remove_dev_scope(info, atsr->segment,
3534 					atsru->devices, atsru->devices_cnt))
3535 				break;
3536 		}
3537 	}
3538 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3539 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3540 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3541 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3542 					(void *)satc + satc->header.length,
3543 					satc->segment, satcu->devices,
3544 					satcu->devices_cnt);
3545 			if (ret > 0)
3546 				break;
3547 			else if (ret < 0)
3548 				return ret;
3549 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3550 			if (dmar_remove_dev_scope(info, satc->segment,
3551 					satcu->devices, satcu->devices_cnt))
3552 				break;
3553 		}
3554 	}
3555 
3556 	return 0;
3557 }
3558 
3559 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3560 				       unsigned long val, void *v)
3561 {
3562 	struct memory_notify *mhp = v;
3563 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3564 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3565 			mhp->nr_pages - 1);
3566 
3567 	switch (val) {
3568 	case MEM_GOING_ONLINE:
3569 		if (iommu_domain_identity_map(si_domain,
3570 					      start_vpfn, last_vpfn)) {
3571 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3572 				start_vpfn, last_vpfn);
3573 			return NOTIFY_BAD;
3574 		}
3575 		break;
3576 
3577 	case MEM_OFFLINE:
3578 	case MEM_CANCEL_ONLINE:
3579 		{
3580 			struct dmar_drhd_unit *drhd;
3581 			struct intel_iommu *iommu;
3582 			LIST_HEAD(freelist);
3583 
3584 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3585 
3586 			rcu_read_lock();
3587 			for_each_active_iommu(iommu, drhd)
3588 				iommu_flush_iotlb_psi(iommu, si_domain,
3589 					start_vpfn, mhp->nr_pages,
3590 					list_empty(&freelist), 0);
3591 			rcu_read_unlock();
3592 			put_pages_list(&freelist);
3593 		}
3594 		break;
3595 	}
3596 
3597 	return NOTIFY_OK;
3598 }
3599 
3600 static struct notifier_block intel_iommu_memory_nb = {
3601 	.notifier_call = intel_iommu_memory_notifier,
3602 	.priority = 0
3603 };
3604 
3605 static void intel_disable_iommus(void)
3606 {
3607 	struct intel_iommu *iommu = NULL;
3608 	struct dmar_drhd_unit *drhd;
3609 
3610 	for_each_iommu(iommu, drhd)
3611 		iommu_disable_translation(iommu);
3612 }
3613 
3614 void intel_iommu_shutdown(void)
3615 {
3616 	struct dmar_drhd_unit *drhd;
3617 	struct intel_iommu *iommu = NULL;
3618 
3619 	if (no_iommu || dmar_disabled)
3620 		return;
3621 
3622 	down_write(&dmar_global_lock);
3623 
3624 	/* Disable PMRs explicitly here. */
3625 	for_each_iommu(iommu, drhd)
3626 		iommu_disable_protect_mem_regions(iommu);
3627 
3628 	/* Make sure the IOMMUs are switched off */
3629 	intel_disable_iommus();
3630 
3631 	up_write(&dmar_global_lock);
3632 }
3633 
3634 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3635 {
3636 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3637 
3638 	return container_of(iommu_dev, struct intel_iommu, iommu);
3639 }
3640 
3641 static ssize_t version_show(struct device *dev,
3642 			    struct device_attribute *attr, char *buf)
3643 {
3644 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3645 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3646 	return sysfs_emit(buf, "%d:%d\n",
3647 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3648 }
3649 static DEVICE_ATTR_RO(version);
3650 
3651 static ssize_t address_show(struct device *dev,
3652 			    struct device_attribute *attr, char *buf)
3653 {
3654 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3655 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3656 }
3657 static DEVICE_ATTR_RO(address);
3658 
3659 static ssize_t cap_show(struct device *dev,
3660 			struct device_attribute *attr, char *buf)
3661 {
3662 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3663 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3664 }
3665 static DEVICE_ATTR_RO(cap);
3666 
3667 static ssize_t ecap_show(struct device *dev,
3668 			 struct device_attribute *attr, char *buf)
3669 {
3670 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3671 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3672 }
3673 static DEVICE_ATTR_RO(ecap);
3674 
3675 static ssize_t domains_supported_show(struct device *dev,
3676 				      struct device_attribute *attr, char *buf)
3677 {
3678 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3679 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3680 }
3681 static DEVICE_ATTR_RO(domains_supported);
3682 
3683 static ssize_t domains_used_show(struct device *dev,
3684 				 struct device_attribute *attr, char *buf)
3685 {
3686 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3687 	return sysfs_emit(buf, "%d\n",
3688 			  bitmap_weight(iommu->domain_ids,
3689 					cap_ndoms(iommu->cap)));
3690 }
3691 static DEVICE_ATTR_RO(domains_used);
3692 
3693 static struct attribute *intel_iommu_attrs[] = {
3694 	&dev_attr_version.attr,
3695 	&dev_attr_address.attr,
3696 	&dev_attr_cap.attr,
3697 	&dev_attr_ecap.attr,
3698 	&dev_attr_domains_supported.attr,
3699 	&dev_attr_domains_used.attr,
3700 	NULL,
3701 };
3702 
3703 static struct attribute_group intel_iommu_group = {
3704 	.name = "intel-iommu",
3705 	.attrs = intel_iommu_attrs,
3706 };
3707 
3708 const struct attribute_group *intel_iommu_groups[] = {
3709 	&intel_iommu_group,
3710 	NULL,
3711 };
3712 
3713 static inline bool has_external_pci(void)
3714 {
3715 	struct pci_dev *pdev = NULL;
3716 
3717 	for_each_pci_dev(pdev)
3718 		if (pdev->external_facing) {
3719 			pci_dev_put(pdev);
3720 			return true;
3721 		}
3722 
3723 	return false;
3724 }
3725 
3726 static int __init platform_optin_force_iommu(void)
3727 {
3728 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3729 		return 0;
3730 
3731 	if (no_iommu || dmar_disabled)
3732 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3733 
3734 	/*
3735 	 * If Intel-IOMMU is disabled by default, we will apply identity
3736 	 * map for all devices except those marked as being untrusted.
3737 	 */
3738 	if (dmar_disabled)
3739 		iommu_set_default_passthrough(false);
3740 
3741 	dmar_disabled = 0;
3742 	no_iommu = 0;
3743 
3744 	return 1;
3745 }
3746 
3747 static int __init probe_acpi_namespace_devices(void)
3748 {
3749 	struct dmar_drhd_unit *drhd;
3750 	/* To avoid a -Wunused-but-set-variable warning. */
3751 	struct intel_iommu *iommu __maybe_unused;
3752 	struct device *dev;
3753 	int i, ret = 0;
3754 
3755 	for_each_active_iommu(iommu, drhd) {
3756 		for_each_active_dev_scope(drhd->devices,
3757 					  drhd->devices_cnt, i, dev) {
3758 			struct acpi_device_physical_node *pn;
3759 			struct iommu_group *group;
3760 			struct acpi_device *adev;
3761 
3762 			if (dev->bus != &acpi_bus_type)
3763 				continue;
3764 
3765 			adev = to_acpi_device(dev);
3766 			mutex_lock(&adev->physical_node_lock);
3767 			list_for_each_entry(pn,
3768 					    &adev->physical_node_list, node) {
3769 				group = iommu_group_get(pn->dev);
3770 				if (group) {
3771 					iommu_group_put(group);
3772 					continue;
3773 				}
3774 
3775 				ret = iommu_probe_device(pn->dev);
3776 				if (ret)
3777 					break;
3778 			}
3779 			mutex_unlock(&adev->physical_node_lock);
3780 
3781 			if (ret)
3782 				return ret;
3783 		}
3784 	}
3785 
3786 	return 0;
3787 }
3788 
3789 static __init int tboot_force_iommu(void)
3790 {
3791 	if (!tboot_enabled())
3792 		return 0;
3793 
3794 	if (no_iommu || dmar_disabled)
3795 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3796 
3797 	dmar_disabled = 0;
3798 	no_iommu = 0;
3799 
3800 	return 1;
3801 }
3802 
3803 int __init intel_iommu_init(void)
3804 {
3805 	int ret = -ENODEV;
3806 	struct dmar_drhd_unit *drhd;
3807 	struct intel_iommu *iommu;
3808 
3809 	/*
3810 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3811 	 * opt in, so enforce that.
3812 	 */
3813 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3814 		    platform_optin_force_iommu();
3815 
3816 	down_write(&dmar_global_lock);
3817 	if (dmar_table_init()) {
3818 		if (force_on)
3819 			panic("tboot: Failed to initialize DMAR table\n");
3820 		goto out_free_dmar;
3821 	}
3822 
3823 	if (dmar_dev_scope_init() < 0) {
3824 		if (force_on)
3825 			panic("tboot: Failed to initialize DMAR device scope\n");
3826 		goto out_free_dmar;
3827 	}
3828 
3829 	up_write(&dmar_global_lock);
3830 
3831 	/*
3832 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3833 	 * complain later when we register it under the lock.
3834 	 */
3835 	dmar_register_bus_notifier();
3836 
3837 	down_write(&dmar_global_lock);
3838 
3839 	if (!no_iommu)
3840 		intel_iommu_debugfs_init();
3841 
3842 	if (no_iommu || dmar_disabled) {
3843 		/*
3844 		 * We exit the function here to ensure IOMMU's remapping and
3845 		 * mempool aren't setup, which means that the IOMMU's PMRs
3846 		 * won't be disabled via the call to init_dmars(). So disable
3847 		 * it explicitly here. The PMRs were setup by tboot prior to
3848 		 * calling SENTER, but the kernel is expected to reset/tear
3849 		 * down the PMRs.
3850 		 */
3851 		if (intel_iommu_tboot_noforce) {
3852 			for_each_iommu(iommu, drhd)
3853 				iommu_disable_protect_mem_regions(iommu);
3854 		}
3855 
3856 		/*
3857 		 * Make sure the IOMMUs are switched off, even when we
3858 		 * boot into a kexec kernel and the previous kernel left
3859 		 * them enabled
3860 		 */
3861 		intel_disable_iommus();
3862 		goto out_free_dmar;
3863 	}
3864 
3865 	if (list_empty(&dmar_rmrr_units))
3866 		pr_info("No RMRR found\n");
3867 
3868 	if (list_empty(&dmar_atsr_units))
3869 		pr_info("No ATSR found\n");
3870 
3871 	if (list_empty(&dmar_satc_units))
3872 		pr_info("No SATC found\n");
3873 
3874 	init_no_remapping_devices();
3875 
3876 	ret = init_dmars();
3877 	if (ret) {
3878 		if (force_on)
3879 			panic("tboot: Failed to initialize DMARs\n");
3880 		pr_err("Initialization failed\n");
3881 		goto out_free_dmar;
3882 	}
3883 	up_write(&dmar_global_lock);
3884 
3885 	init_iommu_pm_ops();
3886 
3887 	down_read(&dmar_global_lock);
3888 	for_each_active_iommu(iommu, drhd) {
3889 		/*
3890 		 * The flush queue implementation does not perform
3891 		 * page-selective invalidations that are required for efficient
3892 		 * TLB flushes in virtual environments.  The benefit of batching
3893 		 * is likely to be much lower than the overhead of synchronizing
3894 		 * the virtual and physical IOMMU page-tables.
3895 		 */
3896 		if (cap_caching_mode(iommu->cap) &&
3897 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3898 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3899 			iommu_set_dma_strict();
3900 		}
3901 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3902 				       intel_iommu_groups,
3903 				       "%s", iommu->name);
3904 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3905 
3906 		iommu_pmu_register(iommu);
3907 	}
3908 	up_read(&dmar_global_lock);
3909 
3910 	if (si_domain && !hw_pass_through)
3911 		register_memory_notifier(&intel_iommu_memory_nb);
3912 
3913 	down_read(&dmar_global_lock);
3914 	if (probe_acpi_namespace_devices())
3915 		pr_warn("ACPI name space devices didn't probe correctly\n");
3916 
3917 	/* Finally, we enable the DMA remapping hardware. */
3918 	for_each_iommu(iommu, drhd) {
3919 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3920 			iommu_enable_translation(iommu);
3921 
3922 		iommu_disable_protect_mem_regions(iommu);
3923 	}
3924 	up_read(&dmar_global_lock);
3925 
3926 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3927 
3928 	intel_iommu_enabled = 1;
3929 
3930 	return 0;
3931 
3932 out_free_dmar:
3933 	intel_iommu_free_dmars();
3934 	up_write(&dmar_global_lock);
3935 	return ret;
3936 }
3937 
3938 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3939 {
3940 	struct device_domain_info *info = opaque;
3941 
3942 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3943 	return 0;
3944 }
3945 
3946 /*
3947  * NB - intel-iommu lacks any sort of reference counting for the users of
3948  * dependent devices.  If multiple endpoints have intersecting dependent
3949  * devices, unbinding the driver from any one of them will possibly leave
3950  * the others unable to operate.
3951  */
3952 static void domain_context_clear(struct device_domain_info *info)
3953 {
3954 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3955 		return;
3956 
3957 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3958 			       &domain_context_clear_one_cb, info);
3959 }
3960 
3961 static void dmar_remove_one_dev_info(struct device *dev)
3962 {
3963 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3964 	struct dmar_domain *domain = info->domain;
3965 	struct intel_iommu *iommu = info->iommu;
3966 	unsigned long flags;
3967 
3968 	if (!dev_is_real_dma_subdevice(info->dev)) {
3969 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3970 			intel_pasid_tear_down_entry(iommu, info->dev,
3971 					PASID_RID2PASID, false);
3972 
3973 		iommu_disable_pci_caps(info);
3974 		domain_context_clear(info);
3975 	}
3976 
3977 	spin_lock_irqsave(&domain->lock, flags);
3978 	list_del(&info->link);
3979 	spin_unlock_irqrestore(&domain->lock, flags);
3980 
3981 	domain_detach_iommu(domain, iommu);
3982 	info->domain = NULL;
3983 }
3984 
3985 /*
3986  * Clear the page table pointer in context or pasid table entries so that
3987  * all DMA requests without PASID from the device are blocked. If the page
3988  * table has been set, clean up the data structures.
3989  */
3990 static void device_block_translation(struct device *dev)
3991 {
3992 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3993 	struct intel_iommu *iommu = info->iommu;
3994 	unsigned long flags;
3995 
3996 	iommu_disable_pci_caps(info);
3997 	if (!dev_is_real_dma_subdevice(dev)) {
3998 		if (sm_supported(iommu))
3999 			intel_pasid_tear_down_entry(iommu, dev,
4000 						    PASID_RID2PASID, false);
4001 		else
4002 			domain_context_clear(info);
4003 	}
4004 
4005 	if (!info->domain)
4006 		return;
4007 
4008 	spin_lock_irqsave(&info->domain->lock, flags);
4009 	list_del(&info->link);
4010 	spin_unlock_irqrestore(&info->domain->lock, flags);
4011 
4012 	domain_detach_iommu(info->domain, iommu);
4013 	info->domain = NULL;
4014 }
4015 
4016 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4017 {
4018 	int adjust_width;
4019 
4020 	/* calculate AGAW */
4021 	domain->gaw = guest_width;
4022 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4023 	domain->agaw = width_to_agaw(adjust_width);
4024 
4025 	domain->iommu_coherency = false;
4026 	domain->iommu_superpage = 0;
4027 	domain->max_addr = 0;
4028 
4029 	/* always allocate the top pgd */
4030 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4031 	if (!domain->pgd)
4032 		return -ENOMEM;
4033 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4034 	return 0;
4035 }
4036 
4037 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4038 				      struct device *dev)
4039 {
4040 	device_block_translation(dev);
4041 	return 0;
4042 }
4043 
4044 static struct iommu_domain blocking_domain = {
4045 	.ops = &(const struct iommu_domain_ops) {
4046 		.attach_dev	= blocking_domain_attach_dev,
4047 		.free		= intel_iommu_domain_free
4048 	}
4049 };
4050 
4051 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4052 {
4053 	struct dmar_domain *dmar_domain;
4054 	struct iommu_domain *domain;
4055 
4056 	switch (type) {
4057 	case IOMMU_DOMAIN_BLOCKED:
4058 		return &blocking_domain;
4059 	case IOMMU_DOMAIN_DMA:
4060 	case IOMMU_DOMAIN_UNMANAGED:
4061 		dmar_domain = alloc_domain(type);
4062 		if (!dmar_domain) {
4063 			pr_err("Can't allocate dmar_domain\n");
4064 			return NULL;
4065 		}
4066 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4067 			pr_err("Domain initialization failed\n");
4068 			domain_exit(dmar_domain);
4069 			return NULL;
4070 		}
4071 
4072 		domain = &dmar_domain->domain;
4073 		domain->geometry.aperture_start = 0;
4074 		domain->geometry.aperture_end   =
4075 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4076 		domain->geometry.force_aperture = true;
4077 
4078 		return domain;
4079 	case IOMMU_DOMAIN_IDENTITY:
4080 		return &si_domain->domain;
4081 	case IOMMU_DOMAIN_SVA:
4082 		return intel_svm_domain_alloc();
4083 	default:
4084 		return NULL;
4085 	}
4086 
4087 	return NULL;
4088 }
4089 
4090 static void intel_iommu_domain_free(struct iommu_domain *domain)
4091 {
4092 	if (domain != &si_domain->domain && domain != &blocking_domain)
4093 		domain_exit(to_dmar_domain(domain));
4094 }
4095 
4096 static int prepare_domain_attach_device(struct iommu_domain *domain,
4097 					struct device *dev)
4098 {
4099 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4100 	struct intel_iommu *iommu;
4101 	int addr_width;
4102 
4103 	iommu = device_to_iommu(dev, NULL, NULL);
4104 	if (!iommu)
4105 		return -ENODEV;
4106 
4107 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4108 		return -EINVAL;
4109 
4110 	/* check if this iommu agaw is sufficient for max mapped address */
4111 	addr_width = agaw_to_width(iommu->agaw);
4112 	if (addr_width > cap_mgaw(iommu->cap))
4113 		addr_width = cap_mgaw(iommu->cap);
4114 
4115 	if (dmar_domain->max_addr > (1LL << addr_width))
4116 		return -EINVAL;
4117 	dmar_domain->gaw = addr_width;
4118 
4119 	/*
4120 	 * Knock out extra levels of page tables if necessary
4121 	 */
4122 	while (iommu->agaw < dmar_domain->agaw) {
4123 		struct dma_pte *pte;
4124 
4125 		pte = dmar_domain->pgd;
4126 		if (dma_pte_present(pte)) {
4127 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4128 			free_pgtable_page(pte);
4129 		}
4130 		dmar_domain->agaw--;
4131 	}
4132 
4133 	return 0;
4134 }
4135 
4136 static int intel_iommu_attach_device(struct iommu_domain *domain,
4137 				     struct device *dev)
4138 {
4139 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4140 	int ret;
4141 
4142 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4143 	    device_is_rmrr_locked(dev)) {
4144 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4145 		return -EPERM;
4146 	}
4147 
4148 	if (info->domain)
4149 		device_block_translation(dev);
4150 
4151 	ret = prepare_domain_attach_device(domain, dev);
4152 	if (ret)
4153 		return ret;
4154 
4155 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4156 }
4157 
4158 static int intel_iommu_map(struct iommu_domain *domain,
4159 			   unsigned long iova, phys_addr_t hpa,
4160 			   size_t size, int iommu_prot, gfp_t gfp)
4161 {
4162 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4163 	u64 max_addr;
4164 	int prot = 0;
4165 
4166 	if (iommu_prot & IOMMU_READ)
4167 		prot |= DMA_PTE_READ;
4168 	if (iommu_prot & IOMMU_WRITE)
4169 		prot |= DMA_PTE_WRITE;
4170 	if (dmar_domain->set_pte_snp)
4171 		prot |= DMA_PTE_SNP;
4172 
4173 	max_addr = iova + size;
4174 	if (dmar_domain->max_addr < max_addr) {
4175 		u64 end;
4176 
4177 		/* check if minimum agaw is sufficient for mapped address */
4178 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4179 		if (end < max_addr) {
4180 			pr_err("%s: iommu width (%d) is not "
4181 			       "sufficient for the mapped address (%llx)\n",
4182 			       __func__, dmar_domain->gaw, max_addr);
4183 			return -EFAULT;
4184 		}
4185 		dmar_domain->max_addr = max_addr;
4186 	}
4187 	/* Round up size to next multiple of PAGE_SIZE, if it and
4188 	   the low bits of hpa would take us onto the next page */
4189 	size = aligned_nrpages(hpa, size);
4190 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4191 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4192 }
4193 
4194 static int intel_iommu_map_pages(struct iommu_domain *domain,
4195 				 unsigned long iova, phys_addr_t paddr,
4196 				 size_t pgsize, size_t pgcount,
4197 				 int prot, gfp_t gfp, size_t *mapped)
4198 {
4199 	unsigned long pgshift = __ffs(pgsize);
4200 	size_t size = pgcount << pgshift;
4201 	int ret;
4202 
4203 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4204 		return -EINVAL;
4205 
4206 	if (!IS_ALIGNED(iova | paddr, pgsize))
4207 		return -EINVAL;
4208 
4209 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4210 	if (!ret && mapped)
4211 		*mapped = size;
4212 
4213 	return ret;
4214 }
4215 
4216 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4217 				unsigned long iova, size_t size,
4218 				struct iommu_iotlb_gather *gather)
4219 {
4220 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4221 	unsigned long start_pfn, last_pfn;
4222 	int level = 0;
4223 
4224 	/* Cope with horrid API which requires us to unmap more than the
4225 	   size argument if it happens to be a large-page mapping. */
4226 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4227 				     &level, GFP_ATOMIC)))
4228 		return 0;
4229 
4230 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4231 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4232 
4233 	start_pfn = iova >> VTD_PAGE_SHIFT;
4234 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4235 
4236 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4237 
4238 	if (dmar_domain->max_addr == iova + size)
4239 		dmar_domain->max_addr = iova;
4240 
4241 	/*
4242 	 * We do not use page-selective IOTLB invalidation in flush queue,
4243 	 * so there is no need to track page and sync iotlb.
4244 	 */
4245 	if (!iommu_iotlb_gather_queued(gather))
4246 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4247 
4248 	return size;
4249 }
4250 
4251 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4252 				      unsigned long iova,
4253 				      size_t pgsize, size_t pgcount,
4254 				      struct iommu_iotlb_gather *gather)
4255 {
4256 	unsigned long pgshift = __ffs(pgsize);
4257 	size_t size = pgcount << pgshift;
4258 
4259 	return intel_iommu_unmap(domain, iova, size, gather);
4260 }
4261 
4262 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4263 				 struct iommu_iotlb_gather *gather)
4264 {
4265 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4266 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4267 	size_t size = gather->end - gather->start;
4268 	struct iommu_domain_info *info;
4269 	unsigned long start_pfn;
4270 	unsigned long nrpages;
4271 	unsigned long i;
4272 
4273 	nrpages = aligned_nrpages(gather->start, size);
4274 	start_pfn = mm_to_dma_pfn(iova_pfn);
4275 
4276 	xa_for_each(&dmar_domain->iommu_array, i, info)
4277 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4278 				      start_pfn, nrpages,
4279 				      list_empty(&gather->freelist), 0);
4280 
4281 	put_pages_list(&gather->freelist);
4282 }
4283 
4284 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4285 					    dma_addr_t iova)
4286 {
4287 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4288 	struct dma_pte *pte;
4289 	int level = 0;
4290 	u64 phys = 0;
4291 
4292 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4293 			     GFP_ATOMIC);
4294 	if (pte && dma_pte_present(pte))
4295 		phys = dma_pte_addr(pte) +
4296 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4297 						VTD_PAGE_SHIFT) - 1));
4298 
4299 	return phys;
4300 }
4301 
4302 static bool domain_support_force_snooping(struct dmar_domain *domain)
4303 {
4304 	struct device_domain_info *info;
4305 	bool support = true;
4306 
4307 	assert_spin_locked(&domain->lock);
4308 	list_for_each_entry(info, &domain->devices, link) {
4309 		if (!ecap_sc_support(info->iommu->ecap)) {
4310 			support = false;
4311 			break;
4312 		}
4313 	}
4314 
4315 	return support;
4316 }
4317 
4318 static void domain_set_force_snooping(struct dmar_domain *domain)
4319 {
4320 	struct device_domain_info *info;
4321 
4322 	assert_spin_locked(&domain->lock);
4323 	/*
4324 	 * Second level page table supports per-PTE snoop control. The
4325 	 * iommu_map() interface will handle this by setting SNP bit.
4326 	 */
4327 	if (!domain->use_first_level) {
4328 		domain->set_pte_snp = true;
4329 		return;
4330 	}
4331 
4332 	list_for_each_entry(info, &domain->devices, link)
4333 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4334 						     PASID_RID2PASID);
4335 }
4336 
4337 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4338 {
4339 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4340 	unsigned long flags;
4341 
4342 	if (dmar_domain->force_snooping)
4343 		return true;
4344 
4345 	spin_lock_irqsave(&dmar_domain->lock, flags);
4346 	if (!domain_support_force_snooping(dmar_domain)) {
4347 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4348 		return false;
4349 	}
4350 
4351 	domain_set_force_snooping(dmar_domain);
4352 	dmar_domain->force_snooping = true;
4353 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4354 
4355 	return true;
4356 }
4357 
4358 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4359 {
4360 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4361 
4362 	switch (cap) {
4363 	case IOMMU_CAP_CACHE_COHERENCY:
4364 	case IOMMU_CAP_DEFERRED_FLUSH:
4365 		return true;
4366 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4367 		return dmar_platform_optin();
4368 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4369 		return ecap_sc_support(info->iommu->ecap);
4370 	default:
4371 		return false;
4372 	}
4373 }
4374 
4375 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4376 {
4377 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4378 	struct device_domain_info *info;
4379 	struct intel_iommu *iommu;
4380 	u8 bus, devfn;
4381 	int ret;
4382 
4383 	iommu = device_to_iommu(dev, &bus, &devfn);
4384 	if (!iommu || !iommu->iommu.ops)
4385 		return ERR_PTR(-ENODEV);
4386 
4387 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4388 	if (!info)
4389 		return ERR_PTR(-ENOMEM);
4390 
4391 	if (dev_is_real_dma_subdevice(dev)) {
4392 		info->bus = pdev->bus->number;
4393 		info->devfn = pdev->devfn;
4394 		info->segment = pci_domain_nr(pdev->bus);
4395 	} else {
4396 		info->bus = bus;
4397 		info->devfn = devfn;
4398 		info->segment = iommu->segment;
4399 	}
4400 
4401 	info->dev = dev;
4402 	info->iommu = iommu;
4403 	if (dev_is_pci(dev)) {
4404 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4405 		    pci_ats_supported(pdev) &&
4406 		    dmar_ats_supported(pdev, iommu)) {
4407 			info->ats_supported = 1;
4408 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4409 
4410 			/*
4411 			 * For IOMMU that supports device IOTLB throttling
4412 			 * (DIT), we assign PFSID to the invalidation desc
4413 			 * of a VF such that IOMMU HW can gauge queue depth
4414 			 * at PF level. If DIT is not set, PFSID will be
4415 			 * treated as reserved, which should be set to 0.
4416 			 */
4417 			if (ecap_dit(iommu->ecap))
4418 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4419 			info->ats_qdep = pci_ats_queue_depth(pdev);
4420 		}
4421 		if (sm_supported(iommu)) {
4422 			if (pasid_supported(iommu)) {
4423 				int features = pci_pasid_features(pdev);
4424 
4425 				if (features >= 0)
4426 					info->pasid_supported = features | 1;
4427 			}
4428 
4429 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4430 			    pci_pri_supported(pdev))
4431 				info->pri_supported = 1;
4432 		}
4433 	}
4434 
4435 	dev_iommu_priv_set(dev, info);
4436 
4437 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4438 		ret = intel_pasid_alloc_table(dev);
4439 		if (ret) {
4440 			dev_err(dev, "PASID table allocation failed\n");
4441 			dev_iommu_priv_set(dev, NULL);
4442 			kfree(info);
4443 			return ERR_PTR(ret);
4444 		}
4445 	}
4446 
4447 	return &iommu->iommu;
4448 }
4449 
4450 static void intel_iommu_release_device(struct device *dev)
4451 {
4452 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4453 
4454 	dmar_remove_one_dev_info(dev);
4455 	intel_pasid_free_table(dev);
4456 	dev_iommu_priv_set(dev, NULL);
4457 	kfree(info);
4458 	set_dma_ops(dev, NULL);
4459 }
4460 
4461 static void intel_iommu_probe_finalize(struct device *dev)
4462 {
4463 	set_dma_ops(dev, NULL);
4464 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4465 }
4466 
4467 static void intel_iommu_get_resv_regions(struct device *device,
4468 					 struct list_head *head)
4469 {
4470 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4471 	struct iommu_resv_region *reg;
4472 	struct dmar_rmrr_unit *rmrr;
4473 	struct device *i_dev;
4474 	int i;
4475 
4476 	rcu_read_lock();
4477 	for_each_rmrr_units(rmrr) {
4478 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4479 					  i, i_dev) {
4480 			struct iommu_resv_region *resv;
4481 			enum iommu_resv_type type;
4482 			size_t length;
4483 
4484 			if (i_dev != device &&
4485 			    !is_downstream_to_pci_bridge(device, i_dev))
4486 				continue;
4487 
4488 			length = rmrr->end_address - rmrr->base_address + 1;
4489 
4490 			type = device_rmrr_is_relaxable(device) ?
4491 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4492 
4493 			resv = iommu_alloc_resv_region(rmrr->base_address,
4494 						       length, prot, type,
4495 						       GFP_ATOMIC);
4496 			if (!resv)
4497 				break;
4498 
4499 			list_add_tail(&resv->list, head);
4500 		}
4501 	}
4502 	rcu_read_unlock();
4503 
4504 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4505 	if (dev_is_pci(device)) {
4506 		struct pci_dev *pdev = to_pci_dev(device);
4507 
4508 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4509 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4510 					IOMMU_RESV_DIRECT_RELAXABLE,
4511 					GFP_KERNEL);
4512 			if (reg)
4513 				list_add_tail(&reg->list, head);
4514 		}
4515 	}
4516 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4517 
4518 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4519 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4520 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4521 	if (!reg)
4522 		return;
4523 	list_add_tail(&reg->list, head);
4524 }
4525 
4526 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4527 {
4528 	if (dev_is_pci(dev))
4529 		return pci_device_group(dev);
4530 	return generic_device_group(dev);
4531 }
4532 
4533 static int intel_iommu_enable_sva(struct device *dev)
4534 {
4535 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4536 	struct intel_iommu *iommu;
4537 
4538 	if (!info || dmar_disabled)
4539 		return -EINVAL;
4540 
4541 	iommu = info->iommu;
4542 	if (!iommu)
4543 		return -EINVAL;
4544 
4545 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4546 		return -ENODEV;
4547 
4548 	if (!info->pasid_enabled || !info->ats_enabled)
4549 		return -EINVAL;
4550 
4551 	/*
4552 	 * Devices having device-specific I/O fault handling should not
4553 	 * support PCI/PRI. The IOMMU side has no means to check the
4554 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4555 	 * default that if the device driver enables SVA on a non-PRI
4556 	 * device, it will handle IOPF in its own way.
4557 	 */
4558 	if (!info->pri_supported)
4559 		return 0;
4560 
4561 	/* Devices supporting PRI should have it enabled. */
4562 	if (!info->pri_enabled)
4563 		return -EINVAL;
4564 
4565 	return 0;
4566 }
4567 
4568 static int intel_iommu_enable_iopf(struct device *dev)
4569 {
4570 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4571 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4572 	struct intel_iommu *iommu;
4573 	int ret;
4574 
4575 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4576 		return -ENODEV;
4577 
4578 	if (info->pri_enabled)
4579 		return -EBUSY;
4580 
4581 	iommu = info->iommu;
4582 	if (!iommu)
4583 		return -EINVAL;
4584 
4585 	/* PASID is required in PRG Response Message. */
4586 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4587 		return -EINVAL;
4588 
4589 	ret = pci_reset_pri(pdev);
4590 	if (ret)
4591 		return ret;
4592 
4593 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4594 	if (ret)
4595 		return ret;
4596 
4597 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4598 	if (ret)
4599 		goto iopf_remove_device;
4600 
4601 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4602 	if (ret)
4603 		goto iopf_unregister_handler;
4604 	info->pri_enabled = 1;
4605 
4606 	return 0;
4607 
4608 iopf_unregister_handler:
4609 	iommu_unregister_device_fault_handler(dev);
4610 iopf_remove_device:
4611 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4612 
4613 	return ret;
4614 }
4615 
4616 static int intel_iommu_disable_iopf(struct device *dev)
4617 {
4618 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4619 	struct intel_iommu *iommu = info->iommu;
4620 
4621 	if (!info->pri_enabled)
4622 		return -EINVAL;
4623 
4624 	/*
4625 	 * PCIe spec states that by clearing PRI enable bit, the Page
4626 	 * Request Interface will not issue new page requests, but has
4627 	 * outstanding page requests that have been transmitted or are
4628 	 * queued for transmission. This is supposed to be called after
4629 	 * the device driver has stopped DMA, all PASIDs have been
4630 	 * unbound and the outstanding PRQs have been drained.
4631 	 */
4632 	pci_disable_pri(to_pci_dev(dev));
4633 	info->pri_enabled = 0;
4634 
4635 	/*
4636 	 * With PRI disabled and outstanding PRQs drained, unregistering
4637 	 * fault handler and removing device from iopf queue should never
4638 	 * fail.
4639 	 */
4640 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4641 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4642 
4643 	return 0;
4644 }
4645 
4646 static int
4647 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4648 {
4649 	switch (feat) {
4650 	case IOMMU_DEV_FEAT_IOPF:
4651 		return intel_iommu_enable_iopf(dev);
4652 
4653 	case IOMMU_DEV_FEAT_SVA:
4654 		return intel_iommu_enable_sva(dev);
4655 
4656 	default:
4657 		return -ENODEV;
4658 	}
4659 }
4660 
4661 static int
4662 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4663 {
4664 	switch (feat) {
4665 	case IOMMU_DEV_FEAT_IOPF:
4666 		return intel_iommu_disable_iopf(dev);
4667 
4668 	case IOMMU_DEV_FEAT_SVA:
4669 		return 0;
4670 
4671 	default:
4672 		return -ENODEV;
4673 	}
4674 }
4675 
4676 static bool intel_iommu_is_attach_deferred(struct device *dev)
4677 {
4678 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4679 
4680 	return translation_pre_enabled(info->iommu) && !info->domain;
4681 }
4682 
4683 /*
4684  * Check that the device does not live on an external facing PCI port that is
4685  * marked as untrusted. Such devices should not be able to apply quirks and
4686  * thus not be able to bypass the IOMMU restrictions.
4687  */
4688 static bool risky_device(struct pci_dev *pdev)
4689 {
4690 	if (pdev->untrusted) {
4691 		pci_info(pdev,
4692 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4693 			 pdev->vendor, pdev->device);
4694 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4695 		return true;
4696 	}
4697 	return false;
4698 }
4699 
4700 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4701 				       unsigned long iova, size_t size)
4702 {
4703 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4704 	unsigned long pages = aligned_nrpages(iova, size);
4705 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4706 	struct iommu_domain_info *info;
4707 	unsigned long i;
4708 
4709 	xa_for_each(&dmar_domain->iommu_array, i, info)
4710 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4711 }
4712 
4713 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4714 {
4715 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4716 	struct iommu_domain *domain;
4717 
4718 	/* Domain type specific cleanup: */
4719 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4720 	if (domain) {
4721 		switch (domain->type) {
4722 		case IOMMU_DOMAIN_SVA:
4723 			intel_svm_remove_dev_pasid(dev, pasid);
4724 			break;
4725 		default:
4726 			/* should never reach here */
4727 			WARN_ON(1);
4728 			break;
4729 		}
4730 	}
4731 
4732 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4733 }
4734 
4735 const struct iommu_ops intel_iommu_ops = {
4736 	.capable		= intel_iommu_capable,
4737 	.domain_alloc		= intel_iommu_domain_alloc,
4738 	.probe_device		= intel_iommu_probe_device,
4739 	.probe_finalize		= intel_iommu_probe_finalize,
4740 	.release_device		= intel_iommu_release_device,
4741 	.get_resv_regions	= intel_iommu_get_resv_regions,
4742 	.device_group		= intel_iommu_device_group,
4743 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4744 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4745 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4746 	.def_domain_type	= device_def_domain_type,
4747 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4748 	.pgsize_bitmap		= SZ_4K,
4749 #ifdef CONFIG_INTEL_IOMMU_SVM
4750 	.page_response		= intel_svm_page_response,
4751 #endif
4752 	.default_domain_ops = &(const struct iommu_domain_ops) {
4753 		.attach_dev		= intel_iommu_attach_device,
4754 		.map_pages		= intel_iommu_map_pages,
4755 		.unmap_pages		= intel_iommu_unmap_pages,
4756 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4757 		.flush_iotlb_all        = intel_flush_iotlb_all,
4758 		.iotlb_sync		= intel_iommu_tlb_sync,
4759 		.iova_to_phys		= intel_iommu_iova_to_phys,
4760 		.free			= intel_iommu_domain_free,
4761 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4762 	}
4763 };
4764 
4765 static void quirk_iommu_igfx(struct pci_dev *dev)
4766 {
4767 	if (risky_device(dev))
4768 		return;
4769 
4770 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4771 	dmar_map_gfx = 0;
4772 }
4773 
4774 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4782 
4783 /* Broadwell igfx malfunctions with dmar */
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4808 
4809 static void quirk_iommu_rwbf(struct pci_dev *dev)
4810 {
4811 	if (risky_device(dev))
4812 		return;
4813 
4814 	/*
4815 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4816 	 * but needs it. Same seems to hold for the desktop versions.
4817 	 */
4818 	pci_info(dev, "Forcing write-buffer flush capability\n");
4819 	rwbf_quirk = 1;
4820 }
4821 
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4829 
4830 #define GGC 0x52
4831 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4832 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4833 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4834 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4835 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4836 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4837 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4838 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4839 
4840 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4841 {
4842 	unsigned short ggc;
4843 
4844 	if (risky_device(dev))
4845 		return;
4846 
4847 	if (pci_read_config_word(dev, GGC, &ggc))
4848 		return;
4849 
4850 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4851 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4852 		dmar_map_gfx = 0;
4853 	} else if (dmar_map_gfx) {
4854 		/* we have to ensure the gfx device is idle before we flush */
4855 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4856 		iommu_set_dma_strict();
4857 	}
4858 }
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4863 
4864 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4865 {
4866 	unsigned short ver;
4867 
4868 	if (!IS_GFX_DEVICE(dev))
4869 		return;
4870 
4871 	ver = (dev->device >> 8) & 0xff;
4872 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4873 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4874 	    ver != 0x9a && ver != 0xa7)
4875 		return;
4876 
4877 	if (risky_device(dev))
4878 		return;
4879 
4880 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4881 	iommu_skip_te_disable = 1;
4882 }
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4884 
4885 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4886    ISOCH DMAR unit for the Azalia sound device, but not give it any
4887    TLB entries, which causes it to deadlock. Check for that.  We do
4888    this in a function called from init_dmars(), instead of in a PCI
4889    quirk, because we don't want to print the obnoxious "BIOS broken"
4890    message if VT-d is actually disabled.
4891 */
4892 static void __init check_tylersburg_isoch(void)
4893 {
4894 	struct pci_dev *pdev;
4895 	uint32_t vtisochctrl;
4896 
4897 	/* If there's no Azalia in the system anyway, forget it. */
4898 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4899 	if (!pdev)
4900 		return;
4901 
4902 	if (risky_device(pdev)) {
4903 		pci_dev_put(pdev);
4904 		return;
4905 	}
4906 
4907 	pci_dev_put(pdev);
4908 
4909 	/* System Management Registers. Might be hidden, in which case
4910 	   we can't do the sanity check. But that's OK, because the
4911 	   known-broken BIOSes _don't_ actually hide it, so far. */
4912 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4913 	if (!pdev)
4914 		return;
4915 
4916 	if (risky_device(pdev)) {
4917 		pci_dev_put(pdev);
4918 		return;
4919 	}
4920 
4921 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4922 		pci_dev_put(pdev);
4923 		return;
4924 	}
4925 
4926 	pci_dev_put(pdev);
4927 
4928 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4929 	if (vtisochctrl & 1)
4930 		return;
4931 
4932 	/* Drop all bits other than the number of TLB entries */
4933 	vtisochctrl &= 0x1c;
4934 
4935 	/* If we have the recommended number of TLB entries (16), fine. */
4936 	if (vtisochctrl == 0x10)
4937 		return;
4938 
4939 	/* Zero TLB entries? You get to ride the short bus to school. */
4940 	if (!vtisochctrl) {
4941 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4942 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4943 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4944 		     dmi_get_system_info(DMI_BIOS_VERSION),
4945 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4946 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4947 		return;
4948 	}
4949 
4950 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4951 	       vtisochctrl);
4952 }
4953 
4954 /*
4955  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4956  * invalidation completion before posted writes initiated with translated address
4957  * that utilized translations matching the invalidation address range, violating
4958  * the invalidation completion ordering.
4959  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4960  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4961  * under the control of the trusted/privileged host device driver must use this
4962  * quirk.
4963  * Device TLBs are invalidated under the following six conditions:
4964  * 1. Device driver does DMA API unmap IOVA
4965  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4966  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4967  *    exit_mmap() due to crash
4968  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4969  *    VM has to free pages that were unmapped
4970  * 5. Userspace driver unmaps a DMA buffer
4971  * 6. Cache invalidation in vSVA usage (upcoming)
4972  *
4973  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4974  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4975  * invalidate TLB the same way as normal user unmap which will use this quirk.
4976  * The dTLB invalidation after PASID cache flush does not need this quirk.
4977  *
4978  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4979  */
4980 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4981 			       unsigned long address, unsigned long mask,
4982 			       u32 pasid, u16 qdep)
4983 {
4984 	u16 sid;
4985 
4986 	if (likely(!info->dtlb_extra_inval))
4987 		return;
4988 
4989 	sid = PCI_DEVID(info->bus, info->devfn);
4990 	if (pasid == PASID_RID2PASID) {
4991 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4992 				   qdep, address, mask);
4993 	} else {
4994 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4995 					 pasid, qdep, address, mask);
4996 	}
4997 }
4998 
4999 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5000 
5001 /*
5002  * Function to submit a command to the enhanced command interface. The
5003  * valid enhanced command descriptions are defined in Table 47 of the
5004  * VT-d spec. The VT-d hardware implementation may support some but not
5005  * all commands, which can be determined by checking the Enhanced
5006  * Command Capability Register.
5007  *
5008  * Return values:
5009  *  - 0: Command successful without any error;
5010  *  - Negative: software error value;
5011  *  - Nonzero positive: failure status code defined in Table 48.
5012  */
5013 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5014 {
5015 	unsigned long flags;
5016 	u64 res;
5017 	int ret;
5018 
5019 	if (!cap_ecmds(iommu->cap))
5020 		return -ENODEV;
5021 
5022 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5023 
5024 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5025 	if (res & DMA_ECMD_ECRSP_IP) {
5026 		ret = -EBUSY;
5027 		goto err;
5028 	}
5029 
5030 	/*
5031 	 * Unconditionally write the operand B, because
5032 	 * - There is no side effect if an ecmd doesn't require an
5033 	 *   operand B, but we set the register to some value.
5034 	 * - It's not invoked in any critical path. The extra MMIO
5035 	 *   write doesn't bring any performance concerns.
5036 	 */
5037 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5038 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5039 
5040 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5041 		      !(res & DMA_ECMD_ECRSP_IP), res);
5042 
5043 	if (res & DMA_ECMD_ECRSP_IP) {
5044 		ret = -ETIMEDOUT;
5045 		goto err;
5046 	}
5047 
5048 	ret = ecmd_get_status_code(res);
5049 err:
5050 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5051 
5052 	return ret;
5053 }
5054