xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 17ae8136549f512e3fbc78cb78402df6a211cfb5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
/* Map an AGAW encoding to a page-table depth: AGAW 0 means two levels. */
static inline int agaw_to_level(int agaw)
{
	const int levels_at_agaw0 = 2;

	return levels_at_agaw0 + agaw;
}
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
/* VT-d page frame number of the first VT-d page inside @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number for the page that contains kernel address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
/* Reset a context entry by zeroing its low word and then its high word. */
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
/*
 * One RMRR unit. Entries are chained through @list on the dmar_rmrr_units
 * list, which for_each_rmrr_units() iterates.
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
255 
/*
 * One ATSR unit; presumably chained through @list on dmar_atsr_units
 * (the linking code is not in this part of the file).
 */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
263 
/*
 * One SATC unit; presumably chained through @list on dmar_satc_units
 * (the linking code is not in this part of the file).
 */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
282 
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285 
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288 
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
293 
294 #define IDENTMAP_GFX		2
295 #define IDENTMAP_AZALIA		4
296 
297 const struct iommu_ops intel_iommu_ops;
298 
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
300 {
301 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302 }
303 
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307 }
308 
309 static void init_translation_status(struct intel_iommu *iommu)
310 {
311 	u32 gsts;
312 
313 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 	if (gsts & DMA_GSTS_TES)
315 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316 }
317 
318 static int __init intel_iommu_setup(char *str)
319 {
320 	if (!str)
321 		return -EINVAL;
322 
323 	while (*str) {
324 		if (!strncmp(str, "on", 2)) {
325 			dmar_disabled = 0;
326 			pr_info("IOMMU enabled\n");
327 		} else if (!strncmp(str, "off", 3)) {
328 			dmar_disabled = 1;
329 			no_platform_optin = 1;
330 			pr_info("IOMMU disabled\n");
331 		} else if (!strncmp(str, "igfx_off", 8)) {
332 			dmar_map_gfx = 0;
333 			pr_info("Disable GFX device mapping\n");
334 		} else if (!strncmp(str, "forcedac", 8)) {
335 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 			iommu_dma_forcedac = true;
337 		} else if (!strncmp(str, "strict", 6)) {
338 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 			iommu_set_dma_strict();
340 		} else if (!strncmp(str, "sp_off", 6)) {
341 			pr_info("Disable supported super page\n");
342 			intel_iommu_superpage = 0;
343 		} else if (!strncmp(str, "sm_on", 5)) {
344 			pr_info("Enable scalable mode if hardware supports\n");
345 			intel_iommu_sm = 1;
346 		} else if (!strncmp(str, "sm_off", 6)) {
347 			pr_info("Scalable mode is disallowed\n");
348 			intel_iommu_sm = 0;
349 		} else if (!strncmp(str, "tboot_noforce", 13)) {
350 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 			intel_iommu_tboot_noforce = 1;
352 		} else {
353 			pr_notice("Unknown option - '%s'\n", str);
354 		}
355 
356 		str += strcspn(str, ",");
357 		while (*str == ',')
358 			str++;
359 	}
360 
361 	return 1;
362 }
363 __setup("intel_iommu=", intel_iommu_setup);
364 
365 void *alloc_pgtable_page(int node, gfp_t gfp)
366 {
367 	struct page *page;
368 	void *vaddr = NULL;
369 
370 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
371 	if (page)
372 		vaddr = page_address(page);
373 	return vaddr;
374 }
375 
/* Release a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
380 
381 static inline int domain_type_is_si(struct dmar_domain *domain)
382 {
383 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384 }
385 
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
387 				       unsigned long pfn)
388 {
389 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390 
391 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392 }
393 
394 /*
395  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397  * the returned SAGAW.
398  */
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400 {
401 	unsigned long fl_sagaw, sl_sagaw;
402 
403 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 	sl_sagaw = cap_sagaw(iommu->cap);
405 
406 	/* Second level only. */
407 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 		return sl_sagaw;
409 
410 	/* First level only. */
411 	if (!ecap_slts(iommu->ecap))
412 		return fl_sagaw;
413 
414 	return fl_sagaw & sl_sagaw;
415 }
416 
/*
 * Return the largest supported AGAW that does not exceed the AGAW for
 * @max_gaw, or -1 if the IOMMU supports none in that range.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
430 
431 /*
432  * Calculate max SAGAW for each iommu.
433  */
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435 {
436 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437 }
438 
439 /*
440  * calculate agaw for each iommu.
441  * "SAGAW" may be different across iommus, use a default agaw, and
442  * get a supported less agaw for iommus that don't support the default agaw.
443  */
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
445 {
446 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447 }
448 
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450 {
451 	return sm_supported(iommu) ?
452 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453 }
454 
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
456 {
457 	struct iommu_domain_info *info;
458 	struct dmar_drhd_unit *drhd;
459 	struct intel_iommu *iommu;
460 	bool found = false;
461 	unsigned long i;
462 
463 	domain->iommu_coherency = true;
464 	xa_for_each(&domain->iommu_array, i, info) {
465 		found = true;
466 		if (!iommu_paging_structure_coherency(info->iommu)) {
467 			domain->iommu_coherency = false;
468 			break;
469 		}
470 	}
471 	if (found)
472 		return;
473 
474 	/* No hardware attached; use lowest common denominator */
475 	rcu_read_lock();
476 	for_each_active_iommu(iommu, drhd) {
477 		if (!iommu_paging_structure_coherency(iommu)) {
478 			domain->iommu_coherency = false;
479 			break;
480 		}
481 	}
482 	rcu_read_unlock();
483 }
484 
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 					 struct intel_iommu *skip)
487 {
488 	struct dmar_drhd_unit *drhd;
489 	struct intel_iommu *iommu;
490 	int mask = 0x3;
491 
492 	if (!intel_iommu_superpage)
493 		return 0;
494 
495 	/* set iommu_superpage to the smallest common denominator */
496 	rcu_read_lock();
497 	for_each_active_iommu(iommu, drhd) {
498 		if (iommu != skip) {
499 			if (domain && domain->use_first_level) {
500 				if (!cap_fl1gp_support(iommu->cap))
501 					mask = 0x1;
502 			} else {
503 				mask &= cap_super_page_val(iommu->cap);
504 			}
505 
506 			if (!mask)
507 				break;
508 		}
509 	}
510 	rcu_read_unlock();
511 
512 	return fls(mask);
513 }
514 
515 static int domain_update_device_node(struct dmar_domain *domain)
516 {
517 	struct device_domain_info *info;
518 	int nid = NUMA_NO_NODE;
519 	unsigned long flags;
520 
521 	spin_lock_irqsave(&domain->lock, flags);
522 	list_for_each_entry(info, &domain->devices, link) {
523 		/*
524 		 * There could possibly be multiple device numa nodes as devices
525 		 * within the same domain may sit behind different IOMMUs. There
526 		 * isn't perfect answer in such situation, so we select first
527 		 * come first served policy.
528 		 */
529 		nid = dev_to_node(info->dev);
530 		if (nid != NUMA_NO_NODE)
531 			break;
532 	}
533 	spin_unlock_irqrestore(&domain->lock, flags);
534 
535 	return nid;
536 }
537 
538 static void domain_update_iotlb(struct dmar_domain *domain);
539 
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542 {
543 	unsigned long bitmap = 0;
544 
545 	/*
546 	 * 1-level super page supports page size of 2MiB, 2-level super page
547 	 * supports page size of both 2MiB and 1GiB.
548 	 */
549 	if (domain->iommu_superpage == 1)
550 		bitmap |= SZ_2M;
551 	else if (domain->iommu_superpage == 2)
552 		bitmap |= SZ_2M | SZ_1G;
553 
554 	return bitmap;
555 }
556 
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560 	domain_update_iommu_coherency(domain);
561 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562 
563 	/*
564 	 * If RHSA is missing, we should default to the device numa domain
565 	 * as fall back.
566 	 */
567 	if (domain->nid == NUMA_NO_NODE)
568 		domain->nid = domain_update_device_node(domain);
569 
570 	/*
571 	 * First-level translation restricts the input-address to a
572 	 * canonical address (i.e., address bits 63:N have the same
573 	 * value as address bit [N-1], where N is 48-bits with 4-level
574 	 * paging and 57-bits with 5-level paging). Hence, skip bit
575 	 * [N-1].
576 	 */
577 	if (domain->use_first_level)
578 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 	else
580 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581 
582 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 	domain_update_iotlb(domain);
584 }
585 
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 					 u8 devfn, int alloc)
588 {
589 	struct root_entry *root = &iommu->root_entry[bus];
590 	struct context_entry *context;
591 	u64 *entry;
592 
593 	/*
594 	 * Except that the caller requested to allocate a new entry,
595 	 * returning a copied context entry makes no sense.
596 	 */
597 	if (!alloc && context_copied(iommu, bus, devfn))
598 		return NULL;
599 
600 	entry = &root->lo;
601 	if (sm_supported(iommu)) {
602 		if (devfn >= 0x80) {
603 			devfn -= 0x80;
604 			entry = &root->hi;
605 		}
606 		devfn *= 2;
607 	}
608 	if (*entry & 1)
609 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 	else {
611 		unsigned long phy_addr;
612 		if (!alloc)
613 			return NULL;
614 
615 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
616 		if (!context)
617 			return NULL;
618 
619 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 		phy_addr = virt_to_phys((void *)context);
621 		*entry = phy_addr | 1;
622 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
623 	}
624 	return &context[devfn];
625 }
626 
627 /**
628  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629  *				 sub-hierarchy of a candidate PCI-PCI bridge
630  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631  * @bridge: the candidate PCI-PCI bridge
632  *
633  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634  */
635 static bool
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637 {
638 	struct pci_dev *pdev, *pbridge;
639 
640 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 		return false;
642 
643 	pdev = to_pci_dev(dev);
644 	pbridge = to_pci_dev(bridge);
645 
646 	if (pbridge->subordinate &&
647 	    pbridge->subordinate->number <= pdev->bus->number &&
648 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 		return true;
650 
651 	return false;
652 }
653 
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655 {
656 	struct dmar_drhd_unit *drhd;
657 	u32 vtbar;
658 	int rc;
659 
660 	/* We know that this device on this chipset has its own IOMMU.
661 	 * If we find it under a different IOMMU, then the BIOS is lying
662 	 * to us. Hope that the IOMMU for this device is actually
663 	 * disabled, and it needs no translation...
664 	 */
665 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 	if (rc) {
667 		/* "can't" happen */
668 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 		return false;
670 	}
671 	vtbar &= 0xffff0000;
672 
673 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
674 	drhd = dmar_find_matched_drhd_unit(pdev);
675 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 		return true;
679 	}
680 
681 	return false;
682 }
683 
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685 {
686 	if (!iommu || iommu->drhd->ignored)
687 		return true;
688 
689 	if (dev_is_pci(dev)) {
690 		struct pci_dev *pdev = to_pci_dev(dev);
691 
692 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 		    quirk_ioat_snb_local_iommu(pdev))
695 			return true;
696 	}
697 
698 	return false;
699 }
700 
/*
 * Find the IOMMU that covers @dev by searching the device scope of every
 * DRHD unit.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   when both @bus and @devfn are non-NULL, receives the bus number
 *         to use for @dev.
 * @devfn: likewise for the device/function number.
 *
 * Returns the matching IOMMU, or NULL when no unit matches or when
 * iommu_is_dummy() rejects the match.
 */
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* PCI devices can only match units of their own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
			/*
			 * Also entered by the gotos in the scope loop above;
			 * pdev's own bus/devfn are reported here.
			 */
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* The loop ran to completion: no unit matched. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
770 
771 static void domain_flush_cache(struct dmar_domain *domain,
772 			       void *addr, int size)
773 {
774 	if (!domain->iommu_coherency)
775 		clflush_cache_range(addr, size);
776 }
777 
778 static void free_context_table(struct intel_iommu *iommu)
779 {
780 	struct context_entry *context;
781 	int i;
782 
783 	if (!iommu->root_entry)
784 		return;
785 
786 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 		context = iommu_context_addr(iommu, i, 0, 0);
788 		if (context)
789 			free_pgtable_page(context);
790 
791 		if (!sm_supported(iommu))
792 			continue;
793 
794 		context = iommu_context_addr(iommu, i, 0x80, 0);
795 		if (context)
796 			free_pgtable_page(context);
797 	}
798 
799 	free_pgtable_page(iommu->root_entry);
800 	iommu->root_entry = NULL;
801 }
802 
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806 {
807 	struct dma_pte *pte;
808 	int offset;
809 
810 	while (1) {
811 		offset = pfn_level_offset(pfn, level);
812 		pte = &parent[offset];
813 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 			pr_info("PTE not present at level %d\n", level);
815 			break;
816 		}
817 
818 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819 
820 		if (level == 1)
821 			break;
822 
823 		parent = phys_to_virt(dma_pte_addr(pte));
824 		level--;
825 	}
826 }
827 
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 			  unsigned long long addr, u32 pasid)
830 {
831 	struct pasid_dir_entry *dir, *pde;
832 	struct pasid_entry *entries, *pte;
833 	struct context_entry *ctx_entry;
834 	struct root_entry *rt_entry;
835 	int i, dir_index, index, level;
836 	u8 devfn = source_id & 0xff;
837 	u8 bus = source_id >> 8;
838 	struct dma_pte *pgtable;
839 
840 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841 
842 	/* root entry dump */
843 	rt_entry = &iommu->root_entry[bus];
844 	if (!rt_entry) {
845 		pr_info("root table entry is not present\n");
846 		return;
847 	}
848 
849 	if (sm_supported(iommu))
850 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 			rt_entry->hi, rt_entry->lo);
852 	else
853 		pr_info("root entry: 0x%016llx", rt_entry->lo);
854 
855 	/* context entry dump */
856 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 	if (!ctx_entry) {
858 		pr_info("context table entry is not present\n");
859 		return;
860 	}
861 
862 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 		ctx_entry->hi, ctx_entry->lo);
864 
865 	/* legacy mode does not require PASID entries */
866 	if (!sm_supported(iommu)) {
867 		level = agaw_to_level(ctx_entry->hi & 7);
868 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 		goto pgtable_walk;
870 	}
871 
872 	/* get the pointer to pasid directory entry */
873 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 	if (!dir) {
875 		pr_info("pasid directory entry is not present\n");
876 		return;
877 	}
878 	/* For request-without-pasid, get the pasid from context entry */
879 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
880 		pasid = PASID_RID2PASID;
881 
882 	dir_index = pasid >> PASID_PDE_SHIFT;
883 	pde = &dir[dir_index];
884 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885 
886 	/* get the pointer to the pasid table entry */
887 	entries = get_pasid_table_from_pde(pde);
888 	if (!entries) {
889 		pr_info("pasid table entry is not present\n");
890 		return;
891 	}
892 	index = pasid & PASID_PTE_MASK;
893 	pte = &entries[index];
894 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896 
897 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 	} else {
901 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 	}
904 
905 pgtable_walk:
906 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907 }
908 #endif
909 
/*
 * Walk the page table of @domain down to the PTE that covers @pfn.
 *
 * @target_level: in/out. A non-zero value asks for the PTE at that level;
 *	page tables missing on the way down are allocated with @gfp. Zero
 *	means "stop at the first superpage or non-present entry (or at
 *	level 1)", and the level where the walk stopped is written back.
 *
 * Returns the PTE, or NULL if @pfn is beyond what the domain supports or
 * a page-table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop where the existing mapping ends. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before it is linked into the tree. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install the table only if the slot is still empty. */
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
965 
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968 					 unsigned long pfn,
969 					 int level, int *large_page)
970 {
971 	struct dma_pte *parent, *pte;
972 	int total = agaw_to_level(domain->agaw);
973 	int offset;
974 
975 	parent = domain->pgd;
976 	while (level <= total) {
977 		offset = pfn_level_offset(pfn, total);
978 		pte = &parent[offset];
979 		if (level == total)
980 			return pte;
981 
982 		if (!dma_pte_present(pte)) {
983 			*large_page = total;
984 			break;
985 		}
986 
987 		if (dma_pte_superpage(pte)) {
988 			*large_page = total;
989 			return pte;
990 		}
991 
992 		parent = phys_to_virt(dma_pte_addr(pte));
993 		total--;
994 	}
995 	return NULL;
996 }
997 
/*
 * Clear the leaf PTEs that map [@start_pfn, @last_pfn]. Page-table pages
 * are not freed here, and the caller must follow up with a TLB flush.
 *
 * Warns and returns without doing anything if @last_pfn is not supported
 * by the domain or if @start_pfn is above @last_pfn.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
	    WARN_ON(start_pfn > last_pfn))
		return;

	/* we don't need lock here; nobody else touches the iova range */
	do {
		/* Level of the PTE found; dma_pfn_level_pte() may raise it. */
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/*
			 * Nothing mapped here: skip to the next boundary one
			 * level above where the walk stopped. 'continue' in a
			 * do/while jumps to the loop condition below.
			 */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear consecutive PTEs until the range or the table page ends. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* One flush for the whole run of PTEs cleared above. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

		/* start_pfn == 0 presumably means the pfn wrapped around. */
	} while (start_pfn && start_pfn <= last_pfn);
}
1029 
/*
 * Recursively free the page-table pages below @level that are fully
 * covered by [@start_pfn, @last_pfn].
 *
 * @level:        level of the table that @pte points into
 * @retain_level: tables referenced from this level or above are kept
 * @pte:          first entry of the table at @level
 * @pfn:          first pfn covered by that table
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	/* Start at the entry for the first pfn that is inside the range. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		/* Only entries that point to a lower-level table matter. */
		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1068 
1069 /*
1070  * clear last level (leaf) ptes and free page table pages below the
1071  * level we wish to keep intact.
1072  */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 				   unsigned long start_pfn,
1075 				   unsigned long last_pfn,
1076 				   int retain_level)
1077 {
1078 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1079 
1080 	/* We don't need lock here; nobody else touches the iova range */
1081 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1082 			   domain->pgd, 0, start_pfn, last_pfn);
1083 
1084 	/* free pgd */
1085 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1086 		free_pgtable_page(domain->pgd);
1087 		domain->pgd = NULL;
1088 	}
1089 }
1090 
1091 /* When a page at a given level is being unlinked from its parent, we don't
1092    need to *modify* it at all. All we need to do is make a list of all the
1093    pages which can be freed just as soon as we've flushed the IOTLB and we
1094    know the hardware page-walk will no longer touch them.
1095    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1096    be freed. */
1097 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1098 				    int level, struct dma_pte *pte,
1099 				    struct list_head *freelist)
1100 {
1101 	struct page *pg;
1102 
1103 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1104 	list_add_tail(&pg->lru, freelist);
1105 
1106 	if (level == 1)
1107 		return;
1108 
1109 	pte = page_address(pg);
1110 	do {
1111 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1113 		pte++;
1114 	} while (!first_pte_in_page(pte));
1115 }
1116 
/*
 * Clear the entries of the page table @pte (a table at @level) that fall in
 * [@start_pfn, @last_pfn].
 *
 * @pfn is the base pfn handed in for this table; the walk begins at
 * max(@start_pfn, @pfn). An entry whose whole span lies inside the range is
 * cleared, and any table below it is queued on @freelist rather than freed
 * here. An entry that is only partly covered is descended into instead.
 * The run of entries cleared in this table is cache-flushed once at the end.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	/* First and last entries cleared in this table, for the final flush. */
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush from the first cleared entry through the last one, inclusive. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1160 
1161 /* We can't just free the pages because the IOMMU may still be walking
1162    the page tables, and may have cached the intermediate levels. The
1163    pages can only be freed after the IOTLB flush has been done. */
1164 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1165 			 unsigned long last_pfn, struct list_head *freelist)
1166 {
1167 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1168 	    WARN_ON(start_pfn > last_pfn))
1169 		return;
1170 
1171 	/* we don't need lock here; nobody else touches the iova range */
1172 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1173 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1174 
1175 	/* free pgd */
1176 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177 		struct page *pgd_page = virt_to_page(domain->pgd);
1178 		list_add_tail(&pgd_page->lru, freelist);
1179 		domain->pgd = NULL;
1180 	}
1181 }
1182 
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1185 {
1186 	struct root_entry *root;
1187 
1188 	root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1189 	if (!root) {
1190 		pr_err("Allocating root entry for %s failed\n",
1191 			iommu->name);
1192 		return -ENOMEM;
1193 	}
1194 
1195 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1196 	iommu->root_entry = root;
1197 
1198 	return 0;
1199 }
1200 
/*
 * Point the hardware at iommu->root_entry: write the table's physical
 * address (with DMA_RTADDR_SMT or-ed in when sm_supported()) to the root
 * table address register, issue the set-root-table-pointer command and
 * wait for the status register to report DMA_GSTS_RTPS.
 *
 * Unless cap_esrtps() is set, follow up with global context-cache,
 * PASID-cache (scalable mode only) and IOTLB invalidations.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	/* The address write and the SRTP command are done under register_lock. */
	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Wait for the hardware to complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	/* Otherwise invalidate the caches globally by hand. */
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1234 
/*
 * Issue a write-buffer flush on @iommu and wait for it to finish.
 *
 * Does nothing unless rwbf_quirk is set or cap_rwbf() reports the
 * capability. Otherwise the WBF command is written to the global command
 * register under register_lock and the status register is polled until
 * DMA_GSTS_WBFS reads back clear.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Wait for the hardware to complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1252 
/*
 * Issue a register-based context-cache invalidation and wait for it.
 *
 * @type selects global, domain-selective (@did) or device-selective
 * (@did, @source_id, @function_mask) invalidation. Any other @type is
 * warned about and ignored. The function returns nothing.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	/* Build the command value for the requested granularity. */
	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Wait until the ICC bit reads back clear; val is reused for the readback */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1288 
/*
 * Issue a register-based IOTLB invalidation and wait for it.
 *
 * @type selects global, domain-selective (@did) or page-selective
 * (@did, @addr, @size_order) invalidation. Any other @type is warned about
 * and ignored. After completion, the granularity reported back by the
 * hardware is compared with the one requested. The function returns
 * nothing.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	/* The address register is written only when val_iva is non-zero. */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Wait until the IVT bit reads back clear; val is reused for the readback */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1347 
1348 static struct device_domain_info *
1349 domain_lookup_dev_info(struct dmar_domain *domain,
1350 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1351 {
1352 	struct device_domain_info *info;
1353 	unsigned long flags;
1354 
1355 	spin_lock_irqsave(&domain->lock, flags);
1356 	list_for_each_entry(info, &domain->devices, link) {
1357 		if (info->iommu == iommu && info->bus == bus &&
1358 		    info->devfn == devfn) {
1359 			spin_unlock_irqrestore(&domain->lock, flags);
1360 			return info;
1361 		}
1362 	}
1363 	spin_unlock_irqrestore(&domain->lock, flags);
1364 
1365 	return NULL;
1366 }
1367 
1368 static void domain_update_iotlb(struct dmar_domain *domain)
1369 {
1370 	struct device_domain_info *info;
1371 	bool has_iotlb_device = false;
1372 	unsigned long flags;
1373 
1374 	spin_lock_irqsave(&domain->lock, flags);
1375 	list_for_each_entry(info, &domain->devices, link) {
1376 		if (info->ats_enabled) {
1377 			has_iotlb_device = true;
1378 			break;
1379 		}
1380 	}
1381 	domain->has_iotlb_device = has_iotlb_device;
1382 	spin_unlock_irqrestore(&domain->lock, flags);
1383 }
1384 
1385 /*
1386  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1387  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1388  * check because it applies only to the built-in QAT devices and it doesn't
1389  * grant additional privileges.
1390  */
1391 #define BUGGY_QAT_DEVID_MASK 0x4940
1392 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1393 {
1394 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1395 		return false;
1396 
1397 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1398 		return false;
1399 
1400 	return true;
1401 }
1402 
1403 static void iommu_enable_pci_caps(struct device_domain_info *info)
1404 {
1405 	struct pci_dev *pdev;
1406 
1407 	if (!dev_is_pci(info->dev))
1408 		return;
1409 
1410 	pdev = to_pci_dev(info->dev);
1411 
1412 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1413 	   the device if you enable PASID support after ATS support is
1414 	   undefined. So always enable PASID support on devices which
1415 	   have it, even if we can't yet know if we're ever going to
1416 	   use it. */
1417 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1418 		info->pasid_enabled = 1;
1419 
1420 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1421 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1422 		info->ats_enabled = 1;
1423 		domain_update_iotlb(info->domain);
1424 	}
1425 }
1426 
1427 static void iommu_disable_pci_caps(struct device_domain_info *info)
1428 {
1429 	struct pci_dev *pdev;
1430 
1431 	if (!dev_is_pci(info->dev))
1432 		return;
1433 
1434 	pdev = to_pci_dev(info->dev);
1435 
1436 	if (info->ats_enabled) {
1437 		pci_disable_ats(pdev);
1438 		info->ats_enabled = 0;
1439 		domain_update_iotlb(info->domain);
1440 	}
1441 
1442 	if (info->pasid_enabled) {
1443 		pci_disable_pasid(pdev);
1444 		info->pasid_enabled = 0;
1445 	}
1446 }
1447 
1448 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1449 				    u64 addr, unsigned int mask)
1450 {
1451 	u16 sid, qdep;
1452 
1453 	if (!info || !info->ats_enabled)
1454 		return;
1455 
1456 	sid = info->bus << 8 | info->devfn;
1457 	qdep = info->ats_qdep;
1458 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1459 			   qdep, addr, mask);
1460 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1461 }
1462 
1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464 				  u64 addr, unsigned mask)
1465 {
1466 	struct device_domain_info *info;
1467 	unsigned long flags;
1468 
1469 	if (!domain->has_iotlb_device)
1470 		return;
1471 
1472 	spin_lock_irqsave(&domain->lock, flags);
1473 	list_for_each_entry(info, &domain->devices, link)
1474 		__iommu_flush_dev_iotlb(info, addr, mask);
1475 	spin_unlock_irqrestore(&domain->lock, flags);
1476 }
1477 
1478 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1479 				  struct dmar_domain *domain,
1480 				  unsigned long pfn, unsigned int pages,
1481 				  int ih, int map)
1482 {
1483 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1484 	unsigned int mask = ilog2(aligned_pages);
1485 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1486 	u16 did = domain_id_iommu(domain, iommu);
1487 
1488 	if (WARN_ON(!pages))
1489 		return;
1490 
1491 	if (ih)
1492 		ih = 1 << 6;
1493 
1494 	if (domain->use_first_level) {
1495 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1496 	} else {
1497 		unsigned long bitmask = aligned_pages - 1;
1498 
1499 		/*
1500 		 * PSI masks the low order bits of the base address. If the
1501 		 * address isn't aligned to the mask, then compute a mask value
1502 		 * needed to ensure the target range is flushed.
1503 		 */
1504 		if (unlikely(bitmask & pfn)) {
1505 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1506 
1507 			/*
1508 			 * Since end_pfn <= pfn + bitmask, the only way bits
1509 			 * higher than bitmask can differ in pfn and end_pfn is
1510 			 * by carrying. This means after masking out bitmask,
1511 			 * high bits starting with the first set bit in
1512 			 * shared_bits are all equal in both pfn and end_pfn.
1513 			 */
1514 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1515 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1516 		}
1517 
1518 		/*
1519 		 * Fallback to domain selective flush if no PSI support or
1520 		 * the size is too big.
1521 		 */
1522 		if (!cap_pgsel_inv(iommu->cap) ||
1523 		    mask > cap_max_amask_val(iommu->cap))
1524 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1525 							DMA_TLB_DSI_FLUSH);
1526 		else
1527 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1528 							DMA_TLB_PSI_FLUSH);
1529 	}
1530 
1531 	/*
1532 	 * In caching mode, changes of pages from non-present to present require
1533 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1534 	 */
1535 	if (!cap_caching_mode(iommu->cap) || !map)
1536 		iommu_flush_dev_iotlb(domain, addr, mask);
1537 }
1538 
1539 /* Notification for newly created mappings */
1540 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1541 					struct dmar_domain *domain,
1542 					unsigned long pfn, unsigned int pages)
1543 {
1544 	/*
1545 	 * It's a non-present to present mapping. Only flush if caching mode
1546 	 * and second level.
1547 	 */
1548 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1549 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1550 	else
1551 		iommu_flush_write_buffer(iommu);
1552 }
1553 
1554 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1555 {
1556 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1557 	struct iommu_domain_info *info;
1558 	unsigned long idx;
1559 
1560 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1561 		struct intel_iommu *iommu = info->iommu;
1562 		u16 did = domain_id_iommu(dmar_domain, iommu);
1563 
1564 		if (dmar_domain->use_first_level)
1565 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1566 		else
1567 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1568 						 DMA_TLB_DSI_FLUSH);
1569 
1570 		if (!cap_caching_mode(iommu->cap))
1571 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1572 	}
1573 }
1574 
/*
 * Turn off the protected memory regions on @iommu.
 *
 * Does nothing when neither cap_plmr() nor cap_phmr() is reported.
 * Otherwise DMA_PMEN_EPM is cleared in the PMEN register under
 * register_lock and the register is polled until DMA_PMEN_PRS reads back
 * clear.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* Read-modify-write so only the EPM bit changes. */
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1594 
/*
 * Enable DMA translation on @iommu: set DMA_GCMD_TE in the cached command
 * value, write it to the global command register and wait until the
 * status register reports DMA_GSTS_TES. All done under register_lock.
 */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* iommu->gcmd keeps the TE bit so later command writes preserve it. */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Wait for the hardware to complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1610 
/*
 * Disable DMA translation on @iommu: clear DMA_GCMD_TE in the cached
 * command value, write it to the global command register and wait until
 * the status register no longer reports DMA_GSTS_TES.
 *
 * The disable is skipped entirely when iommu_skip_te_disable is set, the
 * unit is marked gfx_dedicated and it reports read or write draining.
 */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Wait for the hardware to complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1630 
1631 static int iommu_init_domains(struct intel_iommu *iommu)
1632 {
1633 	u32 ndomains;
1634 
1635 	ndomains = cap_ndoms(iommu->cap);
1636 	pr_debug("%s: Number of Domains supported <%d>\n",
1637 		 iommu->name, ndomains);
1638 
1639 	spin_lock_init(&iommu->lock);
1640 
1641 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1642 	if (!iommu->domain_ids)
1643 		return -ENOMEM;
1644 
1645 	/*
1646 	 * If Caching mode is set, then invalid translations are tagged
1647 	 * with domain-id 0, hence we need to pre-allocate it. We also
1648 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1649 	 * make sure it is not used for a real domain.
1650 	 */
1651 	set_bit(0, iommu->domain_ids);
1652 
1653 	/*
1654 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1655 	 * entry for first-level or pass-through translation modes should
1656 	 * be programmed with a domain id different from those used for
1657 	 * second-level or nested translation. We reserve a domain id for
1658 	 * this purpose.
1659 	 */
1660 	if (sm_supported(iommu))
1661 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1662 
1663 	return 0;
1664 }
1665 
1666 static void disable_dmar_iommu(struct intel_iommu *iommu)
1667 {
1668 	if (!iommu->domain_ids)
1669 		return;
1670 
1671 	/*
1672 	 * All iommu domains must have been detached from the devices,
1673 	 * hence there should be no domain IDs in use.
1674 	 */
1675 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1676 		    > NUM_RESERVED_DID))
1677 		return;
1678 
1679 	if (iommu->gcmd & DMA_GCMD_TE)
1680 		iommu_disable_translation(iommu);
1681 }
1682 
1683 static void free_dmar_iommu(struct intel_iommu *iommu)
1684 {
1685 	if (iommu->domain_ids) {
1686 		bitmap_free(iommu->domain_ids);
1687 		iommu->domain_ids = NULL;
1688 	}
1689 
1690 	if (iommu->copied_tables) {
1691 		bitmap_free(iommu->copied_tables);
1692 		iommu->copied_tables = NULL;
1693 	}
1694 
1695 	/* free context mapping */
1696 	free_context_table(iommu);
1697 
1698 #ifdef CONFIG_INTEL_IOMMU_SVM
1699 	if (pasid_supported(iommu)) {
1700 		if (ecap_prs(iommu->ecap))
1701 			intel_svm_finish_prq(iommu);
1702 	}
1703 #endif
1704 }
1705 
1706 /*
1707  * Check and return whether first level is used by default for
1708  * DMA translation.
1709  */
1710 static bool first_level_by_default(unsigned int type)
1711 {
1712 	/* Only SL is available in legacy mode */
1713 	if (!scalable_mode_support())
1714 		return false;
1715 
1716 	/* Only level (either FL or SL) is available, just use it */
1717 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1718 		return intel_cap_flts_sanity();
1719 
1720 	/* Both levels are available, decide it based on domain type */
1721 	return type != IOMMU_DOMAIN_UNMANAGED;
1722 }
1723 
1724 static struct dmar_domain *alloc_domain(unsigned int type)
1725 {
1726 	struct dmar_domain *domain;
1727 
1728 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1729 	if (!domain)
1730 		return NULL;
1731 
1732 	domain->nid = NUMA_NO_NODE;
1733 	if (first_level_by_default(type))
1734 		domain->use_first_level = true;
1735 	domain->has_iotlb_device = false;
1736 	INIT_LIST_HEAD(&domain->devices);
1737 	spin_lock_init(&domain->lock);
1738 	xa_init(&domain->iommu_array);
1739 
1740 	return domain;
1741 }
1742 
/*
 * Record that @domain is in use on @iommu.
 *
 * If the domain already has an entry for this IOMMU in its iommu_array,
 * only its reference count is bumped. Otherwise a free domain id is taken
 * from iommu->domain_ids and a new entry is installed.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, -ENOSPC
 * if no domain id is free, or the xa_cmpxchg() error (-EBUSY when the slot
 * turned out to be occupied).
 */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	struct iommu_domain_info *info, *curr;
	unsigned long ndomains;
	int num, ret = -ENOSPC;

	/*
	 * Allocated up front, before iommu->lock is taken; freed again on
	 * every path that does not install it.
	 */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	spin_lock(&iommu->lock);
	curr = xa_load(&domain->iommu_array, iommu->seq_id);
	if (curr) {
		/* Already attached to this IOMMU: just take another reference. */
		curr->refcnt++;
		spin_unlock(&iommu->lock);
		kfree(info);
		return 0;
	}

	ndomains = cap_ndoms(iommu->cap);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		pr_err("%s: No free domain ids\n", iommu->name);
		goto err_unlock;
	}

	set_bit(num, iommu->domain_ids);
	info->refcnt	= 1;
	info->did	= num;
	info->iommu	= iommu;
	/* Install only if the slot is still empty. */
	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
			  NULL, info, GFP_ATOMIC);
	if (curr) {
		ret = xa_err(curr) ? : -EBUSY;
		goto err_clear;
	}
	domain_update_iommu_cap(domain);

	spin_unlock(&iommu->lock);
	return 0;

err_clear:
	/* Give the domain id back. */
	clear_bit(info->did, iommu->domain_ids);
err_unlock:
	spin_unlock(&iommu->lock);
	kfree(info);
	return ret;
}
1792 
/*
 * Drop one reference on @domain's entry for @iommu. When the last
 * reference goes, the domain id is released, the entry is erased from the
 * iommu_array and freed, the domain's node id is reset and its
 * capabilities are recomputed. All of it runs under iommu->lock.
 */
static void domain_detach_iommu(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	struct iommu_domain_info *info;

	spin_lock(&iommu->lock);
	/*
	 * NOTE(review): the xa_load() result is dereferenced without a NULL
	 * check, so this assumes a matching successful attach happened
	 * earlier - confirm against callers.
	 */
	info = xa_load(&domain->iommu_array, iommu->seq_id);
	if (--info->refcnt == 0) {
		clear_bit(info->did, iommu->domain_ids);
		xa_erase(&domain->iommu_array, iommu->seq_id);
		domain->nid = NUMA_NO_NODE;
		domain_update_iommu_cap(domain);
		kfree(info);
	}
	spin_unlock(&iommu->lock);
}
1809 
/*
 * Round a guest address width up to the next width of the form
 * 12 + 9 * n, capped at 64. A width already of that form is returned
 * unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = rem ? gaw + 9 - rem : gaw;

	return width > 64 ? 64 : width;
}
1823 
1824 static void domain_exit(struct dmar_domain *domain)
1825 {
1826 	if (domain->pgd) {
1827 		LIST_HEAD(freelist);
1828 
1829 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1830 		put_pages_list(&freelist);
1831 	}
1832 
1833 	if (WARN_ON(!list_empty(&domain->devices)))
1834 		return;
1835 
1836 	kfree(domain);
1837 }
1838 
1839 /*
1840  * Get the PASID directory size for scalable mode context entry.
1841  * Value of X in the PDTS field of a scalable mode context entry
1842  * indicates PASID directory with 2^(X + 7) entries.
1843  */
1844 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1845 {
1846 	unsigned long pds, max_pde;
1847 
1848 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1849 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1850 	if (pds < 7)
1851 		return 0;
1852 
1853 	return pds - 7;
1854 }
1855 
1856 /*
1857  * Set the RID_PASID field of a scalable mode context entry. The
1858  * IOMMU hardware will use the PASID value set in this field for
1859  * DMA translations of DMA requests without PASID.
1860  */
1861 static inline void
1862 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1863 {
1864 	context->hi |= pasid & ((1 << 20) - 1);
1865 }
1866 
1867 /*
1868  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1869  * entry.
1870  */
1871 static inline void context_set_sm_dte(struct context_entry *context)
1872 {
1873 	context->lo |= BIT_ULL(2);
1874 }
1875 
1876 /*
1877  * Set the PRE(Page Request Enable) field of a scalable mode context
1878  * entry.
1879  */
1880 static inline void context_set_sm_pre(struct context_entry *context)
1881 {
1882 	context->lo |= BIT_ULL(4);
1883 }
1884 
/*
 * Convert a PASID directory size value to its context-entry field coding:
 * keep the low three bits of @pds and place them starting at bit 9.
 */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
1887 
/*
 * Program the context entry for (@bus, @devfn) on @iommu so that the
 * device translates through @domain. @table is the PASID table used in
 * scalable mode.
 *
 * Returns 0 on success, and also when a present entry that was not copied
 * from a previous kernel already exists (it is left untouched). Returns
 * -ENOMEM when iommu_context_addr() yields no entry or, in the legacy
 * path, when a page-table level that has to be skipped is not present.
 *
 * The entry is set up and the follow-up flushes are issued under
 * iommu->lock.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct context_entry *context;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* A present entry that is not a kdump copy is already set up. */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
		if (info && info->pasid_supported)
			context_set_pasid(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	/* Non-coherent hardware needs the entry flushed out of the CPU cache. */
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	/* The legacy path may have left -ENOMEM in ret from the level skip. */
	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
2032 
/*
 * Arguments bundled up for domain_context_mapping_cb(), which receives
 * them through the opaque pointer of pci_for_each_dma_alias().
 */
struct domain_context_mapping_data {
	/* Domain the context entries are set up for. */
	struct dmar_domain *domain;
	/* IOMMU whose context table is programmed. */
	struct intel_iommu *iommu;
	/* PASID table passed on to domain_context_mapping_one(). */
	struct pasid_table *table;
};
2038 
2039 static int domain_context_mapping_cb(struct pci_dev *pdev,
2040 				     u16 alias, void *opaque)
2041 {
2042 	struct domain_context_mapping_data *data = opaque;
2043 
2044 	return domain_context_mapping_one(data->domain, data->iommu,
2045 					  data->table, PCI_BUS_NUM(alias),
2046 					  alias & 0xff);
2047 }
2048 
2049 static int
2050 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2051 {
2052 	struct domain_context_mapping_data data;
2053 	struct pasid_table *table;
2054 	struct intel_iommu *iommu;
2055 	u8 bus, devfn;
2056 
2057 	iommu = device_to_iommu(dev, &bus, &devfn);
2058 	if (!iommu)
2059 		return -ENODEV;
2060 
2061 	table = intel_pasid_get_table(dev);
2062 
2063 	if (!dev_is_pci(dev))
2064 		return domain_context_mapping_one(domain, iommu, table,
2065 						  bus, devfn);
2066 
2067 	data.domain = domain;
2068 	data.iommu = iommu;
2069 	data.table = table;
2070 
2071 	return pci_for_each_dma_alias(to_pci_dev(dev),
2072 				      &domain_context_mapping_cb, &data);
2073 }
2074 
2075 /* Returns a number of VTD pages, but aligned to MM page size */
2076 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2077 					    size_t size)
2078 {
2079 	host_addr &= ~PAGE_MASK;
2080 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2081 }
2082 
2083 /* Return largest possible superpage level for a given mapping */
2084 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2085 					  unsigned long iov_pfn,
2086 					  unsigned long phy_pfn,
2087 					  unsigned long pages)
2088 {
2089 	int support, level = 1;
2090 	unsigned long pfnmerge;
2091 
2092 	support = domain->iommu_superpage;
2093 
2094 	/* To use a large page, the virtual *and* physical addresses
2095 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2096 	   of them will mean we have to use smaller pages. So just
2097 	   merge them and check both at once. */
2098 	pfnmerge = iov_pfn | phy_pfn;
2099 
2100 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2101 		pages >>= VTD_STRIDE_SHIFT;
2102 		if (!pages)
2103 			break;
2104 		pfnmerge >>= VTD_STRIDE_SHIFT;
2105 		level++;
2106 		support--;
2107 	}
2108 	return level;
2109 }
2110 
2111 /*
2112  * Ensure that old small page tables are removed to make room for superpage(s).
2113  * We're going to add new large pages, so make sure we don't remove their parent
2114  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2115  */
2116 static void switch_to_super_page(struct dmar_domain *domain,
2117 				 unsigned long start_pfn,
2118 				 unsigned long end_pfn, int level)
2119 {
2120 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2121 	struct iommu_domain_info *info;
2122 	struct dma_pte *pte = NULL;
2123 	unsigned long i;
2124 
2125 	while (start_pfn <= end_pfn) {
2126 		if (!pte)
2127 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2128 					     GFP_ATOMIC);
2129 
2130 		if (dma_pte_present(pte)) {
2131 			dma_pte_free_pagetable(domain, start_pfn,
2132 					       start_pfn + lvl_pages - 1,
2133 					       level + 1);
2134 
2135 			xa_for_each(&domain->iommu_array, i, info)
2136 				iommu_flush_iotlb_psi(info->iommu, domain,
2137 						      start_pfn, lvl_pages,
2138 						      0, 0);
2139 		}
2140 
2141 		pte++;
2142 		start_pfn += lvl_pages;
2143 		if (first_pte_in_page(pte))
2144 			pte = NULL;
2145 	}
2146 }
2147 
/*
 * Install page-table entries mapping @nr_pages VT-d pages starting at
 * @iov_pfn to the physical pages starting at @phys_pfn.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; DMA_PTE_SNP is also
 * honoured. Superpages are used where hardware_largepage_caps() allows.
 * @gfp is passed to pfn_to_dma_pte() for page-table allocation.
 *
 * Returns 0 on success, -EINVAL if the last pfn is not supported by the
 * domain or @prot grants neither read nor write, and -ENOMEM if
 * pfn_to_dma_pte() returns NULL.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	/* Build the attribute bits shared by every PTE of this mapping. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/*
		 * No PTE cursor: (re)select the page size for what is left
		 * and look up the PTE for the current iov_pfn.
		 */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* Superpage: clear out old small-page tables first. */
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				/* Limit the cleanup to the current table page. */
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		/* Only install the PTE if the slot is currently empty (0). */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Bound the number of mapping dumps across all calls. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Advance by one entry of the current page size. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2248 
2249 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2250 {
2251 	struct intel_iommu *iommu = info->iommu;
2252 	struct context_entry *context;
2253 	u16 did_old;
2254 
2255 	if (!iommu)
2256 		return;
2257 
2258 	spin_lock(&iommu->lock);
2259 	context = iommu_context_addr(iommu, bus, devfn, 0);
2260 	if (!context) {
2261 		spin_unlock(&iommu->lock);
2262 		return;
2263 	}
2264 
2265 	if (sm_supported(iommu)) {
2266 		if (hw_pass_through && domain_type_is_si(info->domain))
2267 			did_old = FLPT_DEFAULT_DID;
2268 		else
2269 			did_old = domain_id_iommu(info->domain, iommu);
2270 	} else {
2271 		did_old = context_domain_id(context);
2272 	}
2273 
2274 	context_clear_entry(context);
2275 	__iommu_flush_cache(iommu, context, sizeof(*context));
2276 	spin_unlock(&iommu->lock);
2277 	iommu->flush.flush_context(iommu,
2278 				   did_old,
2279 				   (((u16)bus) << 8) | devfn,
2280 				   DMA_CCMD_MASK_NOBIT,
2281 				   DMA_CCMD_DEVICE_INVL);
2282 
2283 	if (sm_supported(iommu))
2284 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2285 
2286 	iommu->flush.flush_iotlb(iommu,
2287 				 did_old,
2288 				 0,
2289 				 0,
2290 				 DMA_TLB_DSI_FLUSH);
2291 
2292 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2293 }
2294 
2295 static int domain_setup_first_level(struct intel_iommu *iommu,
2296 				    struct dmar_domain *domain,
2297 				    struct device *dev,
2298 				    u32 pasid)
2299 {
2300 	struct dma_pte *pgd = domain->pgd;
2301 	int agaw, level;
2302 	int flags = 0;
2303 
2304 	/*
2305 	 * Skip top levels of page tables for iommu which has
2306 	 * less agaw than default. Unnecessary for PT mode.
2307 	 */
2308 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2309 		pgd = phys_to_virt(dma_pte_addr(pgd));
2310 		if (!dma_pte_present(pgd))
2311 			return -ENOMEM;
2312 	}
2313 
2314 	level = agaw_to_level(agaw);
2315 	if (level != 4 && level != 5)
2316 		return -EINVAL;
2317 
2318 	if (level == 5)
2319 		flags |= PASID_FLAG_FL5LP;
2320 
2321 	if (domain->force_snooping)
2322 		flags |= PASID_FLAG_PAGE_SNOOP;
2323 
2324 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2325 					     domain_id_iommu(domain, iommu),
2326 					     flags);
2327 }
2328 
2329 static bool dev_is_real_dma_subdevice(struct device *dev)
2330 {
2331 	return dev && dev_is_pci(dev) &&
2332 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2333 }
2334 
2335 static int iommu_domain_identity_map(struct dmar_domain *domain,
2336 				     unsigned long first_vpfn,
2337 				     unsigned long last_vpfn)
2338 {
2339 	/*
2340 	 * RMRR range might have overlap with physical memory range,
2341 	 * clear it first
2342 	 */
2343 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2344 
2345 	return __domain_mapping(domain, first_vpfn,
2346 				first_vpfn, last_vpfn - first_vpfn + 1,
2347 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2348 }
2349 
2350 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2351 
2352 static int __init si_domain_init(int hw)
2353 {
2354 	struct dmar_rmrr_unit *rmrr;
2355 	struct device *dev;
2356 	int i, nid, ret;
2357 
2358 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2359 	if (!si_domain)
2360 		return -EFAULT;
2361 
2362 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2363 		domain_exit(si_domain);
2364 		si_domain = NULL;
2365 		return -EFAULT;
2366 	}
2367 
2368 	if (hw)
2369 		return 0;
2370 
2371 	for_each_online_node(nid) {
2372 		unsigned long start_pfn, end_pfn;
2373 		int i;
2374 
2375 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2376 			ret = iommu_domain_identity_map(si_domain,
2377 					mm_to_dma_pfn(start_pfn),
2378 					mm_to_dma_pfn(end_pfn));
2379 			if (ret)
2380 				return ret;
2381 		}
2382 	}
2383 
2384 	/*
2385 	 * Identity map the RMRRs so that devices with RMRRs could also use
2386 	 * the si_domain.
2387 	 */
2388 	for_each_rmrr_units(rmrr) {
2389 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2390 					  i, dev) {
2391 			unsigned long long start = rmrr->base_address;
2392 			unsigned long long end = rmrr->end_address;
2393 
2394 			if (WARN_ON(end < start ||
2395 				    end >> agaw_to_width(si_domain->agaw)))
2396 				continue;
2397 
2398 			ret = iommu_domain_identity_map(si_domain,
2399 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2400 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2401 			if (ret)
2402 				return ret;
2403 		}
2404 	}
2405 
2406 	return 0;
2407 }
2408 
2409 static int dmar_domain_attach_device(struct dmar_domain *domain,
2410 				     struct device *dev)
2411 {
2412 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2413 	struct intel_iommu *iommu;
2414 	unsigned long flags;
2415 	u8 bus, devfn;
2416 	int ret;
2417 
2418 	iommu = device_to_iommu(dev, &bus, &devfn);
2419 	if (!iommu)
2420 		return -ENODEV;
2421 
2422 	ret = domain_attach_iommu(domain, iommu);
2423 	if (ret)
2424 		return ret;
2425 	info->domain = domain;
2426 	spin_lock_irqsave(&domain->lock, flags);
2427 	list_add(&info->link, &domain->devices);
2428 	spin_unlock_irqrestore(&domain->lock, flags);
2429 
2430 	/* PASID table is mandatory for a PCI device in scalable mode. */
2431 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2432 		/* Setup the PASID entry for requests without PASID: */
2433 		if (hw_pass_through && domain_type_is_si(domain))
2434 			ret = intel_pasid_setup_pass_through(iommu, domain,
2435 					dev, PASID_RID2PASID);
2436 		else if (domain->use_first_level)
2437 			ret = domain_setup_first_level(iommu, domain, dev,
2438 					PASID_RID2PASID);
2439 		else
2440 			ret = intel_pasid_setup_second_level(iommu, domain,
2441 					dev, PASID_RID2PASID);
2442 		if (ret) {
2443 			dev_err(dev, "Setup RID2PASID failed\n");
2444 			device_block_translation(dev);
2445 			return ret;
2446 		}
2447 	}
2448 
2449 	ret = domain_context_mapping(domain, dev);
2450 	if (ret) {
2451 		dev_err(dev, "Domain context map failed\n");
2452 		device_block_translation(dev);
2453 		return ret;
2454 	}
2455 
2456 	iommu_enable_pci_caps(info);
2457 
2458 	return 0;
2459 }
2460 
2461 static bool device_has_rmrr(struct device *dev)
2462 {
2463 	struct dmar_rmrr_unit *rmrr;
2464 	struct device *tmp;
2465 	int i;
2466 
2467 	rcu_read_lock();
2468 	for_each_rmrr_units(rmrr) {
2469 		/*
2470 		 * Return TRUE if this RMRR contains the device that
2471 		 * is passed in.
2472 		 */
2473 		for_each_active_dev_scope(rmrr->devices,
2474 					  rmrr->devices_cnt, i, tmp)
2475 			if (tmp == dev ||
2476 			    is_downstream_to_pci_bridge(dev, tmp)) {
2477 				rcu_read_unlock();
2478 				return true;
2479 			}
2480 	}
2481 	rcu_read_unlock();
2482 	return false;
2483 }
2484 
2485 /**
2486  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2487  * is relaxable (ie. is allowed to be not enforced under some conditions)
2488  * @dev: device handle
2489  *
2490  * We assume that PCI USB devices with RMRRs have them largely
2491  * for historical reasons and that the RMRR space is not actively used post
2492  * boot.  This exclusion may change if vendors begin to abuse it.
2493  *
2494  * The same exception is made for graphics devices, with the requirement that
2495  * any use of the RMRR regions will be torn down before assigning the device
2496  * to a guest.
2497  *
2498  * Return: true if the RMRR is relaxable, false otherwise
2499  */
2500 static bool device_rmrr_is_relaxable(struct device *dev)
2501 {
2502 	struct pci_dev *pdev;
2503 
2504 	if (!dev_is_pci(dev))
2505 		return false;
2506 
2507 	pdev = to_pci_dev(dev);
2508 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2509 		return true;
2510 	else
2511 		return false;
2512 }
2513 
2514 /*
2515  * There are a couple cases where we need to restrict the functionality of
2516  * devices associated with RMRRs.  The first is when evaluating a device for
2517  * identity mapping because problems exist when devices are moved in and out
2518  * of domains and their respective RMRR information is lost.  This means that
2519  * a device with associated RMRRs will never be in a "passthrough" domain.
2520  * The second is use of the device through the IOMMU API.  This interface
2521  * expects to have full control of the IOVA space for the device.  We cannot
2522  * satisfy both the requirement that RMRR access is maintained and have an
2523  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2524  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2525  * We therefore prevent devices associated with an RMRR from participating in
2526  * the IOMMU API, which eliminates them from device assignment.
2527  *
2528  * In both cases, devices which have relaxable RMRRs are not concerned by this
2529  * restriction. See device_rmrr_is_relaxable comment.
2530  */
2531 static bool device_is_rmrr_locked(struct device *dev)
2532 {
2533 	if (!device_has_rmrr(dev))
2534 		return false;
2535 
2536 	if (device_rmrr_is_relaxable(dev))
2537 		return false;
2538 
2539 	return true;
2540 }
2541 
2542 /*
2543  * Return the required default domain type for a specific device.
2544  *
2545  * @dev: the device in query
2546  * @startup: true if this is during early boot
2547  *
2548  * Returns:
2549  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2550  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2551  *  - 0: both identity and dynamic domains work for this device
2552  */
2553 static int device_def_domain_type(struct device *dev)
2554 {
2555 	if (dev_is_pci(dev)) {
2556 		struct pci_dev *pdev = to_pci_dev(dev);
2557 
2558 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2559 			return IOMMU_DOMAIN_IDENTITY;
2560 
2561 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2562 			return IOMMU_DOMAIN_IDENTITY;
2563 	}
2564 
2565 	return 0;
2566 }
2567 
2568 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2569 {
2570 	/*
2571 	 * Start from the sane iommu hardware state.
2572 	 * If the queued invalidation is already initialized by us
2573 	 * (for example, while enabling interrupt-remapping) then
2574 	 * we got the things already rolling from a sane state.
2575 	 */
2576 	if (!iommu->qi) {
2577 		/*
2578 		 * Clear any previous faults.
2579 		 */
2580 		dmar_fault(-1, iommu);
2581 		/*
2582 		 * Disable queued invalidation if supported and already enabled
2583 		 * before OS handover.
2584 		 */
2585 		dmar_disable_qi(iommu);
2586 	}
2587 
2588 	if (dmar_enable_qi(iommu)) {
2589 		/*
2590 		 * Queued Invalidate not enabled, use Register Based Invalidate
2591 		 */
2592 		iommu->flush.flush_context = __iommu_flush_context;
2593 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2594 		pr_info("%s: Using Register based invalidation\n",
2595 			iommu->name);
2596 	} else {
2597 		iommu->flush.flush_context = qi_flush_context;
2598 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2599 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2600 	}
2601 }
2602 
2603 static int copy_context_table(struct intel_iommu *iommu,
2604 			      struct root_entry *old_re,
2605 			      struct context_entry **tbl,
2606 			      int bus, bool ext)
2607 {
2608 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2609 	struct context_entry *new_ce = NULL, ce;
2610 	struct context_entry *old_ce = NULL;
2611 	struct root_entry re;
2612 	phys_addr_t old_ce_phys;
2613 
2614 	tbl_idx = ext ? bus * 2 : bus;
2615 	memcpy(&re, old_re, sizeof(re));
2616 
2617 	for (devfn = 0; devfn < 256; devfn++) {
2618 		/* First calculate the correct index */
2619 		idx = (ext ? devfn * 2 : devfn) % 256;
2620 
2621 		if (idx == 0) {
2622 			/* First save what we may have and clean up */
2623 			if (new_ce) {
2624 				tbl[tbl_idx] = new_ce;
2625 				__iommu_flush_cache(iommu, new_ce,
2626 						    VTD_PAGE_SIZE);
2627 				pos = 1;
2628 			}
2629 
2630 			if (old_ce)
2631 				memunmap(old_ce);
2632 
2633 			ret = 0;
2634 			if (devfn < 0x80)
2635 				old_ce_phys = root_entry_lctp(&re);
2636 			else
2637 				old_ce_phys = root_entry_uctp(&re);
2638 
2639 			if (!old_ce_phys) {
2640 				if (ext && devfn == 0) {
2641 					/* No LCTP, try UCTP */
2642 					devfn = 0x7f;
2643 					continue;
2644 				} else {
2645 					goto out;
2646 				}
2647 			}
2648 
2649 			ret = -ENOMEM;
2650 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2651 					MEMREMAP_WB);
2652 			if (!old_ce)
2653 				goto out;
2654 
2655 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2656 			if (!new_ce)
2657 				goto out_unmap;
2658 
2659 			ret = 0;
2660 		}
2661 
2662 		/* Now copy the context entry */
2663 		memcpy(&ce, old_ce + idx, sizeof(ce));
2664 
2665 		if (!context_present(&ce))
2666 			continue;
2667 
2668 		did = context_domain_id(&ce);
2669 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2670 			set_bit(did, iommu->domain_ids);
2671 
2672 		set_context_copied(iommu, bus, devfn);
2673 		new_ce[idx] = ce;
2674 	}
2675 
2676 	tbl[tbl_idx + pos] = new_ce;
2677 
2678 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2679 
2680 out_unmap:
2681 	memunmap(old_ce);
2682 
2683 out:
2684 	return ret;
2685 }
2686 
2687 static int copy_translation_tables(struct intel_iommu *iommu)
2688 {
2689 	struct context_entry **ctxt_tbls;
2690 	struct root_entry *old_rt;
2691 	phys_addr_t old_rt_phys;
2692 	int ctxt_table_entries;
2693 	u64 rtaddr_reg;
2694 	int bus, ret;
2695 	bool new_ext, ext;
2696 
2697 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2698 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2699 	new_ext    = !!sm_supported(iommu);
2700 
2701 	/*
2702 	 * The RTT bit can only be changed when translation is disabled,
2703 	 * but disabling translation means to open a window for data
2704 	 * corruption. So bail out and don't copy anything if we would
2705 	 * have to change the bit.
2706 	 */
2707 	if (new_ext != ext)
2708 		return -EINVAL;
2709 
2710 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2711 	if (!iommu->copied_tables)
2712 		return -ENOMEM;
2713 
2714 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2715 	if (!old_rt_phys)
2716 		return -EINVAL;
2717 
2718 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2719 	if (!old_rt)
2720 		return -ENOMEM;
2721 
2722 	/* This is too big for the stack - allocate it from slab */
2723 	ctxt_table_entries = ext ? 512 : 256;
2724 	ret = -ENOMEM;
2725 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2726 	if (!ctxt_tbls)
2727 		goto out_unmap;
2728 
2729 	for (bus = 0; bus < 256; bus++) {
2730 		ret = copy_context_table(iommu, &old_rt[bus],
2731 					 ctxt_tbls, bus, ext);
2732 		if (ret) {
2733 			pr_err("%s: Failed to copy context table for bus %d\n",
2734 				iommu->name, bus);
2735 			continue;
2736 		}
2737 	}
2738 
2739 	spin_lock(&iommu->lock);
2740 
2741 	/* Context tables are copied, now write them to the root_entry table */
2742 	for (bus = 0; bus < 256; bus++) {
2743 		int idx = ext ? bus * 2 : bus;
2744 		u64 val;
2745 
2746 		if (ctxt_tbls[idx]) {
2747 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2748 			iommu->root_entry[bus].lo = val;
2749 		}
2750 
2751 		if (!ext || !ctxt_tbls[idx + 1])
2752 			continue;
2753 
2754 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2755 		iommu->root_entry[bus].hi = val;
2756 	}
2757 
2758 	spin_unlock(&iommu->lock);
2759 
2760 	kfree(ctxt_tbls);
2761 
2762 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2763 
2764 	ret = 0;
2765 
2766 out_unmap:
2767 	memunmap(old_rt);
2768 
2769 	return ret;
2770 }
2771 
/*
 * Bring up all DMAR units: audit capabilities, initialise invalidation,
 * domain id tracking and root tables for every non-ignored IOMMU (copying
 * the previous kernel's tables when translation was left enabled and this
 * is a kdump kernel), create the static identity domain, and finally set
 * up page-request queues (with CONFIG_INTEL_IOMMU_SVM) and fault
 * interrupts.
 *
 * Returns 0 on success. On failure every active IOMMU is disabled and
 * freed, si_domain is destroyed, and the error code is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Old translation tables are only reused in a kdump kernel. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Pass-through is only usable if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	/* Graphics devices that are not to be mapped get identity mapping. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Common error exit: undo everything set up so far. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2928 
2929 static void __init init_no_remapping_devices(void)
2930 {
2931 	struct dmar_drhd_unit *drhd;
2932 	struct device *dev;
2933 	int i;
2934 
2935 	for_each_drhd_unit(drhd) {
2936 		if (!drhd->include_all) {
2937 			for_each_active_dev_scope(drhd->devices,
2938 						  drhd->devices_cnt, i, dev)
2939 				break;
2940 			/* ignore DMAR unit if no devices exist */
2941 			if (i == drhd->devices_cnt)
2942 				drhd->ignored = 1;
2943 		}
2944 	}
2945 
2946 	for_each_active_drhd_unit(drhd) {
2947 		if (drhd->include_all)
2948 			continue;
2949 
2950 		for_each_active_dev_scope(drhd->devices,
2951 					  drhd->devices_cnt, i, dev)
2952 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2953 				break;
2954 		if (i < drhd->devices_cnt)
2955 			continue;
2956 
2957 		/* This IOMMU has *only* gfx devices. Either bypass it or
2958 		   set the gfx_mapped flag, as appropriate */
2959 		drhd->gfx_dedicated = 1;
2960 		if (!dmar_map_gfx)
2961 			drhd->ignored = 1;
2962 	}
2963 }
2964 
2965 #ifdef CONFIG_SUSPEND
2966 static int init_iommu_hw(void)
2967 {
2968 	struct dmar_drhd_unit *drhd;
2969 	struct intel_iommu *iommu = NULL;
2970 
2971 	for_each_active_iommu(iommu, drhd)
2972 		if (iommu->qi)
2973 			dmar_reenable_qi(iommu);
2974 
2975 	for_each_iommu(iommu, drhd) {
2976 		if (drhd->ignored) {
2977 			/*
2978 			 * we always have to disable PMRs or DMA may fail on
2979 			 * this device
2980 			 */
2981 			if (force_on)
2982 				iommu_disable_protect_mem_regions(iommu);
2983 			continue;
2984 		}
2985 
2986 		iommu_flush_write_buffer(iommu);
2987 		iommu_set_root_entry(iommu);
2988 		iommu_enable_translation(iommu);
2989 		iommu_disable_protect_mem_regions(iommu);
2990 	}
2991 
2992 	return 0;
2993 }
2994 
2995 static void iommu_flush_all(void)
2996 {
2997 	struct dmar_drhd_unit *drhd;
2998 	struct intel_iommu *iommu;
2999 
3000 	for_each_active_iommu(iommu, drhd) {
3001 		iommu->flush.flush_context(iommu, 0, 0, 0,
3002 					   DMA_CCMD_GLOBAL_INVL);
3003 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3004 					 DMA_TLB_GLOBAL_FLUSH);
3005 	}
3006 }
3007 
3008 static int iommu_suspend(void)
3009 {
3010 	struct dmar_drhd_unit *drhd;
3011 	struct intel_iommu *iommu = NULL;
3012 	unsigned long flag;
3013 
3014 	for_each_active_iommu(iommu, drhd) {
3015 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3016 					     GFP_KERNEL);
3017 		if (!iommu->iommu_state)
3018 			goto nomem;
3019 	}
3020 
3021 	iommu_flush_all();
3022 
3023 	for_each_active_iommu(iommu, drhd) {
3024 		iommu_disable_translation(iommu);
3025 
3026 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3027 
3028 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3029 			readl(iommu->reg + DMAR_FECTL_REG);
3030 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3031 			readl(iommu->reg + DMAR_FEDATA_REG);
3032 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3033 			readl(iommu->reg + DMAR_FEADDR_REG);
3034 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3035 			readl(iommu->reg + DMAR_FEUADDR_REG);
3036 
3037 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3038 	}
3039 	return 0;
3040 
3041 nomem:
3042 	for_each_active_iommu(iommu, drhd)
3043 		kfree(iommu->iommu_state);
3044 
3045 	return -ENOMEM;
3046 }
3047 
3048 static void iommu_resume(void)
3049 {
3050 	struct dmar_drhd_unit *drhd;
3051 	struct intel_iommu *iommu = NULL;
3052 	unsigned long flag;
3053 
3054 	if (init_iommu_hw()) {
3055 		if (force_on)
3056 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3057 		else
3058 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3059 		return;
3060 	}
3061 
3062 	for_each_active_iommu(iommu, drhd) {
3063 
3064 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3065 
3066 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3067 			iommu->reg + DMAR_FECTL_REG);
3068 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3069 			iommu->reg + DMAR_FEDATA_REG);
3070 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3071 			iommu->reg + DMAR_FEADDR_REG);
3072 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3073 			iommu->reg + DMAR_FEUADDR_REG);
3074 
3075 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3076 	}
3077 
3078 	for_each_active_iommu(iommu, drhd)
3079 		kfree(iommu->iommu_state);
3080 }
3081 
/* Syscore hooks that save and restore IOMMU state across suspend/resume. */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
3086 
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
3091 
3092 #else
/* Stub for builds without CONFIG_SUSPEND: there is nothing to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
3095 
3096 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3097 {
3098 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3099 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3100 	    rmrr->end_address <= rmrr->base_address ||
3101 	    arch_rmrr_sanity_check(rmrr))
3102 		return -EINVAL;
3103 
3104 	return 0;
3105 }
3106 
3107 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3108 {
3109 	struct acpi_dmar_reserved_memory *rmrr;
3110 	struct dmar_rmrr_unit *rmrru;
3111 
3112 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3113 	if (rmrr_sanity_check(rmrr)) {
3114 		pr_warn(FW_BUG
3115 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3116 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3117 			   rmrr->base_address, rmrr->end_address,
3118 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3119 			   dmi_get_system_info(DMI_BIOS_VERSION),
3120 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3121 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3122 	}
3123 
3124 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3125 	if (!rmrru)
3126 		goto out;
3127 
3128 	rmrru->hdr = header;
3129 
3130 	rmrru->base_address = rmrr->base_address;
3131 	rmrru->end_address = rmrr->end_address;
3132 
3133 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3134 				((void *)rmrr) + rmrr->header.length,
3135 				&rmrru->devices_cnt);
3136 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3137 		goto free_rmrru;
3138 
3139 	list_add(&rmrru->list, &dmar_rmrr_units);
3140 
3141 	return 0;
3142 free_rmrru:
3143 	kfree(rmrru);
3144 out:
3145 	return -ENOMEM;
3146 }
3147 
3148 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3149 {
3150 	struct dmar_atsr_unit *atsru;
3151 	struct acpi_dmar_atsr *tmp;
3152 
3153 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3154 				dmar_rcu_check()) {
3155 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3156 		if (atsr->segment != tmp->segment)
3157 			continue;
3158 		if (atsr->header.length != tmp->header.length)
3159 			continue;
3160 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3161 			return atsru;
3162 	}
3163 
3164 	return NULL;
3165 }
3166 
3167 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3168 {
3169 	struct acpi_dmar_atsr *atsr;
3170 	struct dmar_atsr_unit *atsru;
3171 
3172 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3173 		return 0;
3174 
3175 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3176 	atsru = dmar_find_atsr(atsr);
3177 	if (atsru)
3178 		return 0;
3179 
3180 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3181 	if (!atsru)
3182 		return -ENOMEM;
3183 
3184 	/*
3185 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3186 	 * copy the memory content because the memory buffer will be freed
3187 	 * on return.
3188 	 */
3189 	atsru->hdr = (void *)(atsru + 1);
3190 	memcpy(atsru->hdr, hdr, hdr->length);
3191 	atsru->include_all = atsr->flags & 0x1;
3192 	if (!atsru->include_all) {
3193 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3194 				(void *)atsr + atsr->header.length,
3195 				&atsru->devices_cnt);
3196 		if (atsru->devices_cnt && atsru->devices == NULL) {
3197 			kfree(atsru);
3198 			return -ENOMEM;
3199 		}
3200 	}
3201 
3202 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3203 
3204 	return 0;
3205 }
3206 
3207 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3208 {
3209 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3210 	kfree(atsru);
3211 }
3212 
3213 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3214 {
3215 	struct acpi_dmar_atsr *atsr;
3216 	struct dmar_atsr_unit *atsru;
3217 
3218 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3219 	atsru = dmar_find_atsr(atsr);
3220 	if (atsru) {
3221 		list_del_rcu(&atsru->list);
3222 		synchronize_rcu();
3223 		intel_iommu_free_atsr(atsru);
3224 	}
3225 
3226 	return 0;
3227 }
3228 
3229 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3230 {
3231 	int i;
3232 	struct device *dev;
3233 	struct acpi_dmar_atsr *atsr;
3234 	struct dmar_atsr_unit *atsru;
3235 
3236 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3237 	atsru = dmar_find_atsr(atsr);
3238 	if (!atsru)
3239 		return 0;
3240 
3241 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3242 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3243 					  i, dev)
3244 			return -EBUSY;
3245 	}
3246 
3247 	return 0;
3248 }
3249 
3250 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3251 {
3252 	struct dmar_satc_unit *satcu;
3253 	struct acpi_dmar_satc *tmp;
3254 
3255 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3256 				dmar_rcu_check()) {
3257 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3258 		if (satc->segment != tmp->segment)
3259 			continue;
3260 		if (satc->header.length != tmp->header.length)
3261 			continue;
3262 		if (memcmp(satc, tmp, satc->header.length) == 0)
3263 			return satcu;
3264 	}
3265 
3266 	return NULL;
3267 }
3268 
3269 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3270 {
3271 	struct acpi_dmar_satc *satc;
3272 	struct dmar_satc_unit *satcu;
3273 
3274 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3275 		return 0;
3276 
3277 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3278 	satcu = dmar_find_satc(satc);
3279 	if (satcu)
3280 		return 0;
3281 
3282 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3283 	if (!satcu)
3284 		return -ENOMEM;
3285 
3286 	satcu->hdr = (void *)(satcu + 1);
3287 	memcpy(satcu->hdr, hdr, hdr->length);
3288 	satcu->atc_required = satc->flags & 0x1;
3289 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3290 					      (void *)satc + satc->header.length,
3291 					      &satcu->devices_cnt);
3292 	if (satcu->devices_cnt && !satcu->devices) {
3293 		kfree(satcu);
3294 		return -ENOMEM;
3295 	}
3296 	list_add_rcu(&satcu->list, &dmar_satc_units);
3297 
3298 	return 0;
3299 }
3300 
3301 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3302 {
3303 	int sp, ret;
3304 	struct intel_iommu *iommu = dmaru->iommu;
3305 
3306 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3307 	if (ret)
3308 		goto out;
3309 
3310 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3311 		pr_warn("%s: Doesn't support hardware pass through.\n",
3312 			iommu->name);
3313 		return -ENXIO;
3314 	}
3315 
3316 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3317 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3318 		pr_warn("%s: Doesn't support large page.\n",
3319 			iommu->name);
3320 		return -ENXIO;
3321 	}
3322 
3323 	/*
3324 	 * Disable translation if already enabled prior to OS handover.
3325 	 */
3326 	if (iommu->gcmd & DMA_GCMD_TE)
3327 		iommu_disable_translation(iommu);
3328 
3329 	ret = iommu_init_domains(iommu);
3330 	if (ret == 0)
3331 		ret = iommu_alloc_root_entry(iommu);
3332 	if (ret)
3333 		goto out;
3334 
3335 	intel_svm_check(iommu);
3336 
3337 	if (dmaru->ignored) {
3338 		/*
3339 		 * we always have to disable PMRs or DMA may fail on this device
3340 		 */
3341 		if (force_on)
3342 			iommu_disable_protect_mem_regions(iommu);
3343 		return 0;
3344 	}
3345 
3346 	intel_iommu_init_qi(iommu);
3347 	iommu_flush_write_buffer(iommu);
3348 
3349 #ifdef CONFIG_INTEL_IOMMU_SVM
3350 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3351 		ret = intel_svm_enable_prq(iommu);
3352 		if (ret)
3353 			goto disable_iommu;
3354 	}
3355 #endif
3356 	ret = dmar_set_interrupt(iommu);
3357 	if (ret)
3358 		goto disable_iommu;
3359 
3360 	iommu_set_root_entry(iommu);
3361 	iommu_enable_translation(iommu);
3362 
3363 	iommu_disable_protect_mem_regions(iommu);
3364 	return 0;
3365 
3366 disable_iommu:
3367 	disable_dmar_iommu(iommu);
3368 out:
3369 	free_dmar_iommu(iommu);
3370 	return ret;
3371 }
3372 
3373 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3374 {
3375 	int ret = 0;
3376 	struct intel_iommu *iommu = dmaru->iommu;
3377 
3378 	if (!intel_iommu_enabled)
3379 		return 0;
3380 	if (iommu == NULL)
3381 		return -EINVAL;
3382 
3383 	if (insert) {
3384 		ret = intel_iommu_add(dmaru);
3385 	} else {
3386 		disable_dmar_iommu(iommu);
3387 		free_dmar_iommu(iommu);
3388 	}
3389 
3390 	return ret;
3391 }
3392 
3393 static void intel_iommu_free_dmars(void)
3394 {
3395 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3396 	struct dmar_atsr_unit *atsru, *atsr_n;
3397 	struct dmar_satc_unit *satcu, *satc_n;
3398 
3399 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3400 		list_del(&rmrru->list);
3401 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3402 		kfree(rmrru);
3403 	}
3404 
3405 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3406 		list_del(&atsru->list);
3407 		intel_iommu_free_atsr(atsru);
3408 	}
3409 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3410 		list_del(&satcu->list);
3411 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3412 		kfree(satcu);
3413 	}
3414 }
3415 
3416 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3417 {
3418 	struct dmar_satc_unit *satcu;
3419 	struct acpi_dmar_satc *satc;
3420 	struct device *tmp;
3421 	int i;
3422 
3423 	dev = pci_physfn(dev);
3424 	rcu_read_lock();
3425 
3426 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3427 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3428 		if (satc->segment != pci_domain_nr(dev->bus))
3429 			continue;
3430 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3431 			if (to_pci_dev(tmp) == dev)
3432 				goto out;
3433 	}
3434 	satcu = NULL;
3435 out:
3436 	rcu_read_unlock();
3437 	return satcu;
3438 }
3439 
3440 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3441 {
3442 	int i, ret = 1;
3443 	struct pci_bus *bus;
3444 	struct pci_dev *bridge = NULL;
3445 	struct device *tmp;
3446 	struct acpi_dmar_atsr *atsr;
3447 	struct dmar_atsr_unit *atsru;
3448 	struct dmar_satc_unit *satcu;
3449 
3450 	dev = pci_physfn(dev);
3451 	satcu = dmar_find_matched_satc_unit(dev);
3452 	if (satcu)
3453 		/*
3454 		 * This device supports ATS as it is in SATC table.
3455 		 * When IOMMU is in legacy mode, enabling ATS is done
3456 		 * automatically by HW for the device that requires
3457 		 * ATS, hence OS should not enable this device ATS
3458 		 * to avoid duplicated TLB invalidation.
3459 		 */
3460 		return !(satcu->atc_required && !sm_supported(iommu));
3461 
3462 	for (bus = dev->bus; bus; bus = bus->parent) {
3463 		bridge = bus->self;
3464 		/* If it's an integrated device, allow ATS */
3465 		if (!bridge)
3466 			return 1;
3467 		/* Connected via non-PCIe: no ATS */
3468 		if (!pci_is_pcie(bridge) ||
3469 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3470 			return 0;
3471 		/* If we found the root port, look it up in the ATSR */
3472 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3473 			break;
3474 	}
3475 
3476 	rcu_read_lock();
3477 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3478 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3479 		if (atsr->segment != pci_domain_nr(dev->bus))
3480 			continue;
3481 
3482 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3483 			if (tmp == &bridge->dev)
3484 				goto out;
3485 
3486 		if (atsru->include_all)
3487 			goto out;
3488 	}
3489 	ret = 0;
3490 out:
3491 	rcu_read_unlock();
3492 
3493 	return ret;
3494 }
3495 
3496 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3497 {
3498 	int ret;
3499 	struct dmar_rmrr_unit *rmrru;
3500 	struct dmar_atsr_unit *atsru;
3501 	struct dmar_satc_unit *satcu;
3502 	struct acpi_dmar_atsr *atsr;
3503 	struct acpi_dmar_reserved_memory *rmrr;
3504 	struct acpi_dmar_satc *satc;
3505 
3506 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3507 		return 0;
3508 
3509 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3510 		rmrr = container_of(rmrru->hdr,
3511 				    struct acpi_dmar_reserved_memory, header);
3512 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3513 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3514 				((void *)rmrr) + rmrr->header.length,
3515 				rmrr->segment, rmrru->devices,
3516 				rmrru->devices_cnt);
3517 			if (ret < 0)
3518 				return ret;
3519 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3520 			dmar_remove_dev_scope(info, rmrr->segment,
3521 				rmrru->devices, rmrru->devices_cnt);
3522 		}
3523 	}
3524 
3525 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3526 		if (atsru->include_all)
3527 			continue;
3528 
3529 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3530 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3531 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3532 					(void *)atsr + atsr->header.length,
3533 					atsr->segment, atsru->devices,
3534 					atsru->devices_cnt);
3535 			if (ret > 0)
3536 				break;
3537 			else if (ret < 0)
3538 				return ret;
3539 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3540 			if (dmar_remove_dev_scope(info, atsr->segment,
3541 					atsru->devices, atsru->devices_cnt))
3542 				break;
3543 		}
3544 	}
3545 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3546 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3547 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3548 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3549 					(void *)satc + satc->header.length,
3550 					satc->segment, satcu->devices,
3551 					satcu->devices_cnt);
3552 			if (ret > 0)
3553 				break;
3554 			else if (ret < 0)
3555 				return ret;
3556 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3557 			if (dmar_remove_dev_scope(info, satc->segment,
3558 					satcu->devices, satcu->devices_cnt))
3559 				break;
3560 		}
3561 	}
3562 
3563 	return 0;
3564 }
3565 
3566 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3567 				       unsigned long val, void *v)
3568 {
3569 	struct memory_notify *mhp = v;
3570 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3571 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3572 			mhp->nr_pages - 1);
3573 
3574 	switch (val) {
3575 	case MEM_GOING_ONLINE:
3576 		if (iommu_domain_identity_map(si_domain,
3577 					      start_vpfn, last_vpfn)) {
3578 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3579 				start_vpfn, last_vpfn);
3580 			return NOTIFY_BAD;
3581 		}
3582 		break;
3583 
3584 	case MEM_OFFLINE:
3585 	case MEM_CANCEL_ONLINE:
3586 		{
3587 			struct dmar_drhd_unit *drhd;
3588 			struct intel_iommu *iommu;
3589 			LIST_HEAD(freelist);
3590 
3591 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3592 
3593 			rcu_read_lock();
3594 			for_each_active_iommu(iommu, drhd)
3595 				iommu_flush_iotlb_psi(iommu, si_domain,
3596 					start_vpfn, mhp->nr_pages,
3597 					list_empty(&freelist), 0);
3598 			rcu_read_unlock();
3599 			put_pages_list(&freelist);
3600 		}
3601 		break;
3602 	}
3603 
3604 	return NOTIFY_OK;
3605 }
3606 
3607 static struct notifier_block intel_iommu_memory_nb = {
3608 	.notifier_call = intel_iommu_memory_notifier,
3609 	.priority = 0
3610 };
3611 
3612 static void intel_disable_iommus(void)
3613 {
3614 	struct intel_iommu *iommu = NULL;
3615 	struct dmar_drhd_unit *drhd;
3616 
3617 	for_each_iommu(iommu, drhd)
3618 		iommu_disable_translation(iommu);
3619 }
3620 
3621 void intel_iommu_shutdown(void)
3622 {
3623 	struct dmar_drhd_unit *drhd;
3624 	struct intel_iommu *iommu = NULL;
3625 
3626 	if (no_iommu || dmar_disabled)
3627 		return;
3628 
3629 	down_write(&dmar_global_lock);
3630 
3631 	/* Disable PMRs explicitly here. */
3632 	for_each_iommu(iommu, drhd)
3633 		iommu_disable_protect_mem_regions(iommu);
3634 
3635 	/* Make sure the IOMMUs are switched off */
3636 	intel_disable_iommus();
3637 
3638 	up_write(&dmar_global_lock);
3639 }
3640 
3641 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3642 {
3643 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3644 
3645 	return container_of(iommu_dev, struct intel_iommu, iommu);
3646 }
3647 
3648 static ssize_t version_show(struct device *dev,
3649 			    struct device_attribute *attr, char *buf)
3650 {
3651 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3653 	return sysfs_emit(buf, "%d:%d\n",
3654 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3655 }
3656 static DEVICE_ATTR_RO(version);
3657 
3658 static ssize_t address_show(struct device *dev,
3659 			    struct device_attribute *attr, char *buf)
3660 {
3661 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3662 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3663 }
3664 static DEVICE_ATTR_RO(address);
3665 
3666 static ssize_t cap_show(struct device *dev,
3667 			struct device_attribute *attr, char *buf)
3668 {
3669 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3670 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3671 }
3672 static DEVICE_ATTR_RO(cap);
3673 
3674 static ssize_t ecap_show(struct device *dev,
3675 			 struct device_attribute *attr, char *buf)
3676 {
3677 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3678 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3679 }
3680 static DEVICE_ATTR_RO(ecap);
3681 
3682 static ssize_t domains_supported_show(struct device *dev,
3683 				      struct device_attribute *attr, char *buf)
3684 {
3685 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3686 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3687 }
3688 static DEVICE_ATTR_RO(domains_supported);
3689 
3690 static ssize_t domains_used_show(struct device *dev,
3691 				 struct device_attribute *attr, char *buf)
3692 {
3693 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3694 	return sysfs_emit(buf, "%d\n",
3695 			  bitmap_weight(iommu->domain_ids,
3696 					cap_ndoms(iommu->cap)));
3697 }
3698 static DEVICE_ATTR_RO(domains_used);
3699 
3700 static struct attribute *intel_iommu_attrs[] = {
3701 	&dev_attr_version.attr,
3702 	&dev_attr_address.attr,
3703 	&dev_attr_cap.attr,
3704 	&dev_attr_ecap.attr,
3705 	&dev_attr_domains_supported.attr,
3706 	&dev_attr_domains_used.attr,
3707 	NULL,
3708 };
3709 
3710 static struct attribute_group intel_iommu_group = {
3711 	.name = "intel-iommu",
3712 	.attrs = intel_iommu_attrs,
3713 };
3714 
3715 const struct attribute_group *intel_iommu_groups[] = {
3716 	&intel_iommu_group,
3717 	NULL,
3718 };
3719 
3720 static inline bool has_external_pci(void)
3721 {
3722 	struct pci_dev *pdev = NULL;
3723 
3724 	for_each_pci_dev(pdev)
3725 		if (pdev->external_facing) {
3726 			pci_dev_put(pdev);
3727 			return true;
3728 		}
3729 
3730 	return false;
3731 }
3732 
3733 static int __init platform_optin_force_iommu(void)
3734 {
3735 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3736 		return 0;
3737 
3738 	if (no_iommu || dmar_disabled)
3739 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3740 
3741 	/*
3742 	 * If Intel-IOMMU is disabled by default, we will apply identity
3743 	 * map for all devices except those marked as being untrusted.
3744 	 */
3745 	if (dmar_disabled)
3746 		iommu_set_default_passthrough(false);
3747 
3748 	dmar_disabled = 0;
3749 	no_iommu = 0;
3750 
3751 	return 1;
3752 }
3753 
3754 static int __init probe_acpi_namespace_devices(void)
3755 {
3756 	struct dmar_drhd_unit *drhd;
3757 	/* To avoid a -Wunused-but-set-variable warning. */
3758 	struct intel_iommu *iommu __maybe_unused;
3759 	struct device *dev;
3760 	int i, ret = 0;
3761 
3762 	for_each_active_iommu(iommu, drhd) {
3763 		for_each_active_dev_scope(drhd->devices,
3764 					  drhd->devices_cnt, i, dev) {
3765 			struct acpi_device_physical_node *pn;
3766 			struct iommu_group *group;
3767 			struct acpi_device *adev;
3768 
3769 			if (dev->bus != &acpi_bus_type)
3770 				continue;
3771 
3772 			adev = to_acpi_device(dev);
3773 			mutex_lock(&adev->physical_node_lock);
3774 			list_for_each_entry(pn,
3775 					    &adev->physical_node_list, node) {
3776 				group = iommu_group_get(pn->dev);
3777 				if (group) {
3778 					iommu_group_put(group);
3779 					continue;
3780 				}
3781 
3782 				ret = iommu_probe_device(pn->dev);
3783 				if (ret)
3784 					break;
3785 			}
3786 			mutex_unlock(&adev->physical_node_lock);
3787 
3788 			if (ret)
3789 				return ret;
3790 		}
3791 	}
3792 
3793 	return 0;
3794 }
3795 
3796 static __init int tboot_force_iommu(void)
3797 {
3798 	if (!tboot_enabled())
3799 		return 0;
3800 
3801 	if (no_iommu || dmar_disabled)
3802 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3803 
3804 	dmar_disabled = 0;
3805 	no_iommu = 0;
3806 
3807 	return 1;
3808 }
3809 
3810 int __init intel_iommu_init(void)
3811 {
3812 	int ret = -ENODEV;
3813 	struct dmar_drhd_unit *drhd;
3814 	struct intel_iommu *iommu;
3815 
3816 	/*
3817 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3818 	 * opt in, so enforce that.
3819 	 */
3820 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3821 		    platform_optin_force_iommu();
3822 
3823 	down_write(&dmar_global_lock);
3824 	if (dmar_table_init()) {
3825 		if (force_on)
3826 			panic("tboot: Failed to initialize DMAR table\n");
3827 		goto out_free_dmar;
3828 	}
3829 
3830 	if (dmar_dev_scope_init() < 0) {
3831 		if (force_on)
3832 			panic("tboot: Failed to initialize DMAR device scope\n");
3833 		goto out_free_dmar;
3834 	}
3835 
3836 	up_write(&dmar_global_lock);
3837 
3838 	/*
3839 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3840 	 * complain later when we register it under the lock.
3841 	 */
3842 	dmar_register_bus_notifier();
3843 
3844 	down_write(&dmar_global_lock);
3845 
3846 	if (!no_iommu)
3847 		intel_iommu_debugfs_init();
3848 
3849 	if (no_iommu || dmar_disabled) {
3850 		/*
3851 		 * We exit the function here to ensure IOMMU's remapping and
3852 		 * mempool aren't setup, which means that the IOMMU's PMRs
3853 		 * won't be disabled via the call to init_dmars(). So disable
3854 		 * it explicitly here. The PMRs were setup by tboot prior to
3855 		 * calling SENTER, but the kernel is expected to reset/tear
3856 		 * down the PMRs.
3857 		 */
3858 		if (intel_iommu_tboot_noforce) {
3859 			for_each_iommu(iommu, drhd)
3860 				iommu_disable_protect_mem_regions(iommu);
3861 		}
3862 
3863 		/*
3864 		 * Make sure the IOMMUs are switched off, even when we
3865 		 * boot into a kexec kernel and the previous kernel left
3866 		 * them enabled
3867 		 */
3868 		intel_disable_iommus();
3869 		goto out_free_dmar;
3870 	}
3871 
3872 	if (list_empty(&dmar_rmrr_units))
3873 		pr_info("No RMRR found\n");
3874 
3875 	if (list_empty(&dmar_atsr_units))
3876 		pr_info("No ATSR found\n");
3877 
3878 	if (list_empty(&dmar_satc_units))
3879 		pr_info("No SATC found\n");
3880 
3881 	init_no_remapping_devices();
3882 
3883 	ret = init_dmars();
3884 	if (ret) {
3885 		if (force_on)
3886 			panic("tboot: Failed to initialize DMARs\n");
3887 		pr_err("Initialization failed\n");
3888 		goto out_free_dmar;
3889 	}
3890 	up_write(&dmar_global_lock);
3891 
3892 	init_iommu_pm_ops();
3893 
3894 	down_read(&dmar_global_lock);
3895 	for_each_active_iommu(iommu, drhd) {
3896 		/*
3897 		 * The flush queue implementation does not perform
3898 		 * page-selective invalidations that are required for efficient
3899 		 * TLB flushes in virtual environments.  The benefit of batching
3900 		 * is likely to be much lower than the overhead of synchronizing
3901 		 * the virtual and physical IOMMU page-tables.
3902 		 */
3903 		if (cap_caching_mode(iommu->cap) &&
3904 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3905 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3906 			iommu_set_dma_strict();
3907 		}
3908 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3909 				       intel_iommu_groups,
3910 				       "%s", iommu->name);
3911 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3912 
3913 		iommu_pmu_register(iommu);
3914 	}
3915 	up_read(&dmar_global_lock);
3916 
3917 	if (si_domain && !hw_pass_through)
3918 		register_memory_notifier(&intel_iommu_memory_nb);
3919 
3920 	down_read(&dmar_global_lock);
3921 	if (probe_acpi_namespace_devices())
3922 		pr_warn("ACPI name space devices didn't probe correctly\n");
3923 
3924 	/* Finally, we enable the DMA remapping hardware. */
3925 	for_each_iommu(iommu, drhd) {
3926 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3927 			iommu_enable_translation(iommu);
3928 
3929 		iommu_disable_protect_mem_regions(iommu);
3930 	}
3931 	up_read(&dmar_global_lock);
3932 
3933 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3934 
3935 	intel_iommu_enabled = 1;
3936 
3937 	return 0;
3938 
3939 out_free_dmar:
3940 	intel_iommu_free_dmars();
3941 	up_write(&dmar_global_lock);
3942 	return ret;
3943 }
3944 
3945 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3946 {
3947 	struct device_domain_info *info = opaque;
3948 
3949 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3950 	return 0;
3951 }
3952 
3953 /*
3954  * NB - intel-iommu lacks any sort of reference counting for the users of
3955  * dependent devices.  If multiple endpoints have intersecting dependent
3956  * devices, unbinding the driver from any one of them will possibly leave
3957  * the others unable to operate.
3958  */
3959 static void domain_context_clear(struct device_domain_info *info)
3960 {
3961 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3962 		return;
3963 
3964 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3965 			       &domain_context_clear_one_cb, info);
3966 }
3967 
3968 static void dmar_remove_one_dev_info(struct device *dev)
3969 {
3970 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3971 	struct dmar_domain *domain = info->domain;
3972 	struct intel_iommu *iommu = info->iommu;
3973 	unsigned long flags;
3974 
3975 	if (!dev_is_real_dma_subdevice(info->dev)) {
3976 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3977 			intel_pasid_tear_down_entry(iommu, info->dev,
3978 					PASID_RID2PASID, false);
3979 
3980 		iommu_disable_pci_caps(info);
3981 		domain_context_clear(info);
3982 	}
3983 
3984 	spin_lock_irqsave(&domain->lock, flags);
3985 	list_del(&info->link);
3986 	spin_unlock_irqrestore(&domain->lock, flags);
3987 
3988 	domain_detach_iommu(domain, iommu);
3989 	info->domain = NULL;
3990 }
3991 
3992 /*
3993  * Clear the page table pointer in context or pasid table entries so that
3994  * all DMA requests without PASID from the device are blocked. If the page
3995  * table has been set, clean up the data structures.
3996  */
3997 static void device_block_translation(struct device *dev)
3998 {
3999 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4000 	struct intel_iommu *iommu = info->iommu;
4001 	unsigned long flags;
4002 
4003 	iommu_disable_pci_caps(info);
4004 	if (!dev_is_real_dma_subdevice(dev)) {
4005 		if (sm_supported(iommu))
4006 			intel_pasid_tear_down_entry(iommu, dev,
4007 						    PASID_RID2PASID, false);
4008 		else
4009 			domain_context_clear(info);
4010 	}
4011 
4012 	if (!info->domain)
4013 		return;
4014 
4015 	spin_lock_irqsave(&info->domain->lock, flags);
4016 	list_del(&info->link);
4017 	spin_unlock_irqrestore(&info->domain->lock, flags);
4018 
4019 	domain_detach_iommu(info->domain, iommu);
4020 	info->domain = NULL;
4021 }
4022 
4023 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4024 {
4025 	int adjust_width;
4026 
4027 	/* calculate AGAW */
4028 	domain->gaw = guest_width;
4029 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4030 	domain->agaw = width_to_agaw(adjust_width);
4031 
4032 	domain->iommu_coherency = false;
4033 	domain->iommu_superpage = 0;
4034 	domain->max_addr = 0;
4035 
4036 	/* always allocate the top pgd */
4037 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4038 	if (!domain->pgd)
4039 		return -ENOMEM;
4040 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4041 	return 0;
4042 }
4043 
/*
 * attach_dev handler of the blocking domain: block translation for @dev.
 * Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
4050 
4051 static struct iommu_domain blocking_domain = {
4052 	.ops = &(const struct iommu_domain_ops) {
4053 		.attach_dev	= blocking_domain_attach_dev,
4054 		.free		= intel_iommu_domain_free
4055 	}
4056 };
4057 
4058 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4059 {
4060 	struct dmar_domain *dmar_domain;
4061 	struct iommu_domain *domain;
4062 
4063 	switch (type) {
4064 	case IOMMU_DOMAIN_BLOCKED:
4065 		return &blocking_domain;
4066 	case IOMMU_DOMAIN_DMA:
4067 	case IOMMU_DOMAIN_DMA_FQ:
4068 	case IOMMU_DOMAIN_UNMANAGED:
4069 		dmar_domain = alloc_domain(type);
4070 		if (!dmar_domain) {
4071 			pr_err("Can't allocate dmar_domain\n");
4072 			return NULL;
4073 		}
4074 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4075 			pr_err("Domain initialization failed\n");
4076 			domain_exit(dmar_domain);
4077 			return NULL;
4078 		}
4079 
4080 		domain = &dmar_domain->domain;
4081 		domain->geometry.aperture_start = 0;
4082 		domain->geometry.aperture_end   =
4083 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4084 		domain->geometry.force_aperture = true;
4085 
4086 		return domain;
4087 	case IOMMU_DOMAIN_IDENTITY:
4088 		return &si_domain->domain;
4089 	case IOMMU_DOMAIN_SVA:
4090 		return intel_svm_domain_alloc();
4091 	default:
4092 		return NULL;
4093 	}
4094 
4095 	return NULL;
4096 }
4097 
4098 static void intel_iommu_domain_free(struct iommu_domain *domain)
4099 {
4100 	if (domain != &si_domain->domain && domain != &blocking_domain)
4101 		domain_exit(to_dmar_domain(domain));
4102 }
4103 
4104 static int prepare_domain_attach_device(struct iommu_domain *domain,
4105 					struct device *dev)
4106 {
4107 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4108 	struct intel_iommu *iommu;
4109 	int addr_width;
4110 
4111 	iommu = device_to_iommu(dev, NULL, NULL);
4112 	if (!iommu)
4113 		return -ENODEV;
4114 
4115 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4116 		return -EINVAL;
4117 
4118 	/* check if this iommu agaw is sufficient for max mapped address */
4119 	addr_width = agaw_to_width(iommu->agaw);
4120 	if (addr_width > cap_mgaw(iommu->cap))
4121 		addr_width = cap_mgaw(iommu->cap);
4122 
4123 	if (dmar_domain->max_addr > (1LL << addr_width))
4124 		return -EINVAL;
4125 	dmar_domain->gaw = addr_width;
4126 
4127 	/*
4128 	 * Knock out extra levels of page tables if necessary
4129 	 */
4130 	while (iommu->agaw < dmar_domain->agaw) {
4131 		struct dma_pte *pte;
4132 
4133 		pte = dmar_domain->pgd;
4134 		if (dma_pte_present(pte)) {
4135 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4136 			free_pgtable_page(pte);
4137 		}
4138 		dmar_domain->agaw--;
4139 	}
4140 
4141 	return 0;
4142 }
4143 
4144 static int intel_iommu_attach_device(struct iommu_domain *domain,
4145 				     struct device *dev)
4146 {
4147 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4148 	int ret;
4149 
4150 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4151 	    device_is_rmrr_locked(dev)) {
4152 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4153 		return -EPERM;
4154 	}
4155 
4156 	if (info->domain)
4157 		device_block_translation(dev);
4158 
4159 	ret = prepare_domain_attach_device(domain, dev);
4160 	if (ret)
4161 		return ret;
4162 
4163 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4164 }
4165 
4166 static int intel_iommu_map(struct iommu_domain *domain,
4167 			   unsigned long iova, phys_addr_t hpa,
4168 			   size_t size, int iommu_prot, gfp_t gfp)
4169 {
4170 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4171 	u64 max_addr;
4172 	int prot = 0;
4173 
4174 	if (iommu_prot & IOMMU_READ)
4175 		prot |= DMA_PTE_READ;
4176 	if (iommu_prot & IOMMU_WRITE)
4177 		prot |= DMA_PTE_WRITE;
4178 	if (dmar_domain->set_pte_snp)
4179 		prot |= DMA_PTE_SNP;
4180 
4181 	max_addr = iova + size;
4182 	if (dmar_domain->max_addr < max_addr) {
4183 		u64 end;
4184 
4185 		/* check if minimum agaw is sufficient for mapped address */
4186 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4187 		if (end < max_addr) {
4188 			pr_err("%s: iommu width (%d) is not "
4189 			       "sufficient for the mapped address (%llx)\n",
4190 			       __func__, dmar_domain->gaw, max_addr);
4191 			return -EFAULT;
4192 		}
4193 		dmar_domain->max_addr = max_addr;
4194 	}
4195 	/* Round up size to next multiple of PAGE_SIZE, if it and
4196 	   the low bits of hpa would take us onto the next page */
4197 	size = aligned_nrpages(hpa, size);
4198 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4199 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4200 }
4201 
4202 static int intel_iommu_map_pages(struct iommu_domain *domain,
4203 				 unsigned long iova, phys_addr_t paddr,
4204 				 size_t pgsize, size_t pgcount,
4205 				 int prot, gfp_t gfp, size_t *mapped)
4206 {
4207 	unsigned long pgshift = __ffs(pgsize);
4208 	size_t size = pgcount << pgshift;
4209 	int ret;
4210 
4211 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4212 		return -EINVAL;
4213 
4214 	if (!IS_ALIGNED(iova | paddr, pgsize))
4215 		return -EINVAL;
4216 
4217 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4218 	if (!ret && mapped)
4219 		*mapped = size;
4220 
4221 	return ret;
4222 }
4223 
4224 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4225 				unsigned long iova, size_t size,
4226 				struct iommu_iotlb_gather *gather)
4227 {
4228 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4229 	unsigned long start_pfn, last_pfn;
4230 	int level = 0;
4231 
4232 	/* Cope with horrid API which requires us to unmap more than the
4233 	   size argument if it happens to be a large-page mapping. */
4234 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4235 				     &level, GFP_ATOMIC)))
4236 		return 0;
4237 
4238 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4239 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4240 
4241 	start_pfn = iova >> VTD_PAGE_SHIFT;
4242 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4243 
4244 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4245 
4246 	if (dmar_domain->max_addr == iova + size)
4247 		dmar_domain->max_addr = iova;
4248 
4249 	/*
4250 	 * We do not use page-selective IOTLB invalidation in flush queue,
4251 	 * so there is no need to track page and sync iotlb.
4252 	 */
4253 	if (!iommu_iotlb_gather_queued(gather))
4254 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4255 
4256 	return size;
4257 }
4258 
4259 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4260 				      unsigned long iova,
4261 				      size_t pgsize, size_t pgcount,
4262 				      struct iommu_iotlb_gather *gather)
4263 {
4264 	unsigned long pgshift = __ffs(pgsize);
4265 	size_t size = pgcount << pgshift;
4266 
4267 	return intel_iommu_unmap(domain, iova, size, gather);
4268 }
4269 
4270 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4271 				 struct iommu_iotlb_gather *gather)
4272 {
4273 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4274 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4275 	size_t size = gather->end - gather->start;
4276 	struct iommu_domain_info *info;
4277 	unsigned long start_pfn;
4278 	unsigned long nrpages;
4279 	unsigned long i;
4280 
4281 	nrpages = aligned_nrpages(gather->start, size);
4282 	start_pfn = mm_to_dma_pfn(iova_pfn);
4283 
4284 	xa_for_each(&dmar_domain->iommu_array, i, info)
4285 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4286 				      start_pfn, nrpages,
4287 				      list_empty(&gather->freelist), 0);
4288 
4289 	put_pages_list(&gather->freelist);
4290 }
4291 
4292 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4293 					    dma_addr_t iova)
4294 {
4295 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4296 	struct dma_pte *pte;
4297 	int level = 0;
4298 	u64 phys = 0;
4299 
4300 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4301 			     GFP_ATOMIC);
4302 	if (pte && dma_pte_present(pte))
4303 		phys = dma_pte_addr(pte) +
4304 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4305 						VTD_PAGE_SHIFT) - 1));
4306 
4307 	return phys;
4308 }
4309 
4310 static bool domain_support_force_snooping(struct dmar_domain *domain)
4311 {
4312 	struct device_domain_info *info;
4313 	bool support = true;
4314 
4315 	assert_spin_locked(&domain->lock);
4316 	list_for_each_entry(info, &domain->devices, link) {
4317 		if (!ecap_sc_support(info->iommu->ecap)) {
4318 			support = false;
4319 			break;
4320 		}
4321 	}
4322 
4323 	return support;
4324 }
4325 
4326 static void domain_set_force_snooping(struct dmar_domain *domain)
4327 {
4328 	struct device_domain_info *info;
4329 
4330 	assert_spin_locked(&domain->lock);
4331 	/*
4332 	 * Second level page table supports per-PTE snoop control. The
4333 	 * iommu_map() interface will handle this by setting SNP bit.
4334 	 */
4335 	if (!domain->use_first_level) {
4336 		domain->set_pte_snp = true;
4337 		return;
4338 	}
4339 
4340 	list_for_each_entry(info, &domain->devices, link)
4341 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4342 						     PASID_RID2PASID);
4343 }
4344 
4345 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4346 {
4347 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4348 	unsigned long flags;
4349 
4350 	if (dmar_domain->force_snooping)
4351 		return true;
4352 
4353 	spin_lock_irqsave(&dmar_domain->lock, flags);
4354 	if (!domain_support_force_snooping(dmar_domain)) {
4355 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4356 		return false;
4357 	}
4358 
4359 	domain_set_force_snooping(dmar_domain);
4360 	dmar_domain->force_snooping = true;
4361 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4362 
4363 	return true;
4364 }
4365 
4366 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4367 {
4368 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 
4370 	switch (cap) {
4371 	case IOMMU_CAP_CACHE_COHERENCY:
4372 		return true;
4373 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4374 		return dmar_platform_optin();
4375 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4376 		return ecap_sc_support(info->iommu->ecap);
4377 	default:
4378 		return false;
4379 	}
4380 }
4381 
4382 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4383 {
4384 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4385 	struct device_domain_info *info;
4386 	struct intel_iommu *iommu;
4387 	u8 bus, devfn;
4388 	int ret;
4389 
4390 	iommu = device_to_iommu(dev, &bus, &devfn);
4391 	if (!iommu || !iommu->iommu.ops)
4392 		return ERR_PTR(-ENODEV);
4393 
4394 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4395 	if (!info)
4396 		return ERR_PTR(-ENOMEM);
4397 
4398 	if (dev_is_real_dma_subdevice(dev)) {
4399 		info->bus = pdev->bus->number;
4400 		info->devfn = pdev->devfn;
4401 		info->segment = pci_domain_nr(pdev->bus);
4402 	} else {
4403 		info->bus = bus;
4404 		info->devfn = devfn;
4405 		info->segment = iommu->segment;
4406 	}
4407 
4408 	info->dev = dev;
4409 	info->iommu = iommu;
4410 	if (dev_is_pci(dev)) {
4411 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4412 		    pci_ats_supported(pdev) &&
4413 		    dmar_ats_supported(pdev, iommu)) {
4414 			info->ats_supported = 1;
4415 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4416 
4417 			/*
4418 			 * For IOMMU that supports device IOTLB throttling
4419 			 * (DIT), we assign PFSID to the invalidation desc
4420 			 * of a VF such that IOMMU HW can gauge queue depth
4421 			 * at PF level. If DIT is not set, PFSID will be
4422 			 * treated as reserved, which should be set to 0.
4423 			 */
4424 			if (ecap_dit(iommu->ecap))
4425 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4426 			info->ats_qdep = pci_ats_queue_depth(pdev);
4427 		}
4428 		if (sm_supported(iommu)) {
4429 			if (pasid_supported(iommu)) {
4430 				int features = pci_pasid_features(pdev);
4431 
4432 				if (features >= 0)
4433 					info->pasid_supported = features | 1;
4434 			}
4435 
4436 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4437 			    pci_pri_supported(pdev))
4438 				info->pri_supported = 1;
4439 		}
4440 	}
4441 
4442 	dev_iommu_priv_set(dev, info);
4443 
4444 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4445 		ret = intel_pasid_alloc_table(dev);
4446 		if (ret) {
4447 			dev_err(dev, "PASID table allocation failed\n");
4448 			dev_iommu_priv_set(dev, NULL);
4449 			kfree(info);
4450 			return ERR_PTR(ret);
4451 		}
4452 	}
4453 
4454 	return &iommu->iommu;
4455 }
4456 
/*
 * release_device callback: tear down the per-device state installed by
 * intel_iommu_probe_device().
 */
static void intel_iommu_release_device(struct device *dev)
{
	/* Fetch the info first; the private pointer is cleared below. */
	struct device_domain_info *info = dev_iommu_priv_get(dev);

	dmar_remove_one_dev_info(dev);
	intel_pasid_free_table(dev);
	/* Clear the private pointer before freeing what it points to. */
	dev_iommu_priv_set(dev, NULL);
	kfree(info);
	set_dma_ops(dev, NULL);
}
4467 
/*
 * probe_finalize callback: reset the device's DMA ops to NULL and then
 * let iommu_setup_dma_ops() configure them for the range [0, U64_MAX].
 */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4473 
4474 static void intel_iommu_get_resv_regions(struct device *device,
4475 					 struct list_head *head)
4476 {
4477 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4478 	struct iommu_resv_region *reg;
4479 	struct dmar_rmrr_unit *rmrr;
4480 	struct device *i_dev;
4481 	int i;
4482 
4483 	rcu_read_lock();
4484 	for_each_rmrr_units(rmrr) {
4485 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4486 					  i, i_dev) {
4487 			struct iommu_resv_region *resv;
4488 			enum iommu_resv_type type;
4489 			size_t length;
4490 
4491 			if (i_dev != device &&
4492 			    !is_downstream_to_pci_bridge(device, i_dev))
4493 				continue;
4494 
4495 			length = rmrr->end_address - rmrr->base_address + 1;
4496 
4497 			type = device_rmrr_is_relaxable(device) ?
4498 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4499 
4500 			resv = iommu_alloc_resv_region(rmrr->base_address,
4501 						       length, prot, type,
4502 						       GFP_ATOMIC);
4503 			if (!resv)
4504 				break;
4505 
4506 			list_add_tail(&resv->list, head);
4507 		}
4508 	}
4509 	rcu_read_unlock();
4510 
4511 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4512 	if (dev_is_pci(device)) {
4513 		struct pci_dev *pdev = to_pci_dev(device);
4514 
4515 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4516 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4517 					IOMMU_RESV_DIRECT_RELAXABLE,
4518 					GFP_KERNEL);
4519 			if (reg)
4520 				list_add_tail(&reg->list, head);
4521 		}
4522 	}
4523 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4524 
4525 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4526 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4527 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4528 	if (!reg)
4529 		return;
4530 	list_add_tail(&reg->list, head);
4531 }
4532 
/*
 * device_group callback: PCI devices get their group from
 * pci_device_group(), everything else from generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4539 
4540 static int intel_iommu_enable_sva(struct device *dev)
4541 {
4542 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4543 	struct intel_iommu *iommu;
4544 
4545 	if (!info || dmar_disabled)
4546 		return -EINVAL;
4547 
4548 	iommu = info->iommu;
4549 	if (!iommu)
4550 		return -EINVAL;
4551 
4552 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4553 		return -ENODEV;
4554 
4555 	if (!info->pasid_enabled || !info->ats_enabled)
4556 		return -EINVAL;
4557 
4558 	/*
4559 	 * Devices having device-specific I/O fault handling should not
4560 	 * support PCI/PRI. The IOMMU side has no means to check the
4561 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4562 	 * default that if the device driver enables SVA on a non-PRI
4563 	 * device, it will handle IOPF in its own way.
4564 	 */
4565 	if (!info->pri_supported)
4566 		return 0;
4567 
4568 	/* Devices supporting PRI should have it enabled. */
4569 	if (!info->pri_enabled)
4570 		return -EINVAL;
4571 
4572 	return 0;
4573 }
4574 
4575 static int intel_iommu_enable_iopf(struct device *dev)
4576 {
4577 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4578 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4579 	struct intel_iommu *iommu;
4580 	int ret;
4581 
4582 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4583 		return -ENODEV;
4584 
4585 	if (info->pri_enabled)
4586 		return -EBUSY;
4587 
4588 	iommu = info->iommu;
4589 	if (!iommu)
4590 		return -EINVAL;
4591 
4592 	/* PASID is required in PRG Response Message. */
4593 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4594 		return -EINVAL;
4595 
4596 	ret = pci_reset_pri(pdev);
4597 	if (ret)
4598 		return ret;
4599 
4600 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4601 	if (ret)
4602 		return ret;
4603 
4604 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4605 	if (ret)
4606 		goto iopf_remove_device;
4607 
4608 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4609 	if (ret)
4610 		goto iopf_unregister_handler;
4611 	info->pri_enabled = 1;
4612 
4613 	return 0;
4614 
4615 iopf_unregister_handler:
4616 	iommu_unregister_device_fault_handler(dev);
4617 iopf_remove_device:
4618 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4619 
4620 	return ret;
4621 }
4622 
4623 static int intel_iommu_disable_iopf(struct device *dev)
4624 {
4625 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4626 	struct intel_iommu *iommu = info->iommu;
4627 
4628 	if (!info->pri_enabled)
4629 		return -EINVAL;
4630 
4631 	/*
4632 	 * PCIe spec states that by clearing PRI enable bit, the Page
4633 	 * Request Interface will not issue new page requests, but has
4634 	 * outstanding page requests that have been transmitted or are
4635 	 * queued for transmission. This is supposed to be called after
4636 	 * the device driver has stopped DMA, all PASIDs have been
4637 	 * unbound and the outstanding PRQs have been drained.
4638 	 */
4639 	pci_disable_pri(to_pci_dev(dev));
4640 	info->pri_enabled = 0;
4641 
4642 	/*
4643 	 * With PRI disabled and outstanding PRQs drained, unregistering
4644 	 * fault handler and removing device from iopf queue should never
4645 	 * fail.
4646 	 */
4647 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4648 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4649 
4650 	return 0;
4651 }
4652 
4653 static int
4654 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4655 {
4656 	switch (feat) {
4657 	case IOMMU_DEV_FEAT_IOPF:
4658 		return intel_iommu_enable_iopf(dev);
4659 
4660 	case IOMMU_DEV_FEAT_SVA:
4661 		return intel_iommu_enable_sva(dev);
4662 
4663 	default:
4664 		return -ENODEV;
4665 	}
4666 }
4667 
4668 static int
4669 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4670 {
4671 	switch (feat) {
4672 	case IOMMU_DEV_FEAT_IOPF:
4673 		return intel_iommu_disable_iopf(dev);
4674 
4675 	case IOMMU_DEV_FEAT_SVA:
4676 		return 0;
4677 
4678 	default:
4679 		return -ENODEV;
4680 	}
4681 }
4682 
4683 static bool intel_iommu_is_attach_deferred(struct device *dev)
4684 {
4685 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4686 
4687 	return translation_pre_enabled(info->iommu) && !info->domain;
4688 }
4689 
4690 /*
4691  * Check that the device does not live on an external facing PCI port that is
4692  * marked as untrusted. Such devices should not be able to apply quirks and
4693  * thus not be able to bypass the IOMMU restrictions.
4694  */
4695 static bool risky_device(struct pci_dev *pdev)
4696 {
4697 	if (pdev->untrusted) {
4698 		pci_info(pdev,
4699 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4700 			 pdev->vendor, pdev->device);
4701 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4702 		return true;
4703 	}
4704 	return false;
4705 }
4706 
4707 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4708 				       unsigned long iova, size_t size)
4709 {
4710 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4711 	unsigned long pages = aligned_nrpages(iova, size);
4712 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4713 	struct iommu_domain_info *info;
4714 	unsigned long i;
4715 
4716 	xa_for_each(&dmar_domain->iommu_array, i, info)
4717 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4718 }
4719 
4720 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4721 {
4722 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4723 	struct iommu_domain *domain;
4724 
4725 	/* Domain type specific cleanup: */
4726 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4727 	if (domain) {
4728 		switch (domain->type) {
4729 		case IOMMU_DOMAIN_SVA:
4730 			intel_svm_remove_dev_pasid(dev, pasid);
4731 			break;
4732 		default:
4733 			/* should never reach here */
4734 			WARN_ON(1);
4735 			break;
4736 		}
4737 	}
4738 
4739 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4740 }
4741 
/*
 * IOMMU operations implemented by this driver. Device-level callbacks
 * live directly in the structure; the per-domain callbacks are supplied
 * through default_domain_ops.
 */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	/*
	 * Only 4K is advertised here, although intel_iommu_map_pages() also
	 * accepts 2M and 1G. NOTE(review): presumably larger sizes are
	 * advertised per domain elsewhere - confirm.
	 */
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Page responses are only handled when SVM support is built in. */
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4771 
4772 static void quirk_iommu_igfx(struct pci_dev *dev)
4773 {
4774 	if (risky_device(dev))
4775 		return;
4776 
4777 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4778 	dmar_map_gfx = 0;
4779 }
4780 
/*
 * G4x/GM45 integrated gfx dmar support is totally busted, so
 * quirk_iommu_igfx is registered as a PCI header fixup for each of
 * these Intel device IDs.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar; same quirk, Broadwell IDs. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4815 
4816 static void quirk_iommu_rwbf(struct pci_dev *dev)
4817 {
4818 	if (risky_device(dev))
4819 		return;
4820 
4821 	/*
4822 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4823 	 * but needs it. Same seems to hold for the desktop versions.
4824 	 */
4825 	pci_info(dev, "Forcing write-buffer flush capability\n");
4826 	rwbf_quirk = 1;
4827 }
4828 
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4836 
4837 #define GGC 0x52
4838 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4839 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4840 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4841 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4842 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4843 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4844 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4845 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4846 
4847 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4848 {
4849 	unsigned short ggc;
4850 
4851 	if (risky_device(dev))
4852 		return;
4853 
4854 	if (pci_read_config_word(dev, GGC, &ggc))
4855 		return;
4856 
4857 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4858 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4859 		dmar_map_gfx = 0;
4860 	} else if (dmar_map_gfx) {
4861 		/* we have to ensure the gfx device is idle before we flush */
4862 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4863 		iommu_set_dma_strict();
4864 	}
4865 }
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4870 
4871 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4872 {
4873 	unsigned short ver;
4874 
4875 	if (!IS_GFX_DEVICE(dev))
4876 		return;
4877 
4878 	ver = (dev->device >> 8) & 0xff;
4879 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4880 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4881 	    ver != 0x9a && ver != 0xa7)
4882 		return;
4883 
4884 	if (risky_device(dev))
4885 		return;
4886 
4887 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4888 	iommu_skip_te_disable = 1;
4889 }
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4891 
4892 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4893    ISOCH DMAR unit for the Azalia sound device, but not give it any
4894    TLB entries, which causes it to deadlock. Check for that.  We do
4895    this in a function called from init_dmars(), instead of in a PCI
4896    quirk, because we don't want to print the obnoxious "BIOS broken"
4897    message if VT-d is actually disabled.
4898 */
4899 static void __init check_tylersburg_isoch(void)
4900 {
4901 	struct pci_dev *pdev;
4902 	uint32_t vtisochctrl;
4903 
4904 	/* If there's no Azalia in the system anyway, forget it. */
4905 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4906 	if (!pdev)
4907 		return;
4908 
4909 	if (risky_device(pdev)) {
4910 		pci_dev_put(pdev);
4911 		return;
4912 	}
4913 
4914 	pci_dev_put(pdev);
4915 
4916 	/* System Management Registers. Might be hidden, in which case
4917 	   we can't do the sanity check. But that's OK, because the
4918 	   known-broken BIOSes _don't_ actually hide it, so far. */
4919 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4920 	if (!pdev)
4921 		return;
4922 
4923 	if (risky_device(pdev)) {
4924 		pci_dev_put(pdev);
4925 		return;
4926 	}
4927 
4928 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4929 		pci_dev_put(pdev);
4930 		return;
4931 	}
4932 
4933 	pci_dev_put(pdev);
4934 
4935 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4936 	if (vtisochctrl & 1)
4937 		return;
4938 
4939 	/* Drop all bits other than the number of TLB entries */
4940 	vtisochctrl &= 0x1c;
4941 
4942 	/* If we have the recommended number of TLB entries (16), fine. */
4943 	if (vtisochctrl == 0x10)
4944 		return;
4945 
4946 	/* Zero TLB entries? You get to ride the short bus to school. */
4947 	if (!vtisochctrl) {
4948 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4949 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4950 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4951 		     dmi_get_system_info(DMI_BIOS_VERSION),
4952 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4953 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4954 		return;
4955 	}
4956 
4957 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4958 	       vtisochctrl);
4959 }
4960 
4961 /*
4962  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4963  * invalidation completion before posted writes initiated with translated address
4964  * that utilized translations matching the invalidation address range, violating
4965  * the invalidation completion ordering.
4966  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4967  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4968  * under the control of the trusted/privileged host device driver must use this
4969  * quirk.
4970  * Device TLBs are invalidated under the following six conditions:
4971  * 1. Device driver does DMA API unmap IOVA
4972  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4973  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4974  *    exit_mmap() due to crash
4975  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4976  *    VM has to free pages that were unmapped
4977  * 5. Userspace driver unmaps a DMA buffer
4978  * 6. Cache invalidation in vSVA usage (upcoming)
4979  *
4980  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4981  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4982  * invalidate TLB the same way as normal user unmap which will use this quirk.
4983  * The dTLB invalidation after PASID cache flush does not need this quirk.
4984  *
4985  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4986  */
4987 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4988 			       unsigned long address, unsigned long mask,
4989 			       u32 pasid, u16 qdep)
4990 {
4991 	u16 sid;
4992 
4993 	if (likely(!info->dtlb_extra_inval))
4994 		return;
4995 
4996 	sid = PCI_DEVID(info->bus, info->devfn);
4997 	if (pasid == PASID_RID2PASID) {
4998 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4999 				   qdep, address, mask);
5000 	} else {
5001 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5002 					 pasid, qdep, address, mask);
5003 	}
5004 }
5005 
5006 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5007 
5008 /*
5009  * Function to submit a command to the enhanced command interface. The
5010  * valid enhanced command descriptions are defined in Table 47 of the
5011  * VT-d spec. The VT-d hardware implementation may support some but not
5012  * all commands, which can be determined by checking the Enhanced
5013  * Command Capability Register.
5014  *
5015  * Return values:
5016  *  - 0: Command successful without any error;
5017  *  - Negative: software error value;
5018  *  - Nonzero positive: failure status code defined in Table 48.
5019  */
5020 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5021 {
5022 	unsigned long flags;
5023 	u64 res;
5024 	int ret;
5025 
5026 	if (!cap_ecmds(iommu->cap))
5027 		return -ENODEV;
5028 
5029 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5030 
5031 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5032 	if (res & DMA_ECMD_ECRSP_IP) {
5033 		ret = -EBUSY;
5034 		goto err;
5035 	}
5036 
5037 	/*
5038 	 * Unconditionally write the operand B, because
5039 	 * - There is no side effect if an ecmd doesn't require an
5040 	 *   operand B, but we set the register to some value.
5041 	 * - It's not invoked in any critical path. The extra MMIO
5042 	 *   write doesn't bring any performance concerns.
5043 	 */
5044 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5045 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5046 
5047 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5048 		      !(res & DMA_ECMD_ECRSP_IP), res);
5049 
5050 	if (res & DMA_ECMD_ECRSP_IP) {
5051 		ret = -ETIMEDOUT;
5052 		goto err;
5053 	}
5054 
5055 	ret = ecmd_get_status_code(res);
5056 err:
5057 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5058 
5059 	return ret;
5060 }
5061