xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision bfa87ac8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
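/*
 * Worked example (assuming VTD_PAGE_SHIFT == 12): for gaw == 48,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1. On a 64-bit kernel
 * DOMAIN_MAX_PFN(48) is the same value; on a 32-bit kernel it is
 * clamped to ULONG_MAX so that PFNs still fit in an unsigned long.
 */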
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
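/*
 * Worked example of the AGAW/level arithmetic above (assuming
 * LEVEL_STRIDE == 9 and VTD_PAGE_SHIFT == 12): agaw 2 gives
 * agaw_to_width() == 48 and agaw_to_level() == 4, i.e. a 4-level page
 * table, and width_to_agaw(48) maps back to 2. For pfn 0x12345,
 * pfn_level_offset() yields index 0x145 at level 1, 0x91 at level 2
 * and 0 at levels 3 and 4, each index being one 9-bit slice of the pfn.
 */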
113 
114 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
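/*
 * Example: with 4KiB MM pages (PAGE_SHIFT == 12) the shift above is 0
 * and mm_to_dma_pfn() is an identity; with 64KiB MM pages each MM pfn
 * corresponds to 16 consecutive 4KiB VT-d pfns.
 */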
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
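/*
 * Summary of the legacy-mode context entry bits manipulated by the
 * helpers above (matching the shifts and masks used in the code):
 *   lo: bit 0      present (P)
 *       bit 1      fault processing disable (FPD)
 *       bits 3:2   translation type (TT)
 *       bits 63:12 address of the second-level page-table root
 *   hi: bits 2:0   address width (AW)
 *       bits 23:8  domain identifier (DID)
 */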
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
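/*
 * The copied_tables bitmap is indexed by ((bus << 8) | devfn), i.e. one
 * bit per possible source-id behind this IOMMU (256 buses * 256 devfns
 * = 65536 bits). A set bit marks a context entry inherited from a
 * previous kernel (kdump) that has not been re-programmed yet.
 */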
237 
238 /*
239  * This domain is a static identity mapping domain.
240  *	1. This domain creates a static 1:1 mapping to all usable memory.
241  *	2. It maps to each iommu if successful.
242  *	3. Each iommu maps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void dmar_remove_one_dev_info(struct device *dev);
281 
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284 
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287 
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292 
293 #define IDENTMAP_GFX		2
294 #define IDENTMAP_AZALIA		4
295 
296 const struct iommu_ops intel_iommu_ops;
297 
298 static bool translation_pre_enabled(struct intel_iommu *iommu)
299 {
300 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
301 }
302 
303 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
306 }
307 
308 static void init_translation_status(struct intel_iommu *iommu)
309 {
310 	u32 gsts;
311 
312 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
313 	if (gsts & DMA_GSTS_TES)
314 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
315 }
316 
317 static int __init intel_iommu_setup(char *str)
318 {
319 	if (!str)
320 		return -EINVAL;
321 
322 	while (*str) {
323 		if (!strncmp(str, "on", 2)) {
324 			dmar_disabled = 0;
325 			pr_info("IOMMU enabled\n");
326 		} else if (!strncmp(str, "off", 3)) {
327 			dmar_disabled = 1;
328 			no_platform_optin = 1;
329 			pr_info("IOMMU disabled\n");
330 		} else if (!strncmp(str, "igfx_off", 8)) {
331 			dmar_map_gfx = 0;
332 			pr_info("Disable GFX device mapping\n");
333 		} else if (!strncmp(str, "forcedac", 8)) {
334 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335 			iommu_dma_forcedac = true;
336 		} else if (!strncmp(str, "strict", 6)) {
337 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338 			iommu_set_dma_strict();
339 		} else if (!strncmp(str, "sp_off", 6)) {
340 			pr_info("Disable supported super page\n");
341 			intel_iommu_superpage = 0;
342 		} else if (!strncmp(str, "sm_on", 5)) {
343 			pr_info("Enable scalable mode if hardware supports\n");
344 			intel_iommu_sm = 1;
345 		} else if (!strncmp(str, "sm_off", 6)) {
346 			pr_info("Scalable mode is disallowed\n");
347 			intel_iommu_sm = 0;
348 		} else if (!strncmp(str, "tboot_noforce", 13)) {
349 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350 			intel_iommu_tboot_noforce = 1;
351 		} else {
352 			pr_notice("Unknown option - '%s'\n", str);
353 		}
354 
355 		str += strcspn(str, ",");
356 		while (*str == ',')
357 			str++;
358 	}
359 
360 	return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
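/*
 * Example: the parser above accepts a comma-separated option list on
 * the kernel command line, e.g. "intel_iommu=on,sm_on,igfx_off" enables
 * the IOMMU, requests scalable mode and disables the GFX device mapping.
 */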
363 
364 void *alloc_pgtable_page(int node)
365 {
366 	struct page *page;
367 	void *vaddr = NULL;
368 
369 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
370 	if (page)
371 		vaddr = page_address(page);
372 	return vaddr;
373 }
374 
375 void free_pgtable_page(void *vaddr)
376 {
377 	free_page((unsigned long)vaddr);
378 }
379 
380 static inline int domain_type_is_si(struct dmar_domain *domain)
381 {
382 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
383 }
384 
385 static inline bool domain_use_first_level(struct dmar_domain *domain)
386 {
387 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
388 }
389 
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 				       unsigned long pfn)
392 {
393 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394 
395 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397 
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 	unsigned long fl_sagaw, sl_sagaw;
406 
407 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 	sl_sagaw = cap_sagaw(iommu->cap);
409 
410 	/* Second level only. */
411 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 		return sl_sagaw;
413 
414 	/* First level only. */
415 	if (!ecap_slts(iommu->ecap))
416 		return fl_sagaw;
417 
418 	return fl_sagaw & sl_sagaw;
419 }
420 
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 	unsigned long sagaw;
424 	int agaw;
425 
426 	sagaw = __iommu_calculate_sagaw(iommu);
427 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 		if (test_bit(agaw, &sagaw))
429 			break;
430 	}
431 
432 	return agaw;
433 }
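/*
 * Example (using the SAGAW encoding referenced above: bit 1 =
 * 3-level/39-bit, bit 2 = 4-level/48-bit, bit 3 = 5-level/57-bit): if
 * second-level hardware reports sagaw 0x4 (4-level only) while first
 * level supports 5-level paging (fl_sagaw == BIT(2) | BIT(3)), the
 * intersection is BIT(2), so __iommu_calculate_agaw() returns agaw 2
 * even when asked for the default 57-bit width.
 */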
434 
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442 
443 /*
444  * Calculate the agaw for each iommu.
445  * "SAGAW" may differ across iommus, so use a default agaw and fall back
446  * to a smaller supported agaw for iommus that don't support the default.
447  */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452 
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 	return sm_supported(iommu) ?
456 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458 
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 	struct iommu_domain_info *info;
462 	struct dmar_drhd_unit *drhd;
463 	struct intel_iommu *iommu;
464 	bool found = false;
465 	unsigned long i;
466 
467 	domain->iommu_coherency = true;
468 	xa_for_each(&domain->iommu_array, i, info) {
469 		found = true;
470 		if (!iommu_paging_structure_coherency(info->iommu)) {
471 			domain->iommu_coherency = false;
472 			break;
473 		}
474 	}
475 	if (found)
476 		return;
477 
478 	/* No hardware attached; use lowest common denominator */
479 	rcu_read_lock();
480 	for_each_active_iommu(iommu, drhd) {
481 		if (!iommu_paging_structure_coherency(iommu)) {
482 			domain->iommu_coherency = false;
483 			break;
484 		}
485 	}
486 	rcu_read_unlock();
487 }
488 
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 					 struct intel_iommu *skip)
491 {
492 	struct dmar_drhd_unit *drhd;
493 	struct intel_iommu *iommu;
494 	int mask = 0x3;
495 
496 	if (!intel_iommu_superpage)
497 		return 0;
498 
499 	/* set iommu_superpage to the smallest common denominator */
500 	rcu_read_lock();
501 	for_each_active_iommu(iommu, drhd) {
502 		if (iommu != skip) {
503 			if (domain && domain_use_first_level(domain)) {
504 				if (!cap_fl1gp_support(iommu->cap))
505 					mask = 0x1;
506 			} else {
507 				mask &= cap_super_page_val(iommu->cap);
508 			}
509 
510 			if (!mask)
511 				break;
512 		}
513 	}
514 	rcu_read_unlock();
515 
516 	return fls(mask);
517 }
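/*
 * The returned value is the number of supported super-page levels:
 * fls(0x3) == 2 (2MiB and 1GiB), fls(0x1) == 1 (2MiB only) and
 * fls(0) == 0 (none). It is consumed by domain_super_pgsize_bitmap()
 * below.
 */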
518 
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 	struct device_domain_info *info;
522 	int nid = NUMA_NO_NODE;
523 	unsigned long flags;
524 
525 	spin_lock_irqsave(&domain->lock, flags);
526 	list_for_each_entry(info, &domain->devices, link) {
527 		/*
528 		 * There could be multiple device NUMA nodes as devices within
529 		 * the same domain may sit behind different IOMMUs. There is no
530 		 * perfect answer in such a situation, so we use a first-come,
531 		 * first-served policy.
532 		 */
533 		nid = dev_to_node(info->dev);
534 		if (nid != NUMA_NO_NODE)
535 			break;
536 	}
537 	spin_unlock_irqrestore(&domain->lock, flags);
538 
539 	return nid;
540 }
541 
542 static void domain_update_iotlb(struct dmar_domain *domain);
543 
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 	unsigned long bitmap = 0;
548 
549 	/*
550 	 * 1-level super page supports page size of 2MiB, 2-level super page
551 	 * supports page size of both 2MiB and 1GiB.
552 	 */
553 	if (domain->iommu_superpage == 1)
554 		bitmap |= SZ_2M;
555 	else if (domain->iommu_superpage == 2)
556 		bitmap |= SZ_2M | SZ_1G;
557 
558 	return bitmap;
559 }
560 
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 	domain_update_iommu_coherency(domain);
565 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566 
567 	/*
568 	 * If RHSA is missing, we should default to the device NUMA node
569 	 * as a fallback.
570 	 */
571 	if (domain->nid == NUMA_NO_NODE)
572 		domain->nid = domain_update_device_node(domain);
573 
574 	/*
575 	 * First-level translation restricts the input-address to a
576 	 * canonical address (i.e., address bits 63:N have the same
577 	 * value as address bit [N-1], where N is 48-bits with 4-level
578 	 * paging and 57-bits with 5-level paging). Hence, skip bit
579 	 * [N-1].
580 	 */
581 	if (domain_use_first_level(domain))
582 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 	else
584 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585 
586 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 	domain_update_iotlb(domain);
588 }
589 
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 					 u8 devfn, int alloc)
592 {
593 	struct root_entry *root = &iommu->root_entry[bus];
594 	struct context_entry *context;
595 	u64 *entry;
596 
597 	/*
598 	 * Unless the caller requested to allocate a new entry, returning
599 	 * a copied context entry makes no sense.
600 	 */
601 	if (!alloc && context_copied(iommu, bus, devfn))
602 		return NULL;
603 
604 	entry = &root->lo;
605 	if (sm_supported(iommu)) {
606 		if (devfn >= 0x80) {
607 			devfn -= 0x80;
608 			entry = &root->hi;
609 		}
610 		devfn *= 2;
611 	}
612 	if (*entry & 1)
613 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 	else {
615 		unsigned long phy_addr;
616 		if (!alloc)
617 			return NULL;
618 
619 		context = alloc_pgtable_page(iommu->node);
620 		if (!context)
621 			return NULL;
622 
623 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 		phy_addr = virt_to_phys((void *)context);
625 		*entry = phy_addr | 1;
626 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
627 	}
628 	return &context[devfn];
629 }
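/*
 * In scalable mode a root entry carries two context-table pointers:
 * root->lo covers devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff,
 * and each scalable-mode context entry is twice the size of a legacy
 * context_entry, which is why the code above rebases devfn into the
 * chosen table and then doubles it before indexing the array of
 * legacy-sized context_entry structs.
 */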
630 
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *				 sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 	struct pci_dev *pdev, *pbridge;
643 
644 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 		return false;
646 
647 	pdev = to_pci_dev(dev);
648 	pbridge = to_pci_dev(bridge);
649 
650 	if (pbridge->subordinate &&
651 	    pbridge->subordinate->number <= pdev->bus->number &&
652 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 		return true;
654 
655 	return false;
656 }
657 
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 	struct dmar_drhd_unit *drhd;
661 	u32 vtbar;
662 	int rc;
663 
664 	/* We know that this device on this chipset has its own IOMMU.
665 	 * If we find it under a different IOMMU, then the BIOS is lying
666 	 * to us. Hope that the IOMMU for this device is actually
667 	 * disabled, and it needs no translation...
668 	 */
669 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 	if (rc) {
671 		/* "can't" happen */
672 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 		return false;
674 	}
675 	vtbar &= 0xffff0000;
676 
677 	/* we know that this iommu should be at offset 0xa000 from vtbar */
678 	drhd = dmar_find_matched_drhd_unit(pdev);
679 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 		return true;
683 	}
684 
685 	return false;
686 }
687 
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 	if (!iommu || iommu->drhd->ignored)
691 		return true;
692 
693 	if (dev_is_pci(dev)) {
694 		struct pci_dev *pdev = to_pci_dev(dev);
695 
696 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 		    quirk_ioat_snb_local_iommu(pdev))
699 			return true;
700 	}
701 
702 	return false;
703 }
704 
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 	struct dmar_drhd_unit *drhd = NULL;
708 	struct pci_dev *pdev = NULL;
709 	struct intel_iommu *iommu;
710 	struct device *tmp;
711 	u16 segment = 0;
712 	int i;
713 
714 	if (!dev)
715 		return NULL;
716 
717 	if (dev_is_pci(dev)) {
718 		struct pci_dev *pf_pdev;
719 
720 		pdev = pci_real_dma_dev(to_pci_dev(dev));
721 
722 		/* VFs aren't listed in scope tables; we need to look up
723 		 * the PF instead to find the IOMMU. */
724 		pf_pdev = pci_physfn(pdev);
725 		dev = &pf_pdev->dev;
726 		segment = pci_domain_nr(pdev->bus);
727 	} else if (has_acpi_companion(dev))
728 		dev = &ACPI_COMPANION(dev)->dev;
729 
730 	rcu_read_lock();
731 	for_each_iommu(iommu, drhd) {
732 		if (pdev && segment != drhd->segment)
733 			continue;
734 
735 		for_each_active_dev_scope(drhd->devices,
736 					  drhd->devices_cnt, i, tmp) {
737 			if (tmp == dev) {
738 				/* For a VF use its original BDF# not that of the PF
739 				 * which we used for the IOMMU lookup. Strictly speaking
740 				 * we could do this for all PCI devices; we only need to
741 				 * get the BDF# from the scope table for ACPI matches. */
742 				if (pdev && pdev->is_virtfn)
743 					goto got_pdev;
744 
745 				if (bus && devfn) {
746 					*bus = drhd->devices[i].bus;
747 					*devfn = drhd->devices[i].devfn;
748 				}
749 				goto out;
750 			}
751 
752 			if (is_downstream_to_pci_bridge(dev, tmp))
753 				goto got_pdev;
754 		}
755 
756 		if (pdev && drhd->include_all) {
757 got_pdev:
758 			if (bus && devfn) {
759 				*bus = pdev->bus->number;
760 				*devfn = pdev->devfn;
761 			}
762 			goto out;
763 		}
764 	}
765 	iommu = NULL;
766 out:
767 	if (iommu_is_dummy(iommu, dev))
768 		iommu = NULL;
769 
770 	rcu_read_unlock();
771 
772 	return iommu;
773 }
774 
775 static void domain_flush_cache(struct dmar_domain *domain,
776 			       void *addr, int size)
777 {
778 	if (!domain->iommu_coherency)
779 		clflush_cache_range(addr, size);
780 }
781 
782 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784 	struct context_entry *context;
785 	int ret = 0;
786 
787 	spin_lock(&iommu->lock);
788 	context = iommu_context_addr(iommu, bus, devfn, 0);
789 	if (context)
790 		ret = context_present(context);
791 	spin_unlock(&iommu->lock);
792 	return ret;
793 }
794 
795 static void free_context_table(struct intel_iommu *iommu)
796 {
797 	struct context_entry *context;
798 	int i;
799 
800 	if (!iommu->root_entry)
801 		return;
802 
803 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
804 		context = iommu_context_addr(iommu, i, 0, 0);
805 		if (context)
806 			free_pgtable_page(context);
807 
808 		if (!sm_supported(iommu))
809 			continue;
810 
811 		context = iommu_context_addr(iommu, i, 0x80, 0);
812 		if (context)
813 			free_pgtable_page(context);
814 	}
815 
816 	free_pgtable_page(iommu->root_entry);
817 	iommu->root_entry = NULL;
818 }
819 
820 #ifdef CONFIG_DMAR_DEBUG
821 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
822 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824 	struct dma_pte *pte;
825 	int offset;
826 
827 	while (1) {
828 		offset = pfn_level_offset(pfn, level);
829 		pte = &parent[offset];
830 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831 			pr_info("PTE not present at level %d\n", level);
832 			break;
833 		}
834 
835 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836 
837 		if (level == 1)
838 			break;
839 
840 		parent = phys_to_virt(dma_pte_addr(pte));
841 		level--;
842 	}
843 }
844 
845 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
846 			  unsigned long long addr, u32 pasid)
847 {
848 	struct pasid_dir_entry *dir, *pde;
849 	struct pasid_entry *entries, *pte;
850 	struct context_entry *ctx_entry;
851 	struct root_entry *rt_entry;
852 	int i, dir_index, index, level;
853 	u8 devfn = source_id & 0xff;
854 	u8 bus = source_id >> 8;
855 	struct dma_pte *pgtable;
856 
857 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858 
859 	/* root entry dump */
860 	rt_entry = &iommu->root_entry[bus];
861 	if (!rt_entry) {
862 		pr_info("root table entry is not present\n");
863 		return;
864 	}
865 
866 	if (sm_supported(iommu))
867 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868 			rt_entry->hi, rt_entry->lo);
869 	else
870 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
871 
872 	/* context entry dump */
873 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874 	if (!ctx_entry) {
875 		pr_info("context table entry is not present\n");
876 		return;
877 	}
878 
879 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880 		ctx_entry->hi, ctx_entry->lo);
881 
882 	/* legacy mode does not require PASID entries */
883 	if (!sm_supported(iommu)) {
884 		level = agaw_to_level(ctx_entry->hi & 7);
885 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886 		goto pgtable_walk;
887 	}
888 
889 	/* get the pointer to pasid directory entry */
890 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891 	if (!dir) {
892 		pr_info("pasid directory entry is not present\n");
893 		return;
894 	}
895 	/* For request-without-pasid, get the pasid from context entry */
896 	if (intel_iommu_sm && pasid == INVALID_IOASID)
897 		pasid = PASID_RID2PASID;
898 
899 	dir_index = pasid >> PASID_PDE_SHIFT;
900 	pde = &dir[dir_index];
901 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902 
903 	/* get the pointer to the pasid table entry */
904 	entries = get_pasid_table_from_pde(pde);
905 	if (!entries) {
906 		pr_info("pasid table entry is not present\n");
907 		return;
908 	}
909 	index = pasid & PASID_PTE_MASK;
910 	pte = &entries[index];
911 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 				      unsigned long pfn, int *target_level)
929 {
930 	struct dma_pte *parent, *pte;
931 	int level = agaw_to_level(domain->agaw);
932 	int offset;
933 
934 	BUG_ON(!domain->pgd);
935 
936 	if (!domain_pfn_supported(domain, pfn))
937 		/* Address beyond IOMMU's addressing capabilities. */
938 		return NULL;
939 
940 	parent = domain->pgd;
941 
942 	while (1) {
943 		void *tmp_page;
944 
945 		offset = pfn_level_offset(pfn, level);
946 		pte = &parent[offset];
947 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948 			break;
949 		if (level == *target_level)
950 			break;
951 
952 		if (!dma_pte_present(pte)) {
953 			uint64_t pteval;
954 
955 			tmp_page = alloc_pgtable_page(domain->nid);
956 
957 			if (!tmp_page)
958 				return NULL;
959 
960 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962 			if (domain_use_first_level(domain))
963 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
964 
965 			if (cmpxchg64(&pte->val, 0ULL, pteval))
966 				/* Someone else set it while we were thinking; use theirs. */
967 				free_pgtable_page(tmp_page);
968 			else
969 				domain_flush_cache(domain, pte, sizeof(*pte));
970 		}
971 		if (level == 1)
972 			break;
973 
974 		parent = phys_to_virt(dma_pte_addr(pte));
975 		level--;
976 	}
977 
978 	if (!*target_level)
979 		*target_level = level;
980 
981 	return pte;
982 }
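/*
 * *target_level selects the depth of the returned PTE: 1 asks for a
 * 4KiB leaf, 2 for a 2MiB-capable entry, and 0 means "whatever leaf
 * currently exists", in which case the walk stops at the first
 * non-present or super-page entry and *target_level is updated to the
 * level actually reached. Intermediate tables are allocated on demand
 * only when a specific level is requested.
 */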
983 
984 /* return address's pte at specific level */
985 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
986 					 unsigned long pfn,
987 					 int level, int *large_page)
988 {
989 	struct dma_pte *parent, *pte;
990 	int total = agaw_to_level(domain->agaw);
991 	int offset;
992 
993 	parent = domain->pgd;
994 	while (level <= total) {
995 		offset = pfn_level_offset(pfn, total);
996 		pte = &parent[offset];
997 		if (level == total)
998 			return pte;
999 
1000 		if (!dma_pte_present(pte)) {
1001 			*large_page = total;
1002 			break;
1003 		}
1004 
1005 		if (dma_pte_superpage(pte)) {
1006 			*large_page = total;
1007 			return pte;
1008 		}
1009 
1010 		parent = phys_to_virt(dma_pte_addr(pte));
1011 		total--;
1012 	}
1013 	return NULL;
1014 }
1015 
1016 /* clear last level pte, a tlb flush should follow */
1017 static void dma_pte_clear_range(struct dmar_domain *domain,
1018 				unsigned long start_pfn,
1019 				unsigned long last_pfn)
1020 {
1021 	unsigned int large_page;
1022 	struct dma_pte *first_pte, *pte;
1023 
1024 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1025 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1026 	BUG_ON(start_pfn > last_pfn);
1027 
1028 	/* we don't need lock here; nobody else touches the iova range */
1029 	do {
1030 		large_page = 1;
1031 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1032 		if (!pte) {
1033 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1034 			continue;
1035 		}
1036 		do {
1037 			dma_clear_pte(pte);
1038 			start_pfn += lvl_to_nr_pages(large_page);
1039 			pte++;
1040 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1041 
1042 		domain_flush_cache(domain, first_pte,
1043 				   (void *)pte - (void *)first_pte);
1044 
1045 	} while (start_pfn && start_pfn <= last_pfn);
1046 }
1047 
1048 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1049 			       int retain_level, struct dma_pte *pte,
1050 			       unsigned long pfn, unsigned long start_pfn,
1051 			       unsigned long last_pfn)
1052 {
1053 	pfn = max(start_pfn, pfn);
1054 	pte = &pte[pfn_level_offset(pfn, level)];
1055 
1056 	do {
1057 		unsigned long level_pfn;
1058 		struct dma_pte *level_pte;
1059 
1060 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1061 			goto next;
1062 
1063 		level_pfn = pfn & level_mask(level);
1064 		level_pte = phys_to_virt(dma_pte_addr(pte));
1065 
1066 		if (level > 2) {
1067 			dma_pte_free_level(domain, level - 1, retain_level,
1068 					   level_pte, level_pfn, start_pfn,
1069 					   last_pfn);
1070 		}
1071 
1072 		/*
1073 		 * Free the page table if we're below the level we want to
1074 		 * retain and the range covers the entire table.
1075 		 */
1076 		if (level < retain_level && !(start_pfn > level_pfn ||
1077 		      last_pfn < level_pfn + level_size(level) - 1)) {
1078 			dma_clear_pte(pte);
1079 			domain_flush_cache(domain, pte, sizeof(*pte));
1080 			free_pgtable_page(level_pte);
1081 		}
1082 next:
1083 		pfn += level_size(level);
1084 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1085 }
1086 
1087 /*
1088  * clear last level (leaf) ptes and free page table pages below the
1089  * level we wish to keep intact.
1090  */
1091 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1092 				   unsigned long start_pfn,
1093 				   unsigned long last_pfn,
1094 				   int retain_level)
1095 {
1096 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1097 
1098 	/* We don't need lock here; nobody else touches the iova range */
1099 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1100 			   domain->pgd, 0, start_pfn, last_pfn);
1101 
1102 	/* free pgd */
1103 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1104 		free_pgtable_page(domain->pgd);
1105 		domain->pgd = NULL;
1106 	}
1107 }
1108 
1109 /* When a page at a given level is being unlinked from its parent, we don't
1110    need to *modify* it at all. All we need to do is make a list of all the
1111    pages which can be freed just as soon as we've flushed the IOTLB and we
1112    know the hardware page-walk will no longer touch them.
1113    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1114    be freed. */
1115 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1116 				    int level, struct dma_pte *pte,
1117 				    struct list_head *freelist)
1118 {
1119 	struct page *pg;
1120 
1121 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1122 	list_add_tail(&pg->lru, freelist);
1123 
1124 	if (level == 1)
1125 		return;
1126 
1127 	pte = page_address(pg);
1128 	do {
1129 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1130 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1131 		pte++;
1132 	} while (!first_pte_in_page(pte));
1133 }
1134 
1135 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1136 				struct dma_pte *pte, unsigned long pfn,
1137 				unsigned long start_pfn, unsigned long last_pfn,
1138 				struct list_head *freelist)
1139 {
1140 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1141 
1142 	pfn = max(start_pfn, pfn);
1143 	pte = &pte[pfn_level_offset(pfn, level)];
1144 
1145 	do {
1146 		unsigned long level_pfn = pfn & level_mask(level);
1147 
1148 		if (!dma_pte_present(pte))
1149 			goto next;
1150 
1151 		/* If range covers entire pagetable, free it */
1152 		if (start_pfn <= level_pfn &&
1153 		    last_pfn >= level_pfn + level_size(level) - 1) {
1154 			/* These subordinate page tables are going away entirely. Don't
1155 			   bother to clear them; we're just going to *free* them. */
1156 			if (level > 1 && !dma_pte_superpage(pte))
1157 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1158 
1159 			dma_clear_pte(pte);
1160 			if (!first_pte)
1161 				first_pte = pte;
1162 			last_pte = pte;
1163 		} else if (level > 1) {
1164 			/* Recurse down into a level that isn't *entirely* obsolete */
1165 			dma_pte_clear_level(domain, level - 1,
1166 					    phys_to_virt(dma_pte_addr(pte)),
1167 					    level_pfn, start_pfn, last_pfn,
1168 					    freelist);
1169 		}
1170 next:
1171 		pfn = level_pfn + level_size(level);
1172 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1173 
1174 	if (first_pte)
1175 		domain_flush_cache(domain, first_pte,
1176 				   (void *)++last_pte - (void *)first_pte);
1177 }
1178 
1179 /* We can't just free the pages because the IOMMU may still be walking
1180    the page tables, and may have cached the intermediate levels. The
1181    pages can only be freed after the IOTLB flush has been done. */
1182 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1183 			 unsigned long last_pfn, struct list_head *freelist)
1184 {
1185 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1186 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1187 	BUG_ON(start_pfn > last_pfn);
1188 
1189 	/* we don't need lock here; nobody else touches the iova range */
1190 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1191 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1192 
1193 	/* free pgd */
1194 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1195 		struct page *pgd_page = virt_to_page(domain->pgd);
1196 		list_add_tail(&pgd_page->lru, freelist);
1197 		domain->pgd = NULL;
1198 	}
1199 }
1200 
1201 /* iommu handling */
1202 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1203 {
1204 	struct root_entry *root;
1205 
1206 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1207 	if (!root) {
1208 		pr_err("Allocating root entry for %s failed\n",
1209 			iommu->name);
1210 		return -ENOMEM;
1211 	}
1212 
1213 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1214 	iommu->root_entry = root;
1215 
1216 	return 0;
1217 }
1218 
1219 static void iommu_set_root_entry(struct intel_iommu *iommu)
1220 {
1221 	u64 addr;
1222 	u32 sts;
1223 	unsigned long flag;
1224 
1225 	addr = virt_to_phys(iommu->root_entry);
1226 	if (sm_supported(iommu))
1227 		addr |= DMA_RTADDR_SMT;
1228 
1229 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1230 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1231 
1232 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1233 
1234 	/* Make sure hardware complete it */
1235 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1236 		      readl, (sts & DMA_GSTS_RTPS), sts);
1237 
1238 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 
1240 	/*
1241 	 * Hardware invalidates all DMA remapping hardware translation
1242 	 * caches as part of SRTP flow.
1243 	 */
1244 	if (cap_esrtps(iommu->cap))
1245 		return;
1246 
1247 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1248 	if (sm_supported(iommu))
1249 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1250 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1251 }
1252 
1253 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1254 {
1255 	u32 val;
1256 	unsigned long flag;
1257 
1258 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1259 		return;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1262 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1263 
1264 	/* Make sure hardware complete it */
1265 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1266 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1267 
1268 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1269 }
1270 
1271 /* return value determines if we need a write buffer flush */
1272 static void __iommu_flush_context(struct intel_iommu *iommu,
1273 				  u16 did, u16 source_id, u8 function_mask,
1274 				  u64 type)
1275 {
1276 	u64 val = 0;
1277 	unsigned long flag;
1278 
1279 	switch (type) {
1280 	case DMA_CCMD_GLOBAL_INVL:
1281 		val = DMA_CCMD_GLOBAL_INVL;
1282 		break;
1283 	case DMA_CCMD_DOMAIN_INVL:
1284 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1285 		break;
1286 	case DMA_CCMD_DEVICE_INVL:
1287 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1288 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1289 		break;
1290 	default:
1291 		BUG();
1292 	}
1293 	val |= DMA_CCMD_ICC;
1294 
1295 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1296 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1297 
1298 	/* Make sure hardware complete it */
1299 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1300 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1301 
1302 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1303 }
1304 
1305 /* return value determines if we need a write buffer flush */
1306 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1307 				u64 addr, unsigned int size_order, u64 type)
1308 {
1309 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1310 	u64 val = 0, val_iva = 0;
1311 	unsigned long flag;
1312 
1313 	switch (type) {
1314 	case DMA_TLB_GLOBAL_FLUSH:
1315 		/* global flush doesn't need to set IVA_REG */
1316 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1317 		break;
1318 	case DMA_TLB_DSI_FLUSH:
1319 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1320 		break;
1321 	case DMA_TLB_PSI_FLUSH:
1322 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1323 		/* IH bit is passed in as part of address */
1324 		val_iva = size_order | addr;
1325 		break;
1326 	default:
1327 		BUG();
1328 	}
1329 	/* Note: set drain read/write */
1330 #if 0
1331 	/*
1332 	 * This is probably only meant to be extra safe. Looks like we can
1333 	 * ignore it without any impact.
1334 	 */
1335 	if (cap_read_drain(iommu->cap))
1336 		val |= DMA_TLB_READ_DRAIN;
1337 #endif
1338 	if (cap_write_drain(iommu->cap))
1339 		val |= DMA_TLB_WRITE_DRAIN;
1340 
1341 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342 	/* Note: Only uses first TLB reg currently */
1343 	if (val_iva)
1344 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1345 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1346 
1347 	/* Make sure hardware complete it */
1348 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1349 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1350 
1351 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1352 
1353 	/* check IOTLB invalidation granularity */
1354 	if (DMA_TLB_IAIG(val) == 0)
1355 		pr_err("Flush IOTLB failed\n");
1356 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1357 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1358 			(unsigned long long)DMA_TLB_IIRG(type),
1359 			(unsigned long long)DMA_TLB_IAIG(val));
1360 }
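/*
 * For a page-selective flush the IVA register value combines the
 * page-aligned address with the address-mask order in its low bits:
 * e.g. flushing 16 pages starting at pfn 0x1000 uses addr == 0x1000000
 * and size_order == 4, telling hardware to ignore the low 4 pfn bits.
 */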
1361 
1362 static struct device_domain_info *
1363 domain_lookup_dev_info(struct dmar_domain *domain,
1364 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1365 {
1366 	struct device_domain_info *info;
1367 	unsigned long flags;
1368 
1369 	spin_lock_irqsave(&domain->lock, flags);
1370 	list_for_each_entry(info, &domain->devices, link) {
1371 		if (info->iommu == iommu && info->bus == bus &&
1372 		    info->devfn == devfn) {
1373 			spin_unlock_irqrestore(&domain->lock, flags);
1374 			return info;
1375 		}
1376 	}
1377 	spin_unlock_irqrestore(&domain->lock, flags);
1378 
1379 	return NULL;
1380 }
1381 
1382 static void domain_update_iotlb(struct dmar_domain *domain)
1383 {
1384 	struct device_domain_info *info;
1385 	bool has_iotlb_device = false;
1386 	unsigned long flags;
1387 
1388 	spin_lock_irqsave(&domain->lock, flags);
1389 	list_for_each_entry(info, &domain->devices, link) {
1390 		if (info->ats_enabled) {
1391 			has_iotlb_device = true;
1392 			break;
1393 		}
1394 	}
1395 	domain->has_iotlb_device = has_iotlb_device;
1396 	spin_unlock_irqrestore(&domain->lock, flags);
1397 }
1398 
1399 static void iommu_enable_pci_caps(struct device_domain_info *info)
1400 {
1401 	struct pci_dev *pdev;
1402 
1403 	if (!info || !dev_is_pci(info->dev))
1404 		return;
1405 
1406 	pdev = to_pci_dev(info->dev);
1407 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1408 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1409 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1410 	 * reserved, which should be set to 0.
1411 	 */
1412 	if (!ecap_dit(info->iommu->ecap))
1413 		info->pfsid = 0;
1414 	else {
1415 		struct pci_dev *pf_pdev;
1416 
1417 		/* pdev will be returned if device is not a VF */
1418 		pf_pdev = pci_physfn(pdev);
1419 		info->pfsid = pci_dev_id(pf_pdev);
1420 	}
1421 
1422 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1423 	   the device if you enable PASID support after ATS support is
1424 	   undefined. So always enable PASID support on devices which
1425 	   have it, even if we can't yet know if we're ever going to
1426 	   use it. */
1427 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1428 		info->pasid_enabled = 1;
1429 
1430 	if (info->pri_supported &&
1431 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1432 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1433 		info->pri_enabled = 1;
1434 
1435 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1436 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1437 		info->ats_enabled = 1;
1438 		domain_update_iotlb(info->domain);
1439 		info->ats_qdep = pci_ats_queue_depth(pdev);
1440 	}
1441 }
1442 
1443 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1444 {
1445 	struct pci_dev *pdev;
1446 
1447 	if (!dev_is_pci(info->dev))
1448 		return;
1449 
1450 	pdev = to_pci_dev(info->dev);
1451 
1452 	if (info->ats_enabled) {
1453 		pci_disable_ats(pdev);
1454 		info->ats_enabled = 0;
1455 		domain_update_iotlb(info->domain);
1456 	}
1457 
1458 	if (info->pri_enabled) {
1459 		pci_disable_pri(pdev);
1460 		info->pri_enabled = 0;
1461 	}
1462 
1463 	if (info->pasid_enabled) {
1464 		pci_disable_pasid(pdev);
1465 		info->pasid_enabled = 0;
1466 	}
1467 }
1468 
1469 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1470 				    u64 addr, unsigned int mask)
1471 {
1472 	u16 sid, qdep;
1473 
1474 	if (!info || !info->ats_enabled)
1475 		return;
1476 
1477 	sid = info->bus << 8 | info->devfn;
1478 	qdep = info->ats_qdep;
1479 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1480 			   qdep, addr, mask);
1481 }
1482 
1483 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1484 				  u64 addr, unsigned mask)
1485 {
1486 	struct device_domain_info *info;
1487 	unsigned long flags;
1488 
1489 	if (!domain->has_iotlb_device)
1490 		return;
1491 
1492 	spin_lock_irqsave(&domain->lock, flags);
1493 	list_for_each_entry(info, &domain->devices, link)
1494 		__iommu_flush_dev_iotlb(info, addr, mask);
1495 	spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497 
1498 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1499 				  struct dmar_domain *domain,
1500 				  unsigned long pfn, unsigned int pages,
1501 				  int ih, int map)
1502 {
1503 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1504 	unsigned int mask = ilog2(aligned_pages);
1505 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1506 	u16 did = domain_id_iommu(domain, iommu);
1507 
1508 	BUG_ON(pages == 0);
1509 
1510 	if (ih)
1511 		ih = 1 << 6;
1512 
1513 	if (domain_use_first_level(domain)) {
1514 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1515 	} else {
1516 		unsigned long bitmask = aligned_pages - 1;
1517 
1518 		/*
1519 		 * PSI masks the low order bits of the base address. If the
1520 		 * address isn't aligned to the mask, then compute a mask value
1521 		 * needed to ensure the target range is flushed.
1522 		 */
1523 		if (unlikely(bitmask & pfn)) {
1524 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1525 
1526 			/*
1527 			 * Since end_pfn <= pfn + bitmask, the only way bits
1528 			 * higher than bitmask can differ in pfn and end_pfn is
1529 			 * by carrying. This means after masking out bitmask,
1530 			 * high bits starting with the first set bit in
1531 			 * shared_bits are all equal in both pfn and end_pfn.
1532 			 */
1533 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1534 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1535 		}
1536 
1537 		/*
1538 		 * Fallback to domain selective flush if no PSI support or
1539 		 * the size is too big.
1540 		 */
1541 		if (!cap_pgsel_inv(iommu->cap) ||
1542 		    mask > cap_max_amask_val(iommu->cap))
1543 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1544 							DMA_TLB_DSI_FLUSH);
1545 		else
1546 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1547 							DMA_TLB_PSI_FLUSH);
1548 	}
1549 
1550 	/*
1551 	 * In caching mode, changes of pages from non-present to present require
1552 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1553 	 */
1554 	if (!cap_caching_mode(iommu->cap) || !map)
1555 		iommu_flush_dev_iotlb(domain, addr, mask);
1556 }
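/*
 * Worked example of the unaligned-PSI fixup above: pfn == 9, pages == 2
 * gives aligned_pages == 2 and bitmask == 1, which overlaps pfn. Then
 * end_pfn == 10, pfn ^ end_pfn == 0x3, shared_bits == ~0x3 & ~0x1, so
 * __ffs(shared_bits) == 2 and mask becomes 2: hardware flushes the
 * naturally aligned 4-page region starting at pfn 8, which covers
 * pfns 9 and 10.
 */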
1557 
1558 /* Notification for newly created mappings */
1559 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1560 					struct dmar_domain *domain,
1561 					unsigned long pfn, unsigned int pages)
1562 {
1563 	/*
1564 	 * It's a non-present to present mapping. Only flush if caching mode
1565 	 * is enabled and second-level translation is in use.
1566 	 */
1567 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1568 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1569 	else
1570 		iommu_flush_write_buffer(iommu);
1571 }
1572 
1573 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1574 {
1575 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1576 	struct iommu_domain_info *info;
1577 	unsigned long idx;
1578 
1579 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1580 		struct intel_iommu *iommu = info->iommu;
1581 		u16 did = domain_id_iommu(dmar_domain, iommu);
1582 
1583 		if (domain_use_first_level(dmar_domain))
1584 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1585 		else
1586 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587 						 DMA_TLB_DSI_FLUSH);
1588 
1589 		if (!cap_caching_mode(iommu->cap))
1590 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1591 	}
1592 }
1593 
1594 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1595 {
1596 	u32 pmen;
1597 	unsigned long flags;
1598 
1599 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1600 		return;
1601 
1602 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1603 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1604 	pmen &= ~DMA_PMEN_EPM;
1605 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1606 
1607 	/* wait for the protected region status bit to clear */
1608 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1609 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1610 
1611 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1612 }
1613 
1614 static void iommu_enable_translation(struct intel_iommu *iommu)
1615 {
1616 	u32 sts;
1617 	unsigned long flags;
1618 
1619 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620 	iommu->gcmd |= DMA_GCMD_TE;
1621 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1622 
1623 	/* Make sure hardware complete it */
1624 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1625 		      readl, (sts & DMA_GSTS_TES), sts);
1626 
1627 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1628 }
1629 
1630 static void iommu_disable_translation(struct intel_iommu *iommu)
1631 {
1632 	u32 sts;
1633 	unsigned long flag;
1634 
1635 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1636 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1637 		return;
1638 
1639 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1640 	iommu->gcmd &= ~DMA_GCMD_TE;
1641 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1642 
1643 	/* Make sure hardware complete it */
1644 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1645 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1646 
1647 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1648 }
1649 
1650 static int iommu_init_domains(struct intel_iommu *iommu)
1651 {
1652 	u32 ndomains;
1653 
1654 	ndomains = cap_ndoms(iommu->cap);
1655 	pr_debug("%s: Number of Domains supported <%d>\n",
1656 		 iommu->name, ndomains);
1657 
1658 	spin_lock_init(&iommu->lock);
1659 
1660 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1661 	if (!iommu->domain_ids)
1662 		return -ENOMEM;
1663 
1664 	/*
1665 	 * If Caching mode is set, then invalid translations are tagged
1666 	 * with domain-id 0, hence we need to pre-allocate it. We also
1667 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1668 	 * make sure it is not used for a real domain.
1669 	 */
1670 	set_bit(0, iommu->domain_ids);
1671 
1672 	/*
1673 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1674 	 * entry for first-level or pass-through translation modes should
1675 	 * be programmed with a domain id different from those used for
1676 	 * second-level or nested translation. We reserve a domain id for
1677 	 * this purpose.
1678 	 */
1679 	if (sm_supported(iommu))
1680 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1681 
1682 	return 0;
1683 }
1684 
1685 static void disable_dmar_iommu(struct intel_iommu *iommu)
1686 {
1687 	if (!iommu->domain_ids)
1688 		return;
1689 
1690 	/*
1691 	 * All iommu domains must have been detached from the devices,
1692 	 * hence there should be no domain IDs in use.
1693 	 */
1694 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1695 		    > NUM_RESERVED_DID))
1696 		return;
1697 
1698 	if (iommu->gcmd & DMA_GCMD_TE)
1699 		iommu_disable_translation(iommu);
1700 }
1701 
1702 static void free_dmar_iommu(struct intel_iommu *iommu)
1703 {
1704 	if (iommu->domain_ids) {
1705 		bitmap_free(iommu->domain_ids);
1706 		iommu->domain_ids = NULL;
1707 	}
1708 
1709 	if (iommu->copied_tables) {
1710 		bitmap_free(iommu->copied_tables);
1711 		iommu->copied_tables = NULL;
1712 	}
1713 
1714 	/* free context mapping */
1715 	free_context_table(iommu);
1716 
1717 #ifdef CONFIG_INTEL_IOMMU_SVM
1718 	if (pasid_supported(iommu)) {
1719 		if (ecap_prs(iommu->ecap))
1720 			intel_svm_finish_prq(iommu);
1721 	}
1722 	if (vccap_pasid(iommu->vccap))
1723 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1724 
1725 #endif
1726 }
1727 
1728 /*
1729  * Check and return whether first level is used by default for
1730  * DMA translation.
1731  */
1732 static bool first_level_by_default(unsigned int type)
1733 {
1734 	/* Only SL is available in legacy mode */
1735 	if (!scalable_mode_support())
1736 		return false;
1737 
1738 	/* Only one level (either FL or SL) is available, just use it */
1739 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1740 		return intel_cap_flts_sanity();
1741 
1742 	/* Both levels are available, decide it based on domain type */
1743 	return type != IOMMU_DOMAIN_UNMANAGED;
1744 }
1745 
1746 static struct dmar_domain *alloc_domain(unsigned int type)
1747 {
1748 	struct dmar_domain *domain;
1749 
1750 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1751 	if (!domain)
1752 		return NULL;
1753 
1754 	domain->nid = NUMA_NO_NODE;
1755 	if (first_level_by_default(type))
1756 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1757 	domain->has_iotlb_device = false;
1758 	INIT_LIST_HEAD(&domain->devices);
1759 	spin_lock_init(&domain->lock);
1760 	xa_init(&domain->iommu_array);
1761 
1762 	return domain;
1763 }
1764 
1765 static int domain_attach_iommu(struct dmar_domain *domain,
1766 			       struct intel_iommu *iommu)
1767 {
1768 	struct iommu_domain_info *info, *curr;
1769 	unsigned long ndomains;
1770 	int num, ret = -ENOSPC;
1771 
1772 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1773 	if (!info)
1774 		return -ENOMEM;
1775 
1776 	spin_lock(&iommu->lock);
1777 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1778 	if (curr) {
1779 		curr->refcnt++;
1780 		spin_unlock(&iommu->lock);
1781 		kfree(info);
1782 		return 0;
1783 	}
1784 
1785 	ndomains = cap_ndoms(iommu->cap);
1786 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1787 	if (num >= ndomains) {
1788 		pr_err("%s: No free domain ids\n", iommu->name);
1789 		goto err_unlock;
1790 	}
1791 
1792 	set_bit(num, iommu->domain_ids);
1793 	info->refcnt	= 1;
1794 	info->did	= num;
1795 	info->iommu	= iommu;
1796 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1797 			  NULL, info, GFP_ATOMIC);
1798 	if (curr) {
1799 		ret = xa_err(curr) ? : -EBUSY;
1800 		goto err_clear;
1801 	}
1802 	domain_update_iommu_cap(domain);
1803 
1804 	spin_unlock(&iommu->lock);
1805 	return 0;
1806 
1807 err_clear:
1808 	clear_bit(info->did, iommu->domain_ids);
1809 err_unlock:
1810 	spin_unlock(&iommu->lock);
1811 	kfree(info);
1812 	return ret;
1813 }
1814 
1815 static void domain_detach_iommu(struct dmar_domain *domain,
1816 				struct intel_iommu *iommu)
1817 {
1818 	struct iommu_domain_info *info;
1819 
1820 	spin_lock(&iommu->lock);
1821 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1822 	if (--info->refcnt == 0) {
1823 		clear_bit(info->did, iommu->domain_ids);
1824 		xa_erase(&domain->iommu_array, iommu->seq_id);
1825 		domain->nid = NUMA_NO_NODE;
1826 		domain_update_iommu_cap(domain);
1827 		kfree(info);
1828 	}
1829 	spin_unlock(&iommu->lock);
1830 }
1831 
1832 static inline int guestwidth_to_adjustwidth(int gaw)
1833 {
1834 	int agaw;
1835 	int r = (gaw - 12) % 9;
1836 
1837 	if (r == 0)
1838 		agaw = gaw;
1839 	else
1840 		agaw = gaw + 9 - r;
1841 	if (agaw > 64)
1842 		agaw = 64;
1843 	return agaw;
1844 }
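/*
 * Worked example: the adjusted width is the guest width rounded up to
 * the next value of the form 12 + 9 * n, capped at 64. For gaw == 48,
 * r == (48 - 12) % 9 == 0, so agaw == 48; for gaw == 44, r == 5 and
 * agaw == 44 + 9 - 5 == 48.
 */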
1845 
1846 static void domain_exit(struct dmar_domain *domain)
1847 {
1848 	if (domain->pgd) {
1849 		LIST_HEAD(freelist);
1850 
1851 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1852 		put_pages_list(&freelist);
1853 	}
1854 
1855 	if (WARN_ON(!list_empty(&domain->devices)))
1856 		return;
1857 
1858 	kfree(domain);
1859 }
1860 
1861 /*
1862  * Get the PASID directory size for scalable mode context entry.
1863  * A value of X in the PDTS field of a scalable mode context entry
1864  * indicates a PASID directory with 2^(X + 7) entries.
1865  */
1866 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1867 {
1868 	unsigned long pds, max_pde;
1869 
1870 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1871 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1872 	if (pds < 7)
1873 		return 0;
1874 
1875 	return pds - 7;
1876 }
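
/*
 * Worked example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASID table
 * entries per directory entry): for table->max_pasid = 1 << 20,
 * max_pde = 1 << 14, so pds = 14 and the function returns 7. A PDTS
 * value of 7 encodes 2^(7 + 7) = 16384 directory entries, covering
 * the full 2^20 PASID space.
 */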
1877 
1878 /*
1879  * Set the RID_PASID field of a scalable mode context entry. The
1880  * IOMMU hardware will use the PASID value set in this field to
1881  * translate DMA requests that carry no PASID.
1882  */
1883 static inline void
1884 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1885 {
1886 	context->hi |= pasid & ((1 << 20) - 1);
1887 }
1888 
1889 /*
1890  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1891  * entry.
1892  */
1893 static inline void context_set_sm_dte(struct context_entry *context)
1894 {
1895 	context->lo |= (1 << 2);
1896 }
1897 
1898 /*
1899  * Set the PRE(Page Request Enable) field of a scalable mode context
1900  * entry.
1901  */
1902 static inline void context_set_sm_pre(struct context_entry *context)
1903 {
1904 	context->lo |= (1 << 4);
1905 }
1906 
1907 /* Convert value to context PASID directory size field coding. */
1908 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1909 
1910 static int domain_context_mapping_one(struct dmar_domain *domain,
1911 				      struct intel_iommu *iommu,
1912 				      struct pasid_table *table,
1913 				      u8 bus, u8 devfn)
1914 {
1915 	struct device_domain_info *info =
1916 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1917 	u16 did = domain_id_iommu(domain, iommu);
1918 	int translation = CONTEXT_TT_MULTI_LEVEL;
1919 	struct context_entry *context;
1920 	int ret;
1921 
1922 	WARN_ON(did == 0);
1923 
1924 	if (hw_pass_through && domain_type_is_si(domain))
1925 		translation = CONTEXT_TT_PASS_THROUGH;
1926 
1927 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1928 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1929 
1930 	BUG_ON(!domain->pgd);
1931 
1932 	spin_lock(&iommu->lock);
1933 	ret = -ENOMEM;
1934 	context = iommu_context_addr(iommu, bus, devfn, 1);
1935 	if (!context)
1936 		goto out_unlock;
1937 
1938 	ret = 0;
1939 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1940 		goto out_unlock;
1941 
1942 	/*
1943 	 * In kdump cases, old valid entries may still be cached because of
1944 	 * in-flight DMA and the copied page tables, and there is no
1945 	 * unmapping path for them, so the newly-mapped device needs an
1946 	 * explicit cache flush. At this point the device is expected to
1947 	 * have finished its reset during driver probe, so no in-flight
1948 	 * DMA remains and we don't need to worry about it anymore
1949 	 * hereafter.
1950 	 */
1951 	if (context_copied(iommu, bus, devfn)) {
1952 		u16 did_old = context_domain_id(context);
1953 
1954 		if (did_old < cap_ndoms(iommu->cap)) {
1955 			iommu->flush.flush_context(iommu, did_old,
1956 						   (((u16)bus) << 8) | devfn,
1957 						   DMA_CCMD_MASK_NOBIT,
1958 						   DMA_CCMD_DEVICE_INVL);
1959 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1960 						 DMA_TLB_DSI_FLUSH);
1961 		}
1962 
1963 		clear_context_copied(iommu, bus, devfn);
1964 	}
1965 
1966 	context_clear_entry(context);
1967 
1968 	if (sm_supported(iommu)) {
1969 		unsigned long pds;
1970 
1971 		WARN_ON(!table);
1972 
1973 		/* Setup the PASID DIR pointer: */
1974 		pds = context_get_sm_pds(table);
1975 		context->lo = (u64)virt_to_phys(table->table) |
1976 				context_pdts(pds);
1977 
1978 		/* Setup the RID_PASID field: */
1979 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1980 
1981 		/*
1982 		 * Setup the Device-TLB enable bit and Page request
1983 		 * Enable bit:
1984 		 */
1985 		if (info && info->ats_supported)
1986 			context_set_sm_dte(context);
1987 		if (info && info->pri_supported)
1988 			context_set_sm_pre(context);
1989 		if (info && info->pasid_supported)
1990 			context_set_pasid(context);
1991 	} else {
1992 		struct dma_pte *pgd = domain->pgd;
1993 		int agaw;
1994 
1995 		context_set_domain_id(context, did);
1996 
1997 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1998 			/*
1999 			 * Skip top levels of the page table for an IOMMU whose
2000 			 * agaw is smaller than the domain's. Unnecessary for PT mode.
2001 			 */
2002 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2003 				ret = -ENOMEM;
2004 				pgd = phys_to_virt(dma_pte_addr(pgd));
2005 				if (!dma_pte_present(pgd))
2006 					goto out_unlock;
2007 			}
2008 
2009 			if (info && info->ats_supported)
2010 				translation = CONTEXT_TT_DEV_IOTLB;
2011 			else
2012 				translation = CONTEXT_TT_MULTI_LEVEL;
2013 
2014 			context_set_address_root(context, virt_to_phys(pgd));
2015 			context_set_address_width(context, agaw);
2016 		} else {
2017 			/*
2018 			 * In pass through mode, AW must be programmed to
2019 			 * indicate the largest AGAW value supported by
2020 			 * hardware. And ASR is ignored by hardware.
2021 			 */
2022 			context_set_address_width(context, iommu->msagaw);
2023 		}
2024 
2025 		context_set_translation_type(context, translation);
2026 	}
2027 
2028 	context_set_fault_enable(context);
2029 	context_set_present(context);
2030 	if (!ecap_coherent(iommu->ecap))
2031 		clflush_cache_range(context, sizeof(*context));
2032 
2033 	/*
2034 	 * It's a non-present to present mapping. If the hardware doesn't
2035 	 * cache non-present entries, we only need to flush the write-buffer.
2036 	 * If it _does_ cache non-present entries, then it does so in the
2037 	 * special domain #0, which we have to flush:
2038 	 */
2039 	if (cap_caching_mode(iommu->cap)) {
2040 		iommu->flush.flush_context(iommu, 0,
2041 					   (((u16)bus) << 8) | devfn,
2042 					   DMA_CCMD_MASK_NOBIT,
2043 					   DMA_CCMD_DEVICE_INVL);
2044 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2045 	} else {
2046 		iommu_flush_write_buffer(iommu);
2047 	}
2048 	iommu_enable_pci_caps(info);
2049 
2050 	ret = 0;
2051 
2052 out_unlock:
2053 	spin_unlock(&iommu->lock);
2054 
2055 	return ret;
2056 }
2057 
2058 struct domain_context_mapping_data {
2059 	struct dmar_domain *domain;
2060 	struct intel_iommu *iommu;
2061 	struct pasid_table *table;
2062 };
2063 
2064 static int domain_context_mapping_cb(struct pci_dev *pdev,
2065 				     u16 alias, void *opaque)
2066 {
2067 	struct domain_context_mapping_data *data = opaque;
2068 
2069 	return domain_context_mapping_one(data->domain, data->iommu,
2070 					  data->table, PCI_BUS_NUM(alias),
2071 					  alias & 0xff);
2072 }
2073 
2074 static int
2075 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2076 {
2077 	struct domain_context_mapping_data data;
2078 	struct pasid_table *table;
2079 	struct intel_iommu *iommu;
2080 	u8 bus, devfn;
2081 
2082 	iommu = device_to_iommu(dev, &bus, &devfn);
2083 	if (!iommu)
2084 		return -ENODEV;
2085 
2086 	table = intel_pasid_get_table(dev);
2087 
2088 	if (!dev_is_pci(dev))
2089 		return domain_context_mapping_one(domain, iommu, table,
2090 						  bus, devfn);
2091 
2092 	data.domain = domain;
2093 	data.iommu = iommu;
2094 	data.table = table;
2095 
2096 	return pci_for_each_dma_alias(to_pci_dev(dev),
2097 				      &domain_context_mapping_cb, &data);
2098 }
2099 
2100 static int domain_context_mapped_cb(struct pci_dev *pdev,
2101 				    u16 alias, void *opaque)
2102 {
2103 	struct intel_iommu *iommu = opaque;
2104 
2105 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2106 }
2107 
2108 static int domain_context_mapped(struct device *dev)
2109 {
2110 	struct intel_iommu *iommu;
2111 	u8 bus, devfn;
2112 
2113 	iommu = device_to_iommu(dev, &bus, &devfn);
2114 	if (!iommu)
2115 		return -ENODEV;
2116 
2117 	if (!dev_is_pci(dev))
2118 		return device_context_mapped(iommu, bus, devfn);
2119 
2120 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2121 				       domain_context_mapped_cb, iommu);
2122 }
2123 
2124 /* Return the number of VT-d pages covering the buffer, rounded up to the MM page size */
2125 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2126 					    size_t size)
2127 {
2128 	host_addr &= ~PAGE_MASK;
2129 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2130 }
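
/*
 * Illustrative example (assuming 4KiB MM pages, so MM and VT-d page
 * sizes match): host_addr = 0x1234 and size = 0x1fff leave an in-page
 * offset of 0x234, PAGE_ALIGN(0x234 + 0x1fff) = 0x3000, i.e. 3 pages.
 */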
2131 
2132 /* Return largest possible superpage level for a given mapping */
2133 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2134 					  unsigned long iov_pfn,
2135 					  unsigned long phy_pfn,
2136 					  unsigned long pages)
2137 {
2138 	int support, level = 1;
2139 	unsigned long pfnmerge;
2140 
2141 	support = domain->iommu_superpage;
2142 
2143 	/* To use a large page, the virtual *and* physical addresses
2144 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2145 	   of them will mean we have to use smaller pages. So just
2146 	   merge them and check both at once. */
2147 	pfnmerge = iov_pfn | phy_pfn;
2148 
2149 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2150 		pages >>= VTD_STRIDE_SHIFT;
2151 		if (!pages)
2152 			break;
2153 		pfnmerge >>= VTD_STRIDE_SHIFT;
2154 		level++;
2155 		support--;
2156 	}
2157 	return level;
2158 }
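
/*
 * Illustrative example: with domain->iommu_superpage = 2 (2MiB and
 * 1GiB supported), iov_pfn = 0x200, phy_pfn = 0x400 and pages = 1024,
 * both PFNs have their low 9 bits clear, so the first pass succeeds
 * (pages becomes 2, level becomes 2). The merged PFN is then 0x3,
 * which is not 1GiB aligned, so the result is level 2 (2MiB pages).
 */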
2159 
2160 /*
2161  * Ensure that old small page tables are removed to make room for superpage(s).
2162  * We're going to add new large pages, so make sure we don't remove their parent
2163  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2164  */
2165 static void switch_to_super_page(struct dmar_domain *domain,
2166 				 unsigned long start_pfn,
2167 				 unsigned long end_pfn, int level)
2168 {
2169 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2170 	struct iommu_domain_info *info;
2171 	struct dma_pte *pte = NULL;
2172 	unsigned long i;
2173 
2174 	while (start_pfn <= end_pfn) {
2175 		if (!pte)
2176 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2177 
2178 		if (dma_pte_present(pte)) {
2179 			dma_pte_free_pagetable(domain, start_pfn,
2180 					       start_pfn + lvl_pages - 1,
2181 					       level + 1);
2182 
2183 			xa_for_each(&domain->iommu_array, i, info)
2184 				iommu_flush_iotlb_psi(info->iommu, domain,
2185 						      start_pfn, lvl_pages,
2186 						      0, 0);
2187 		}
2188 
2189 		pte++;
2190 		start_pfn += lvl_pages;
2191 		if (first_pte_in_page(pte))
2192 			pte = NULL;
2193 	}
2194 }
2195 
2196 static int
2197 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2198 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2199 {
2200 	struct dma_pte *first_pte = NULL, *pte = NULL;
2201 	unsigned int largepage_lvl = 0;
2202 	unsigned long lvl_pages = 0;
2203 	phys_addr_t pteval;
2204 	u64 attr;
2205 
2206 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2207 
2208 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2209 		return -EINVAL;
2210 
2211 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2212 	attr |= DMA_FL_PTE_PRESENT;
2213 	if (domain_use_first_level(domain)) {
2214 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2215 		if (prot & DMA_PTE_WRITE)
2216 			attr |= DMA_FL_PTE_DIRTY;
2217 	}
2218 
2219 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2220 
2221 	while (nr_pages > 0) {
2222 		uint64_t tmp;
2223 
2224 		if (!pte) {
2225 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2226 					phys_pfn, nr_pages);
2227 
2228 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2229 			if (!pte)
2230 				return -ENOMEM;
2231 			first_pte = pte;
2232 
2233 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2234 
2235 			/* It is a large page */
2236 			if (largepage_lvl > 1) {
2237 				unsigned long end_pfn;
2238 				unsigned long pages_to_remove;
2239 
2240 				pteval |= DMA_PTE_LARGE_PAGE;
2241 				pages_to_remove = min_t(unsigned long, nr_pages,
2242 							nr_pte_to_next_page(pte) * lvl_pages);
2243 				end_pfn = iov_pfn + pages_to_remove - 1;
2244 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2245 			} else {
2246 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2247 			}
2248 
2249 		}
2250 		/*
2251 		 * We don't need a lock here; nobody else touches this IOVA range.
2252 		 */
2253 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2254 		if (tmp) {
2255 			static int dumps = 5;
2256 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2257 				iov_pfn, tmp, (unsigned long long)pteval);
2258 			if (dumps) {
2259 				dumps--;
2260 				debug_dma_dump_mappings(NULL);
2261 			}
2262 			WARN_ON(1);
2263 		}
2264 
2265 		nr_pages -= lvl_pages;
2266 		iov_pfn += lvl_pages;
2267 		phys_pfn += lvl_pages;
2268 		pteval += lvl_pages * VTD_PAGE_SIZE;
2269 
2270 		/* If the next PTE would be the first in a new page, then we
2271 		 * need to flush the cache on the entries we've just written.
2272 		 * And then we'll need to recalculate 'pte', so clear it and
2273 		 * let it get set again in the if (!pte) block above.
2274 		 *
2275 		 * If we're done (!nr_pages) we need to flush the cache too.
2276 		 *
2277 		 * Also if we've been setting superpages, we may need to
2278 		 * recalculate 'pte' and switch back to smaller pages for the
2279 		 * end of the mapping, if the trailing size is not enough to
2280 		 * use another superpage (i.e. nr_pages < lvl_pages).
2281 		 */
2282 		pte++;
2283 		if (!nr_pages || first_pte_in_page(pte) ||
2284 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2285 			domain_flush_cache(domain, first_pte,
2286 					   (void *)pte - (void *)first_pte);
2287 			pte = NULL;
2288 		}
2289 	}
2290 
2291 	return 0;
2292 }
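
/*
 * Illustrative walk through the loop above: mapping 1050 pages at a
 * 2MiB-aligned (but not 1GiB-aligned) IOVA/physical address first
 * selects largepage_lvl = 2 and writes two 2MiB PTEs (1024 pages);
 * then, because the remaining 26 pages are fewer than lvl_pages, the
 * cache is flushed, pte is recalculated and the tail is mapped with
 * 26 ordinary 4KiB PTEs.
 */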
2293 
2294 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2295 {
2296 	struct intel_iommu *iommu = info->iommu;
2297 	struct context_entry *context;
2298 	u16 did_old;
2299 
2300 	if (!iommu)
2301 		return;
2302 
2303 	spin_lock(&iommu->lock);
2304 	context = iommu_context_addr(iommu, bus, devfn, 0);
2305 	if (!context) {
2306 		spin_unlock(&iommu->lock);
2307 		return;
2308 	}
2309 
2310 	if (sm_supported(iommu)) {
2311 		if (hw_pass_through && domain_type_is_si(info->domain))
2312 			did_old = FLPT_DEFAULT_DID;
2313 		else
2314 			did_old = domain_id_iommu(info->domain, iommu);
2315 	} else {
2316 		did_old = context_domain_id(context);
2317 	}
2318 
2319 	context_clear_entry(context);
2320 	__iommu_flush_cache(iommu, context, sizeof(*context));
2321 	spin_unlock(&iommu->lock);
2322 	iommu->flush.flush_context(iommu,
2323 				   did_old,
2324 				   (((u16)bus) << 8) | devfn,
2325 				   DMA_CCMD_MASK_NOBIT,
2326 				   DMA_CCMD_DEVICE_INVL);
2327 
2328 	if (sm_supported(iommu))
2329 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2330 
2331 	iommu->flush.flush_iotlb(iommu,
2332 				 did_old,
2333 				 0,
2334 				 0,
2335 				 DMA_TLB_DSI_FLUSH);
2336 
2337 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2338 }
2339 
2340 static int domain_setup_first_level(struct intel_iommu *iommu,
2341 				    struct dmar_domain *domain,
2342 				    struct device *dev,
2343 				    u32 pasid)
2344 {
2345 	struct dma_pte *pgd = domain->pgd;
2346 	int agaw, level;
2347 	int flags = 0;
2348 
2349 	/*
2350 	 * Skip top levels of the page table for an IOMMU whose
2351 	 * agaw is smaller than the domain's. Unnecessary for PT mode.
2352 	 */
2353 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2354 		pgd = phys_to_virt(dma_pte_addr(pgd));
2355 		if (!dma_pte_present(pgd))
2356 			return -ENOMEM;
2357 	}
2358 
2359 	level = agaw_to_level(agaw);
2360 	if (level != 4 && level != 5)
2361 		return -EINVAL;
2362 
2363 	if (pasid != PASID_RID2PASID)
2364 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2365 	if (level == 5)
2366 		flags |= PASID_FLAG_FL5LP;
2367 
2368 	if (domain->force_snooping)
2369 		flags |= PASID_FLAG_PAGE_SNOOP;
2370 
2371 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2372 					     domain_id_iommu(domain, iommu),
2373 					     flags);
2374 }
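
/*
 * Illustrative example of the agaw adjustment above: for a domain
 * built with agaw = 3 (57-bit, 5-level table) on an IOMMU whose agaw
 * is 2 (48-bit), the loop descends one level from domain->pgd so that
 * the hardware is handed a 4-level table and level ends up as 4.
 */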
2375 
2376 static bool dev_is_real_dma_subdevice(struct device *dev)
2377 {
2378 	return dev && dev_is_pci(dev) &&
2379 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2380 }
2381 
2382 static int iommu_domain_identity_map(struct dmar_domain *domain,
2383 				     unsigned long first_vpfn,
2384 				     unsigned long last_vpfn)
2385 {
2386 	/*
2387 	 * The RMRR range might overlap the physical memory range,
2388 	 * so clear it first.
2389 	 */
2390 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2391 
2392 	return __domain_mapping(domain, first_vpfn,
2393 				first_vpfn, last_vpfn - first_vpfn + 1,
2394 				DMA_PTE_READ|DMA_PTE_WRITE);
2395 }
2396 
2397 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2398 
2399 static int __init si_domain_init(int hw)
2400 {
2401 	struct dmar_rmrr_unit *rmrr;
2402 	struct device *dev;
2403 	int i, nid, ret;
2404 
2405 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2406 	if (!si_domain)
2407 		return -EFAULT;
2408 
2409 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2410 		domain_exit(si_domain);
2411 		si_domain = NULL;
2412 		return -EFAULT;
2413 	}
2414 
2415 	if (hw)
2416 		return 0;
2417 
2418 	for_each_online_node(nid) {
2419 		unsigned long start_pfn, end_pfn;
2420 		int i;
2421 
2422 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2423 			ret = iommu_domain_identity_map(si_domain,
2424 					mm_to_dma_pfn(start_pfn),
2425 					mm_to_dma_pfn(end_pfn));
2426 			if (ret)
2427 				return ret;
2428 		}
2429 	}
2430 
2431 	/*
2432 	 * Identity map the RMRRs so that devices with RMRRs can also use
2433 	 * the si_domain.
2434 	 */
2435 	for_each_rmrr_units(rmrr) {
2436 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2437 					  i, dev) {
2438 			unsigned long long start = rmrr->base_address;
2439 			unsigned long long end = rmrr->end_address;
2440 
2441 			if (WARN_ON(end < start ||
2442 				    end >> agaw_to_width(si_domain->agaw)))
2443 				continue;
2444 
2445 			ret = iommu_domain_identity_map(si_domain,
2446 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2447 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2448 			if (ret)
2449 				return ret;
2450 		}
2451 	}
2452 
2453 	return 0;
2454 }
2455 
2456 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2457 {
2458 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2459 	struct intel_iommu *iommu;
2460 	unsigned long flags;
2461 	u8 bus, devfn;
2462 	int ret;
2463 
2464 	iommu = device_to_iommu(dev, &bus, &devfn);
2465 	if (!iommu)
2466 		return -ENODEV;
2467 
2468 	ret = domain_attach_iommu(domain, iommu);
2469 	if (ret)
2470 		return ret;
2471 	info->domain = domain;
2472 	spin_lock_irqsave(&domain->lock, flags);
2473 	list_add(&info->link, &domain->devices);
2474 	spin_unlock_irqrestore(&domain->lock, flags);
2475 
2476 	/* PASID table is mandatory for a PCI device in scalable mode. */
2477 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2478 		ret = intel_pasid_alloc_table(dev);
2479 		if (ret) {
2480 			dev_err(dev, "PASID table allocation failed\n");
2481 			dmar_remove_one_dev_info(dev);
2482 			return ret;
2483 		}
2484 
2485 		/* Setup the PASID entry for requests without PASID: */
2486 		if (hw_pass_through && domain_type_is_si(domain))
2487 			ret = intel_pasid_setup_pass_through(iommu, domain,
2488 					dev, PASID_RID2PASID);
2489 		else if (domain_use_first_level(domain))
2490 			ret = domain_setup_first_level(iommu, domain, dev,
2491 					PASID_RID2PASID);
2492 		else
2493 			ret = intel_pasid_setup_second_level(iommu, domain,
2494 					dev, PASID_RID2PASID);
2495 		if (ret) {
2496 			dev_err(dev, "Setup RID2PASID failed\n");
2497 			dmar_remove_one_dev_info(dev);
2498 			return ret;
2499 		}
2500 	}
2501 
2502 	ret = domain_context_mapping(domain, dev);
2503 	if (ret) {
2504 		dev_err(dev, "Domain context map failed\n");
2505 		dmar_remove_one_dev_info(dev);
2506 		return ret;
2507 	}
2508 
2509 	return 0;
2510 }
2511 
2512 static bool device_has_rmrr(struct device *dev)
2513 {
2514 	struct dmar_rmrr_unit *rmrr;
2515 	struct device *tmp;
2516 	int i;
2517 
2518 	rcu_read_lock();
2519 	for_each_rmrr_units(rmrr) {
2520 		/*
2521 		 * Return TRUE if this RMRR contains the device that
2522 		 * is passed in.
2523 		 */
2524 		for_each_active_dev_scope(rmrr->devices,
2525 					  rmrr->devices_cnt, i, tmp)
2526 			if (tmp == dev ||
2527 			    is_downstream_to_pci_bridge(dev, tmp)) {
2528 				rcu_read_unlock();
2529 				return true;
2530 			}
2531 	}
2532 	rcu_read_unlock();
2533 	return false;
2534 }
2535 
2536 /**
2537  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2538  * is relaxable (ie. is allowed to be not enforced under some conditions)
2539  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2540  *
2541  * We assume that PCI USB devices with RMRRs have them largely
2542  * for historical reasons and that the RMRR space is not actively used post
2543  * boot.  This exclusion may change if vendors begin to abuse it.
2544  *
2545  * The same exception is made for graphics devices, with the requirement that
2546  * any use of the RMRR regions will be torn down before assigning the device
2547  * to a guest.
2548  *
2549  * Return: true if the RMRR is relaxable, false otherwise
2550  */
2551 static bool device_rmrr_is_relaxable(struct device *dev)
2552 {
2553 	struct pci_dev *pdev;
2554 
2555 	if (!dev_is_pci(dev))
2556 		return false;
2557 
2558 	pdev = to_pci_dev(dev);
2559 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2560 		return true;
2561 	else
2562 		return false;
2563 }
2564 
2565 /*
2566  * There are a couple cases where we need to restrict the functionality of
2567  * devices associated with RMRRs.  The first is when evaluating a device for
2568  * identity mapping because problems exist when devices are moved in and out
2569  * of domains and their respective RMRR information is lost.  This means that
2570  * a device with associated RMRRs will never be in a "passthrough" domain.
2571  * The second is use of the device through the IOMMU API.  This interface
2572  * expects to have full control of the IOVA space for the device.  We cannot
2573  * satisfy both the requirement that RMRR access is maintained and have an
2574  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2575  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2576  * We therefore prevent devices associated with an RMRR from participating in
2577  * the IOMMU API, which eliminates them from device assignment.
2578  *
2579  * In both cases, devices which have relaxable RMRRs are not concerned by this
2580  * restriction. See device_rmrr_is_relaxable comment.
2581  */
2582 static bool device_is_rmrr_locked(struct device *dev)
2583 {
2584 	if (!device_has_rmrr(dev))
2585 		return false;
2586 
2587 	if (device_rmrr_is_relaxable(dev))
2588 		return false;
2589 
2590 	return true;
2591 }
2592 
2593 /*
2594  * Return the required default domain type for a specific device.
2595  *
2596  * @dev: the device in question
2598  *
2599  * Returns:
2600  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2601  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2602  *  - 0: both identity and dynamic domains work for this device
2603  */
2604 static int device_def_domain_type(struct device *dev)
2605 {
2606 	if (dev_is_pci(dev)) {
2607 		struct pci_dev *pdev = to_pci_dev(dev);
2608 
2609 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2610 			return IOMMU_DOMAIN_IDENTITY;
2611 
2612 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2613 			return IOMMU_DOMAIN_IDENTITY;
2614 	}
2615 
2616 	return 0;
2617 }
2618 
2619 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2620 {
2621 	/*
2622 	 * Start from a sane IOMMU hardware state.
2623 	 * If queued invalidation was already initialized by us
2624 	 * (for example, while enabling interrupt remapping) then
2625 	 * things are already rolling from a sane state.
2626 	 */
2627 	if (!iommu->qi) {
2628 		/*
2629 		 * Clear any previous faults.
2630 		 */
2631 		dmar_fault(-1, iommu);
2632 		/*
2633 		 * Disable queued invalidation if supported and already enabled
2634 		 * before OS handover.
2635 		 */
2636 		dmar_disable_qi(iommu);
2637 	}
2638 
2639 	if (dmar_enable_qi(iommu)) {
2640 		/*
2641 		 * Queued invalidation is not enabled; use register-based invalidation
2642 		 */
2643 		iommu->flush.flush_context = __iommu_flush_context;
2644 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2645 		pr_info("%s: Using Register based invalidation\n",
2646 			iommu->name);
2647 	} else {
2648 		iommu->flush.flush_context = qi_flush_context;
2649 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2650 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2651 	}
2652 }
2653 
2654 static int copy_context_table(struct intel_iommu *iommu,
2655 			      struct root_entry *old_re,
2656 			      struct context_entry **tbl,
2657 			      int bus, bool ext)
2658 {
2659 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2660 	struct context_entry *new_ce = NULL, ce;
2661 	struct context_entry *old_ce = NULL;
2662 	struct root_entry re;
2663 	phys_addr_t old_ce_phys;
2664 
2665 	tbl_idx = ext ? bus * 2 : bus;
2666 	memcpy(&re, old_re, sizeof(re));
2667 
2668 	for (devfn = 0; devfn < 256; devfn++) {
2669 		/* First calculate the correct index */
2670 		idx = (ext ? devfn * 2 : devfn) % 256;
2671 
2672 		if (idx == 0) {
2673 			/* First save what we may have and clean up */
2674 			if (new_ce) {
2675 				tbl[tbl_idx] = new_ce;
2676 				__iommu_flush_cache(iommu, new_ce,
2677 						    VTD_PAGE_SIZE);
2678 				pos = 1;
2679 			}
2680 
2681 			if (old_ce)
2682 				memunmap(old_ce);
2683 
2684 			ret = 0;
2685 			if (devfn < 0x80)
2686 				old_ce_phys = root_entry_lctp(&re);
2687 			else
2688 				old_ce_phys = root_entry_uctp(&re);
2689 
2690 			if (!old_ce_phys) {
2691 				if (ext && devfn == 0) {
2692 					/* No LCTP, try UCTP */
2693 					devfn = 0x7f;
2694 					continue;
2695 				} else {
2696 					goto out;
2697 				}
2698 			}
2699 
2700 			ret = -ENOMEM;
2701 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2702 					MEMREMAP_WB);
2703 			if (!old_ce)
2704 				goto out;
2705 
2706 			new_ce = alloc_pgtable_page(iommu->node);
2707 			if (!new_ce)
2708 				goto out_unmap;
2709 
2710 			ret = 0;
2711 		}
2712 
2713 		/* Now copy the context entry */
2714 		memcpy(&ce, old_ce + idx, sizeof(ce));
2715 
2716 		if (!context_present(&ce))
2717 			continue;
2718 
2719 		did = context_domain_id(&ce);
2720 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2721 			set_bit(did, iommu->domain_ids);
2722 
2723 		set_context_copied(iommu, bus, devfn);
2724 		new_ce[idx] = ce;
2725 	}
2726 
2727 	tbl[tbl_idx + pos] = new_ce;
2728 
2729 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2730 
2731 out_unmap:
2732 	memunmap(old_ce);
2733 
2734 out:
2735 	return ret;
2736 }
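
/*
 * Note on the ext (scalable mode) layout handled above, as read from
 * the indexing: a scalable mode context entry is twice the size of a
 * legacy one, so a 4KiB context table covers only 128 device
 * functions and the old root entry carries two table pointers, LCTP
 * for devfn 0x00-0x7f and UCTP for devfn 0x80-0xff. Hence tbl_idx is
 * bus * 2, idx wraps every 128 device functions, and for bus 0 the
 * copies land in tbl[0] (lower) and tbl[1] (upper), which
 * copy_translation_tables() later writes to root_entry[0].lo and .hi.
 */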
2737 
2738 static int copy_translation_tables(struct intel_iommu *iommu)
2739 {
2740 	struct context_entry **ctxt_tbls;
2741 	struct root_entry *old_rt;
2742 	phys_addr_t old_rt_phys;
2743 	int ctxt_table_entries;
2744 	u64 rtaddr_reg;
2745 	int bus, ret;
2746 	bool new_ext, ext;
2747 
2748 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2749 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2750 	new_ext    = !!sm_supported(iommu);
2751 
2752 	/*
2753 	 * The RTT bit can only be changed when translation is disabled,
2754 	 * but disabling translation would open a window for data
2755 	 * corruption. So bail out and don't copy anything if we would
2756 	 * have to change the bit.
2757 	 */
2758 	if (new_ext != ext)
2759 		return -EINVAL;
2760 
2761 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2762 	if (!iommu->copied_tables)
2763 		return -ENOMEM;
2764 
2765 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2766 	if (!old_rt_phys)
2767 		return -EINVAL;
2768 
2769 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2770 	if (!old_rt)
2771 		return -ENOMEM;
2772 
2773 	/* This is too big for the stack - allocate it from slab */
2774 	ctxt_table_entries = ext ? 512 : 256;
2775 	ret = -ENOMEM;
2776 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2777 	if (!ctxt_tbls)
2778 		goto out_unmap;
2779 
2780 	for (bus = 0; bus < 256; bus++) {
2781 		ret = copy_context_table(iommu, &old_rt[bus],
2782 					 ctxt_tbls, bus, ext);
2783 		if (ret) {
2784 			pr_err("%s: Failed to copy context table for bus %d\n",
2785 				iommu->name, bus);
2786 			continue;
2787 		}
2788 	}
2789 
2790 	spin_lock(&iommu->lock);
2791 
2792 	/* Context tables are copied, now write them to the root_entry table */
2793 	for (bus = 0; bus < 256; bus++) {
2794 		int idx = ext ? bus * 2 : bus;
2795 		u64 val;
2796 
2797 		if (ctxt_tbls[idx]) {
2798 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2799 			iommu->root_entry[bus].lo = val;
2800 		}
2801 
2802 		if (!ext || !ctxt_tbls[idx + 1])
2803 			continue;
2804 
2805 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2806 		iommu->root_entry[bus].hi = val;
2807 	}
2808 
2809 	spin_unlock(&iommu->lock);
2810 
2811 	kfree(ctxt_tbls);
2812 
2813 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2814 
2815 	ret = 0;
2816 
2817 out_unmap:
2818 	memunmap(old_rt);
2819 
2820 	return ret;
2821 }
2822 
2823 #ifdef CONFIG_INTEL_IOMMU_SVM
2824 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2825 {
2826 	struct intel_iommu *iommu = data;
2827 	ioasid_t ioasid;
2828 
2829 	if (!iommu)
2830 		return INVALID_IOASID;
2831 	/*
2832 	 * The VT-d virtual command interface always uses the full 20-bit
2833 	 * PASID range. The host can partition the guest PASID range based
2834 	 * on policy, but that is out of the guest's control.
2835 	 */
2836 	if (min < PASID_MIN || max > intel_pasid_max_id)
2837 		return INVALID_IOASID;
2838 
2839 	if (vcmd_alloc_pasid(iommu, &ioasid))
2840 		return INVALID_IOASID;
2841 
2842 	return ioasid;
2843 }
2844 
2845 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2846 {
2847 	struct intel_iommu *iommu = data;
2848 
2849 	if (!iommu)
2850 		return;
2851 	/*
2852 	 * The sanity check of the ioasid owner is done at an upper layer,
2853 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2854 	 */
2855 	if (ioasid_find(NULL, ioasid, NULL)) {
2856 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2857 		return;
2858 	}
2859 	vcmd_free_pasid(iommu, ioasid);
2860 }
2861 
2862 static void register_pasid_allocator(struct intel_iommu *iommu)
2863 {
2864 	/*
2865 	 * If we are running in the host, there is no need for a custom
2866 	 * allocator because PASIDs are allocated system-wide by the host.
2867 	 */
2868 	if (!cap_caching_mode(iommu->cap))
2869 		return;
2870 
2871 	if (!sm_supported(iommu)) {
2872 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2873 		return;
2874 	}
2875 
2876 	/*
2877 	 * Register a custom PASID allocator if we are running in a guest;
2878 	 * guest PASIDs must be obtained via the virtual command interface.
2879 	 * There can be multiple vIOMMUs in each guest but only one allocator
2880 	 * is active. All vIOMMU allocators will eventually call the same
2881 	 * host allocator.
2882 	 */
2883 	if (!vccap_pasid(iommu->vccap))
2884 		return;
2885 
2886 	pr_info("Register custom PASID allocator\n");
2887 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2888 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2889 	iommu->pasid_allocator.pdata = (void *)iommu;
2890 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2891 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2892 		/*
2893 		 * Disable scalable mode on this IOMMU if there
2894 		 * is no custom allocator. Mixing SM-capable and
2895 		 * non-SM vIOMMUs is not supported.
2896 		 */
2897 		intel_iommu_sm = 0;
2898 	}
2899 }
2900 #endif
2901 
2902 static int __init init_dmars(void)
2903 {
2904 	struct dmar_drhd_unit *drhd;
2905 	struct intel_iommu *iommu;
2906 	int ret;
2907 
2908 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2909 	if (ret)
2910 		goto free_iommu;
2911 
2912 	for_each_iommu(iommu, drhd) {
2913 		if (drhd->ignored) {
2914 			iommu_disable_translation(iommu);
2915 			continue;
2916 		}
2917 
2918 		/*
2919 		 * Find the max PASID size of all IOMMUs in the system.
2920 		 * We need to ensure the system PASID table is no bigger
2921 		 * than the smallest supported size.
2922 		 */
2923 		if (pasid_supported(iommu)) {
2924 			u32 temp = 2 << ecap_pss(iommu->ecap);
2925 
2926 			intel_pasid_max_id = min_t(u32, temp,
2927 						   intel_pasid_max_id);
2928 		}
2929 
2930 		intel_iommu_init_qi(iommu);
2931 
2932 		ret = iommu_init_domains(iommu);
2933 		if (ret)
2934 			goto free_iommu;
2935 
2936 		init_translation_status(iommu);
2937 
2938 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2939 			iommu_disable_translation(iommu);
2940 			clear_translation_pre_enabled(iommu);
2941 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2942 				iommu->name);
2943 		}
2944 
2945 		/*
2946 		 * TBD:
2947 		 * we could share the same root & context tables
2948 		 * among all IOMMUs. Need to split this out later.
2949 		 */
2950 		ret = iommu_alloc_root_entry(iommu);
2951 		if (ret)
2952 			goto free_iommu;
2953 
2954 		if (translation_pre_enabled(iommu)) {
2955 			pr_info("Translation already enabled - trying to copy translation structures\n");
2956 
2957 			ret = copy_translation_tables(iommu);
2958 			if (ret) {
2959 				/*
2960 				 * We found the IOMMU with translation
2961 				 * enabled - but failed to copy over the
2962 				 * old root-entry table. Try to proceed
2963 				 * by disabling translation now and
2964 				 * allocating a clean root-entry table.
2965 				 * This might cause DMAR faults, but
2966 				 * probably the dump will still succeed.
2967 				 */
2968 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2969 				       iommu->name);
2970 				iommu_disable_translation(iommu);
2971 				clear_translation_pre_enabled(iommu);
2972 			} else {
2973 				pr_info("Copied translation tables from previous kernel for %s\n",
2974 					iommu->name);
2975 			}
2976 		}
2977 
2978 		if (!ecap_pass_through(iommu->ecap))
2979 			hw_pass_through = 0;
2980 		intel_svm_check(iommu);
2981 	}
2982 
2983 	/*
2984 	 * Now that qi is enabled on all iommus, set the root entry and flush
2985 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2986 	 * flush_context function will loop forever and the boot hangs.
2987 	 */
2988 	for_each_active_iommu(iommu, drhd) {
2989 		iommu_flush_write_buffer(iommu);
2990 #ifdef CONFIG_INTEL_IOMMU_SVM
2991 		register_pasid_allocator(iommu);
2992 #endif
2993 		iommu_set_root_entry(iommu);
2994 	}
2995 
2996 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2997 	dmar_map_gfx = 0;
2998 #endif
2999 
3000 	if (!dmar_map_gfx)
3001 		iommu_identity_mapping |= IDENTMAP_GFX;
3002 
3003 	check_tylersburg_isoch();
3004 
3005 	ret = si_domain_init(hw_pass_through);
3006 	if (ret)
3007 		goto free_iommu;
3008 
3009 	/*
3010 	 * for each drhd
3011 	 *   enable fault log
3012 	 *   global invalidate context cache
3013 	 *   global invalidate iotlb
3014 	 *   enable translation
3015 	 */
3016 	for_each_iommu(iommu, drhd) {
3017 		if (drhd->ignored) {
3018 			/*
3019 			 * we always have to disable PMRs or DMA may fail on
3020 			 * this device
3021 			 */
3022 			if (force_on)
3023 				iommu_disable_protect_mem_regions(iommu);
3024 			continue;
3025 		}
3026 
3027 		iommu_flush_write_buffer(iommu);
3028 
3029 #ifdef CONFIG_INTEL_IOMMU_SVM
3030 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3031 			/*
3032 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held could
3033 			 * cause a lock race, so drop the lock around intel_svm_enable_prq().
3034 			 */
3035 			up_write(&dmar_global_lock);
3036 			ret = intel_svm_enable_prq(iommu);
3037 			down_write(&dmar_global_lock);
3038 			if (ret)
3039 				goto free_iommu;
3040 		}
3041 #endif
3042 		ret = dmar_set_interrupt(iommu);
3043 		if (ret)
3044 			goto free_iommu;
3045 	}
3046 
3047 	return 0;
3048 
3049 free_iommu:
3050 	for_each_active_iommu(iommu, drhd) {
3051 		disable_dmar_iommu(iommu);
3052 		free_dmar_iommu(iommu);
3053 	}
3054 	if (si_domain) {
3055 		domain_exit(si_domain);
3056 		si_domain = NULL;
3057 	}
3058 
3059 	return ret;
3060 }
3061 
3062 static void __init init_no_remapping_devices(void)
3063 {
3064 	struct dmar_drhd_unit *drhd;
3065 	struct device *dev;
3066 	int i;
3067 
3068 	for_each_drhd_unit(drhd) {
3069 		if (!drhd->include_all) {
3070 			for_each_active_dev_scope(drhd->devices,
3071 						  drhd->devices_cnt, i, dev)
3072 				break;
3073 			/* ignore DMAR unit if no devices exist */
3074 			if (i == drhd->devices_cnt)
3075 				drhd->ignored = 1;
3076 		}
3077 	}
3078 
3079 	for_each_active_drhd_unit(drhd) {
3080 		if (drhd->include_all)
3081 			continue;
3082 
3083 		for_each_active_dev_scope(drhd->devices,
3084 					  drhd->devices_cnt, i, dev)
3085 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3086 				break;
3087 		if (i < drhd->devices_cnt)
3088 			continue;
3089 
3090 		/* This IOMMU has *only* gfx devices. Either bypass it or
3091 		   mark it as gfx-dedicated, as appropriate */
3092 		drhd->gfx_dedicated = 1;
3093 		if (!dmar_map_gfx)
3094 			drhd->ignored = 1;
3095 	}
3096 }
3097 
3098 #ifdef CONFIG_SUSPEND
3099 static int init_iommu_hw(void)
3100 {
3101 	struct dmar_drhd_unit *drhd;
3102 	struct intel_iommu *iommu = NULL;
3103 
3104 	for_each_active_iommu(iommu, drhd)
3105 		if (iommu->qi)
3106 			dmar_reenable_qi(iommu);
3107 
3108 	for_each_iommu(iommu, drhd) {
3109 		if (drhd->ignored) {
3110 			/*
3111 			 * we always have to disable PMRs or DMA may fail on
3112 			 * this device
3113 			 */
3114 			if (force_on)
3115 				iommu_disable_protect_mem_regions(iommu);
3116 			continue;
3117 		}
3118 
3119 		iommu_flush_write_buffer(iommu);
3120 		iommu_set_root_entry(iommu);
3121 		iommu_enable_translation(iommu);
3122 		iommu_disable_protect_mem_regions(iommu);
3123 	}
3124 
3125 	return 0;
3126 }
3127 
3128 static void iommu_flush_all(void)
3129 {
3130 	struct dmar_drhd_unit *drhd;
3131 	struct intel_iommu *iommu;
3132 
3133 	for_each_active_iommu(iommu, drhd) {
3134 		iommu->flush.flush_context(iommu, 0, 0, 0,
3135 					   DMA_CCMD_GLOBAL_INVL);
3136 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3137 					 DMA_TLB_GLOBAL_FLUSH);
3138 	}
3139 }
3140 
3141 static int iommu_suspend(void)
3142 {
3143 	struct dmar_drhd_unit *drhd;
3144 	struct intel_iommu *iommu = NULL;
3145 	unsigned long flag;
3146 
3147 	for_each_active_iommu(iommu, drhd) {
3148 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3149 					     GFP_KERNEL);
3150 		if (!iommu->iommu_state)
3151 			goto nomem;
3152 	}
3153 
3154 	iommu_flush_all();
3155 
3156 	for_each_active_iommu(iommu, drhd) {
3157 		iommu_disable_translation(iommu);
3158 
3159 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3160 
3161 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3162 			readl(iommu->reg + DMAR_FECTL_REG);
3163 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3164 			readl(iommu->reg + DMAR_FEDATA_REG);
3165 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3166 			readl(iommu->reg + DMAR_FEADDR_REG);
3167 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3168 			readl(iommu->reg + DMAR_FEUADDR_REG);
3169 
3170 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3171 	}
3172 	return 0;
3173 
3174 nomem:
3175 	for_each_active_iommu(iommu, drhd)
3176 		kfree(iommu->iommu_state);
3177 
3178 	return -ENOMEM;
3179 }
3180 
3181 static void iommu_resume(void)
3182 {
3183 	struct dmar_drhd_unit *drhd;
3184 	struct intel_iommu *iommu = NULL;
3185 	unsigned long flag;
3186 
3187 	if (init_iommu_hw()) {
3188 		if (force_on)
3189 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3190 		else
3191 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3192 		return;
3193 	}
3194 
3195 	for_each_active_iommu(iommu, drhd) {
3196 
3197 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3198 
3199 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3200 			iommu->reg + DMAR_FECTL_REG);
3201 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3202 			iommu->reg + DMAR_FEDATA_REG);
3203 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3204 			iommu->reg + DMAR_FEADDR_REG);
3205 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3206 			iommu->reg + DMAR_FEUADDR_REG);
3207 
3208 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3209 	}
3210 
3211 	for_each_active_iommu(iommu, drhd)
3212 		kfree(iommu->iommu_state);
3213 }
3214 
3215 static struct syscore_ops iommu_syscore_ops = {
3216 	.resume		= iommu_resume,
3217 	.suspend	= iommu_suspend,
3218 };
3219 
3220 static void __init init_iommu_pm_ops(void)
3221 {
3222 	register_syscore_ops(&iommu_syscore_ops);
3223 }
3224 
3225 #else
3226 static inline void init_iommu_pm_ops(void) {}
3227 #endif	/* CONFIG_SUSPEND */
3228 
3229 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3230 {
3231 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3232 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3233 	    rmrr->end_address <= rmrr->base_address ||
3234 	    arch_rmrr_sanity_check(rmrr))
3235 		return -EINVAL;
3236 
3237 	return 0;
3238 }
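
/*
 * Illustrative example (assuming 4KiB pages): an RMRR reporting
 * base_address = 0x9d800000 and end_address = 0x9dffffff passes the
 * checks above, whereas one whose end_address + 1 is not page aligned
 * (e.g. end_address = 0x9d8007ff) is rejected as a firmware bug.
 */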
3239 
3240 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3241 {
3242 	struct acpi_dmar_reserved_memory *rmrr;
3243 	struct dmar_rmrr_unit *rmrru;
3244 
3245 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3246 	if (rmrr_sanity_check(rmrr)) {
3247 		pr_warn(FW_BUG
3248 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3249 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3250 			   rmrr->base_address, rmrr->end_address,
3251 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3252 			   dmi_get_system_info(DMI_BIOS_VERSION),
3253 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3254 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3255 	}
3256 
3257 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3258 	if (!rmrru)
3259 		goto out;
3260 
3261 	rmrru->hdr = header;
3262 
3263 	rmrru->base_address = rmrr->base_address;
3264 	rmrru->end_address = rmrr->end_address;
3265 
3266 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3267 				((void *)rmrr) + rmrr->header.length,
3268 				&rmrru->devices_cnt);
3269 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3270 		goto free_rmrru;
3271 
3272 	list_add(&rmrru->list, &dmar_rmrr_units);
3273 
3274 	return 0;
3275 free_rmrru:
3276 	kfree(rmrru);
3277 out:
3278 	return -ENOMEM;
3279 }
3280 
3281 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3282 {
3283 	struct dmar_atsr_unit *atsru;
3284 	struct acpi_dmar_atsr *tmp;
3285 
3286 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3287 				dmar_rcu_check()) {
3288 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3289 		if (atsr->segment != tmp->segment)
3290 			continue;
3291 		if (atsr->header.length != tmp->header.length)
3292 			continue;
3293 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3294 			return atsru;
3295 	}
3296 
3297 	return NULL;
3298 }
3299 
3300 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3301 {
3302 	struct acpi_dmar_atsr *atsr;
3303 	struct dmar_atsr_unit *atsru;
3304 
3305 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3306 		return 0;
3307 
3308 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3309 	atsru = dmar_find_atsr(atsr);
3310 	if (atsru)
3311 		return 0;
3312 
3313 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3314 	if (!atsru)
3315 		return -ENOMEM;
3316 
3317 	/*
3318 	 * If the memory was allocated from the slab by an ACPI _DSM method,
3319 	 * we need to copy its content because the buffer will be
3320 	 * freed on return.
3321 	 */
3322 	atsru->hdr = (void *)(atsru + 1);
3323 	memcpy(atsru->hdr, hdr, hdr->length);
3324 	atsru->include_all = atsr->flags & 0x1;
3325 	if (!atsru->include_all) {
3326 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3327 				(void *)atsr + atsr->header.length,
3328 				&atsru->devices_cnt);
3329 		if (atsru->devices_cnt && atsru->devices == NULL) {
3330 			kfree(atsru);
3331 			return -ENOMEM;
3332 		}
3333 	}
3334 
3335 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3336 
3337 	return 0;
3338 }
3339 
3340 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3341 {
3342 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3343 	kfree(atsru);
3344 }
3345 
3346 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3347 {
3348 	struct acpi_dmar_atsr *atsr;
3349 	struct dmar_atsr_unit *atsru;
3350 
3351 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3352 	atsru = dmar_find_atsr(atsr);
3353 	if (atsru) {
3354 		list_del_rcu(&atsru->list);
3355 		synchronize_rcu();
3356 		intel_iommu_free_atsr(atsru);
3357 	}
3358 
3359 	return 0;
3360 }
3361 
3362 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3363 {
3364 	int i;
3365 	struct device *dev;
3366 	struct acpi_dmar_atsr *atsr;
3367 	struct dmar_atsr_unit *atsru;
3368 
3369 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3370 	atsru = dmar_find_atsr(atsr);
3371 	if (!atsru)
3372 		return 0;
3373 
3374 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3375 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3376 					  i, dev)
3377 			return -EBUSY;
3378 	}
3379 
3380 	return 0;
3381 }
3382 
3383 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3384 {
3385 	struct dmar_satc_unit *satcu;
3386 	struct acpi_dmar_satc *tmp;
3387 
3388 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3389 				dmar_rcu_check()) {
3390 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3391 		if (satc->segment != tmp->segment)
3392 			continue;
3393 		if (satc->header.length != tmp->header.length)
3394 			continue;
3395 		if (memcmp(satc, tmp, satc->header.length) == 0)
3396 			return satcu;
3397 	}
3398 
3399 	return NULL;
3400 }
3401 
3402 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3403 {
3404 	struct acpi_dmar_satc *satc;
3405 	struct dmar_satc_unit *satcu;
3406 
3407 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3408 		return 0;
3409 
3410 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3411 	satcu = dmar_find_satc(satc);
3412 	if (satcu)
3413 		return 0;
3414 
3415 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3416 	if (!satcu)
3417 		return -ENOMEM;
3418 
3419 	satcu->hdr = (void *)(satcu + 1);
3420 	memcpy(satcu->hdr, hdr, hdr->length);
3421 	satcu->atc_required = satc->flags & 0x1;
3422 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3423 					      (void *)satc + satc->header.length,
3424 					      &satcu->devices_cnt);
3425 	if (satcu->devices_cnt && !satcu->devices) {
3426 		kfree(satcu);
3427 		return -ENOMEM;
3428 	}
3429 	list_add_rcu(&satcu->list, &dmar_satc_units);
3430 
3431 	return 0;
3432 }
3433 
3434 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3435 {
3436 	int sp, ret;
3437 	struct intel_iommu *iommu = dmaru->iommu;
3438 
3439 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3440 	if (ret)
3441 		goto out;
3442 
3443 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3444 		pr_warn("%s: Doesn't support hardware pass through.\n",
3445 			iommu->name);
3446 		return -ENXIO;
3447 	}
3448 
3449 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3450 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3451 		pr_warn("%s: Doesn't support large page.\n",
3452 			iommu->name);
3453 		return -ENXIO;
3454 	}
3455 
3456 	/*
3457 	 * Disable translation if already enabled prior to OS handover.
3458 	 */
3459 	if (iommu->gcmd & DMA_GCMD_TE)
3460 		iommu_disable_translation(iommu);
3461 
3462 	ret = iommu_init_domains(iommu);
3463 	if (ret == 0)
3464 		ret = iommu_alloc_root_entry(iommu);
3465 	if (ret)
3466 		goto out;
3467 
3468 	intel_svm_check(iommu);
3469 
3470 	if (dmaru->ignored) {
3471 		/*
3472 		 * we always have to disable PMRs or DMA may fail on this device
3473 		 */
3474 		if (force_on)
3475 			iommu_disable_protect_mem_regions(iommu);
3476 		return 0;
3477 	}
3478 
3479 	intel_iommu_init_qi(iommu);
3480 	iommu_flush_write_buffer(iommu);
3481 
3482 #ifdef CONFIG_INTEL_IOMMU_SVM
3483 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3484 		ret = intel_svm_enable_prq(iommu);
3485 		if (ret)
3486 			goto disable_iommu;
3487 	}
3488 #endif
3489 	ret = dmar_set_interrupt(iommu);
3490 	if (ret)
3491 		goto disable_iommu;
3492 
3493 	iommu_set_root_entry(iommu);
3494 	iommu_enable_translation(iommu);
3495 
3496 	iommu_disable_protect_mem_regions(iommu);
3497 	return 0;
3498 
3499 disable_iommu:
3500 	disable_dmar_iommu(iommu);
3501 out:
3502 	free_dmar_iommu(iommu);
3503 	return ret;
3504 }
3505 
3506 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3507 {
3508 	int ret = 0;
3509 	struct intel_iommu *iommu = dmaru->iommu;
3510 
3511 	if (!intel_iommu_enabled)
3512 		return 0;
3513 	if (iommu == NULL)
3514 		return -EINVAL;
3515 
3516 	if (insert) {
3517 		ret = intel_iommu_add(dmaru);
3518 	} else {
3519 		disable_dmar_iommu(iommu);
3520 		free_dmar_iommu(iommu);
3521 	}
3522 
3523 	return ret;
3524 }
3525 
3526 static void intel_iommu_free_dmars(void)
3527 {
3528 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3529 	struct dmar_atsr_unit *atsru, *atsr_n;
3530 	struct dmar_satc_unit *satcu, *satc_n;
3531 
3532 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3533 		list_del(&rmrru->list);
3534 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3535 		kfree(rmrru);
3536 	}
3537 
3538 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3539 		list_del(&atsru->list);
3540 		intel_iommu_free_atsr(atsru);
3541 	}
3542 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3543 		list_del(&satcu->list);
3544 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3545 		kfree(satcu);
3546 	}
3547 }
3548 
3549 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3550 {
3551 	struct dmar_satc_unit *satcu;
3552 	struct acpi_dmar_satc *satc;
3553 	struct device *tmp;
3554 	int i;
3555 
3556 	dev = pci_physfn(dev);
3557 	rcu_read_lock();
3558 
3559 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3560 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3561 		if (satc->segment != pci_domain_nr(dev->bus))
3562 			continue;
3563 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3564 			if (to_pci_dev(tmp) == dev)
3565 				goto out;
3566 	}
3567 	satcu = NULL;
3568 out:
3569 	rcu_read_unlock();
3570 	return satcu;
3571 }
3572 
3573 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3574 {
3575 	int i, ret = 1;
3576 	struct pci_bus *bus;
3577 	struct pci_dev *bridge = NULL;
3578 	struct device *tmp;
3579 	struct acpi_dmar_atsr *atsr;
3580 	struct dmar_atsr_unit *atsru;
3581 	struct dmar_satc_unit *satcu;
3582 
3583 	dev = pci_physfn(dev);
3584 	satcu = dmar_find_matched_satc_unit(dev);
3585 	if (satcu)
3586 		/*
3587 		 * This device supports ATS since it is in the SATC table.
3588 		 * When the IOMMU is in legacy mode, enabling ATS is done
3589 		 * automatically by hardware for any device that requires
3590 		 * it, so the OS should not enable ATS for this device,
3591 		 * to avoid duplicated TLB invalidations.
3592 		 */
3593 		return !(satcu->atc_required && !sm_supported(iommu));
3594 
3595 	for (bus = dev->bus; bus; bus = bus->parent) {
3596 		bridge = bus->self;
3597 		/* If it's an integrated device, allow ATS */
3598 		if (!bridge)
3599 			return 1;
3600 		/* Connected via non-PCIe: no ATS */
3601 		if (!pci_is_pcie(bridge) ||
3602 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3603 			return 0;
3604 		/* If we found the root port, look it up in the ATSR */
3605 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3606 			break;
3607 	}
3608 
3609 	rcu_read_lock();
3610 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3611 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3612 		if (atsr->segment != pci_domain_nr(dev->bus))
3613 			continue;
3614 
3615 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3616 			if (tmp == &bridge->dev)
3617 				goto out;
3618 
3619 		if (atsru->include_all)
3620 			goto out;
3621 	}
3622 	ret = 0;
3623 out:
3624 	rcu_read_unlock();
3625 
3626 	return ret;
3627 }
3628 
3629 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3630 {
3631 	int ret;
3632 	struct dmar_rmrr_unit *rmrru;
3633 	struct dmar_atsr_unit *atsru;
3634 	struct dmar_satc_unit *satcu;
3635 	struct acpi_dmar_atsr *atsr;
3636 	struct acpi_dmar_reserved_memory *rmrr;
3637 	struct acpi_dmar_satc *satc;
3638 
3639 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3640 		return 0;
3641 
3642 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3643 		rmrr = container_of(rmrru->hdr,
3644 				    struct acpi_dmar_reserved_memory, header);
3645 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3646 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3647 				((void *)rmrr) + rmrr->header.length,
3648 				rmrr->segment, rmrru->devices,
3649 				rmrru->devices_cnt);
3650 			if (ret < 0)
3651 				return ret;
3652 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3653 			dmar_remove_dev_scope(info, rmrr->segment,
3654 				rmrru->devices, rmrru->devices_cnt);
3655 		}
3656 	}
3657 
3658 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3659 		if (atsru->include_all)
3660 			continue;
3661 
3662 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3663 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3664 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3665 					(void *)atsr + atsr->header.length,
3666 					atsr->segment, atsru->devices,
3667 					atsru->devices_cnt);
3668 			if (ret > 0)
3669 				break;
3670 			else if (ret < 0)
3671 				return ret;
3672 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3673 			if (dmar_remove_dev_scope(info, atsr->segment,
3674 					atsru->devices, atsru->devices_cnt))
3675 				break;
3676 		}
3677 	}
3678 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3679 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3680 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3681 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3682 					(void *)satc + satc->header.length,
3683 					satc->segment, satcu->devices,
3684 					satcu->devices_cnt);
3685 			if (ret > 0)
3686 				break;
3687 			else if (ret < 0)
3688 				return ret;
3689 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3690 			if (dmar_remove_dev_scope(info, satc->segment,
3691 					satcu->devices, satcu->devices_cnt))
3692 				break;
3693 		}
3694 	}
3695 
3696 	return 0;
3697 }
3698 
3699 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3700 				       unsigned long val, void *v)
3701 {
3702 	struct memory_notify *mhp = v;
3703 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3704 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3705 			mhp->nr_pages - 1);
3706 
3707 	switch (val) {
3708 	case MEM_GOING_ONLINE:
3709 		if (iommu_domain_identity_map(si_domain,
3710 					      start_vpfn, last_vpfn)) {
3711 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3712 				start_vpfn, last_vpfn);
3713 			return NOTIFY_BAD;
3714 		}
3715 		break;
3716 
3717 	case MEM_OFFLINE:
3718 	case MEM_CANCEL_ONLINE:
3719 		{
3720 			struct dmar_drhd_unit *drhd;
3721 			struct intel_iommu *iommu;
3722 			LIST_HEAD(freelist);
3723 
3724 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3725 
3726 			rcu_read_lock();
3727 			for_each_active_iommu(iommu, drhd)
3728 				iommu_flush_iotlb_psi(iommu, si_domain,
3729 					start_vpfn, mhp->nr_pages,
3730 					list_empty(&freelist), 0);
3731 			rcu_read_unlock();
3732 			put_pages_list(&freelist);
3733 		}
3734 		break;
3735 	}
3736 
3737 	return NOTIFY_OK;
3738 }
3739 
3740 static struct notifier_block intel_iommu_memory_nb = {
3741 	.notifier_call = intel_iommu_memory_notifier,
3742 	.priority = 0
3743 };
3744 
3745 static void intel_disable_iommus(void)
3746 {
3747 	struct intel_iommu *iommu = NULL;
3748 	struct dmar_drhd_unit *drhd;
3749 
3750 	for_each_iommu(iommu, drhd)
3751 		iommu_disable_translation(iommu);
3752 }
3753 
3754 void intel_iommu_shutdown(void)
3755 {
3756 	struct dmar_drhd_unit *drhd;
3757 	struct intel_iommu *iommu = NULL;
3758 
3759 	if (no_iommu || dmar_disabled)
3760 		return;
3761 
3762 	down_write(&dmar_global_lock);
3763 
3764 	/* Disable PMRs explicitly here. */
3765 	for_each_iommu(iommu, drhd)
3766 		iommu_disable_protect_mem_regions(iommu);
3767 
3768 	/* Make sure the IOMMUs are switched off */
3769 	intel_disable_iommus();
3770 
3771 	up_write(&dmar_global_lock);
3772 }
3773 
3774 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3775 {
3776 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3777 
3778 	return container_of(iommu_dev, struct intel_iommu, iommu);
3779 }
3780 
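/*
 * Per-IOMMU sysfs attributes, exposed through the "intel-iommu" attribute
 * group that is registered via iommu_device_sysfs_add() at init time.
 */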
3781 static ssize_t version_show(struct device *dev,
3782 			    struct device_attribute *attr, char *buf)
3783 {
3784 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3785 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3786 	return sprintf(buf, "%d:%d\n",
3787 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3788 }
3789 static DEVICE_ATTR_RO(version);
3790 
3791 static ssize_t address_show(struct device *dev,
3792 			    struct device_attribute *attr, char *buf)
3793 {
3794 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3795 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3796 }
3797 static DEVICE_ATTR_RO(address);
3798 
3799 static ssize_t cap_show(struct device *dev,
3800 			struct device_attribute *attr, char *buf)
3801 {
3802 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3803 	return sprintf(buf, "%llx\n", iommu->cap);
3804 }
3805 static DEVICE_ATTR_RO(cap);
3806 
3807 static ssize_t ecap_show(struct device *dev,
3808 			 struct device_attribute *attr, char *buf)
3809 {
3810 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3811 	return sprintf(buf, "%llx\n", iommu->ecap);
3812 }
3813 static DEVICE_ATTR_RO(ecap);
3814 
3815 static ssize_t domains_supported_show(struct device *dev,
3816 				      struct device_attribute *attr, char *buf)
3817 {
3818 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3819 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3820 }
3821 static DEVICE_ATTR_RO(domains_supported);
3822 
3823 static ssize_t domains_used_show(struct device *dev,
3824 				 struct device_attribute *attr, char *buf)
3825 {
3826 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3827 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3828 						  cap_ndoms(iommu->cap)));
3829 }
3830 static DEVICE_ATTR_RO(domains_used);
3831 
3832 static struct attribute *intel_iommu_attrs[] = {
3833 	&dev_attr_version.attr,
3834 	&dev_attr_address.attr,
3835 	&dev_attr_cap.attr,
3836 	&dev_attr_ecap.attr,
3837 	&dev_attr_domains_supported.attr,
3838 	&dev_attr_domains_used.attr,
3839 	NULL,
3840 };
3841 
3842 static struct attribute_group intel_iommu_group = {
3843 	.name = "intel-iommu",
3844 	.attrs = intel_iommu_attrs,
3845 };
3846 
3847 const struct attribute_group *intel_iommu_groups[] = {
3848 	&intel_iommu_group,
3849 	NULL,
3850 };
3851 
3852 static inline bool has_external_pci(void)
3853 {
3854 	struct pci_dev *pdev = NULL;
3855 
3856 	for_each_pci_dev(pdev)
3857 		if (pdev->external_facing)
3858 			return true;
3859 
3860 	return false;
3861 }
3862 
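/*
 * If the firmware's DMAR table sets the platform opt-in flag and an
 * external-facing PCI device is present, force the IOMMU on even when it
 * was disabled on the command line.
 */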
3863 static int __init platform_optin_force_iommu(void)
3864 {
3865 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3866 		return 0;
3867 
3868 	if (no_iommu || dmar_disabled)
3869 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3870 
3871 	/*
3872 	 * If Intel-IOMMU is disabled by default, we will apply identity
3873 	 * map for all devices except those marked as being untrusted.
3874 	 */
3875 	if (dmar_disabled)
3876 		iommu_set_default_passthrough(false);
3877 
3878 	dmar_disabled = 0;
3879 	no_iommu = 0;
3880 
3881 	return 1;
3882 }
3883 
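/*
 * Walk the ACPI namespace devices listed in each DRHD device scope and
 * probe the physical devices behind them that are not yet part of an
 * IOMMU group.
 */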
3884 static int __init probe_acpi_namespace_devices(void)
3885 {
3886 	struct dmar_drhd_unit *drhd;
3887 	/* To avoid a -Wunused-but-set-variable warning. */
3888 	struct intel_iommu *iommu __maybe_unused;
3889 	struct device *dev;
3890 	int i, ret = 0;
3891 
3892 	for_each_active_iommu(iommu, drhd) {
3893 		for_each_active_dev_scope(drhd->devices,
3894 					  drhd->devices_cnt, i, dev) {
3895 			struct acpi_device_physical_node *pn;
3896 			struct iommu_group *group;
3897 			struct acpi_device *adev;
3898 
3899 			if (dev->bus != &acpi_bus_type)
3900 				continue;
3901 
3902 			adev = to_acpi_device(dev);
3903 			mutex_lock(&adev->physical_node_lock);
3904 			list_for_each_entry(pn,
3905 					    &adev->physical_node_list, node) {
3906 				group = iommu_group_get(pn->dev);
3907 				if (group) {
3908 					iommu_group_put(group);
3909 					continue;
3910 				}
3911 
3912 				ret = iommu_probe_device(pn->dev);
3913 				if (ret)
3914 					break;
3915 			}
3916 			mutex_unlock(&adev->physical_node_lock);
3917 
3918 			if (ret)
3919 				return ret;
3920 		}
3921 	}
3922 
3923 	return 0;
3924 }
3925 
3926 static __init int tboot_force_iommu(void)
3927 {
3928 	if (!tboot_enabled())
3929 		return 0;
3930 
3931 	if (no_iommu || dmar_disabled)
3932 		pr_warn("Forcing Intel-IOMMU to be enabled\n");
3933 
3934 	dmar_disabled = 0;
3935 	no_iommu = 0;
3936 
3937 	return 1;
3938 }
3939 
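/*
 * Main initialization entry point: parse the DMAR table and device scopes,
 * set up DMA remapping via init_dmars(), register every IOMMU with sysfs
 * and the IOMMU core, and finally enable translation on each unit that is
 * neither ignored nor already pre-enabled.
 */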
3940 int __init intel_iommu_init(void)
3941 {
3942 	int ret = -ENODEV;
3943 	struct dmar_drhd_unit *drhd;
3944 	struct intel_iommu *iommu;
3945 
3946 	/*
3947 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3948 	 * opt in, so enforce that.
3949 	 */
3950 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3951 		    platform_optin_force_iommu();
3952 
3953 	down_write(&dmar_global_lock);
3954 	if (dmar_table_init()) {
3955 		if (force_on)
3956 			panic("tboot: Failed to initialize DMAR table\n");
3957 		goto out_free_dmar;
3958 	}
3959 
3960 	if (dmar_dev_scope_init() < 0) {
3961 		if (force_on)
3962 			panic("tboot: Failed to initialize DMAR device scope\n");
3963 		goto out_free_dmar;
3964 	}
3965 
3966 	up_write(&dmar_global_lock);
3967 
3968 	/*
3969 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3970 	 * complain later when we register it under the lock.
3971 	 */
3972 	dmar_register_bus_notifier();
3973 
3974 	down_write(&dmar_global_lock);
3975 
3976 	if (!no_iommu)
3977 		intel_iommu_debugfs_init();
3978 
3979 	if (no_iommu || dmar_disabled) {
3980 		/*
3981 		 * We exit the function here to ensure IOMMU's remapping and
3982 		 * mempool aren't set up, which means that the IOMMU's PMRs
3983 		 * won't be disabled via the call to init_dmars(). So disable
3984 		 * them explicitly here. The PMRs were set up by tboot prior to
3985 		 * calling SENTER, but the kernel is expected to reset/tear
3986 		 * down the PMRs.
3987 		 */
3988 		if (intel_iommu_tboot_noforce) {
3989 			for_each_iommu(iommu, drhd)
3990 				iommu_disable_protect_mem_regions(iommu);
3991 		}
3992 
3993 		/*
3994 		 * Make sure the IOMMUs are switched off, even when we
3995 		 * boot into a kexec kernel and the previous kernel left
3996 		 * them enabled
3997 		 */
3998 		intel_disable_iommus();
3999 		goto out_free_dmar;
4000 	}
4001 
4002 	if (list_empty(&dmar_rmrr_units))
4003 		pr_info("No RMRR found\n");
4004 
4005 	if (list_empty(&dmar_atsr_units))
4006 		pr_info("No ATSR found\n");
4007 
4008 	if (list_empty(&dmar_satc_units))
4009 		pr_info("No SATC found\n");
4010 
4011 	init_no_remapping_devices();
4012 
4013 	ret = init_dmars();
4014 	if (ret) {
4015 		if (force_on)
4016 			panic("tboot: Failed to initialize DMARs\n");
4017 		pr_err("Initialization failed\n");
4018 		goto out_free_dmar;
4019 	}
4020 	up_write(&dmar_global_lock);
4021 
4022 	init_iommu_pm_ops();
4023 
4024 	down_read(&dmar_global_lock);
4025 	for_each_active_iommu(iommu, drhd) {
4026 		/*
4027 		 * The flush queue implementation does not perform
4028 		 * page-selective invalidations that are required for efficient
4029 		 * TLB flushes in virtual environments.  The benefit of batching
4030 		 * is likely to be much lower than the overhead of synchronizing
4031 		 * the virtual and physical IOMMU page-tables.
4032 		 */
4033 		if (cap_caching_mode(iommu->cap)) {
4034 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4035 			iommu_set_dma_strict();
4036 		}
4037 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4038 				       intel_iommu_groups,
4039 				       "%s", iommu->name);
4040 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4041 	}
4042 	up_read(&dmar_global_lock);
4043 
4044 	if (si_domain && !hw_pass_through)
4045 		register_memory_notifier(&intel_iommu_memory_nb);
4046 
4047 	down_read(&dmar_global_lock);
4048 	if (probe_acpi_namespace_devices())
4049 		pr_warn("ACPI name space devices didn't probe correctly\n");
4050 
4051 	/* Finally, we enable the DMA remapping hardware. */
4052 	for_each_iommu(iommu, drhd) {
4053 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4054 			iommu_enable_translation(iommu);
4055 
4056 		iommu_disable_protect_mem_regions(iommu);
4057 	}
4058 	up_read(&dmar_global_lock);
4059 
4060 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4061 
4062 	intel_iommu_enabled = 1;
4063 
4064 	return 0;
4065 
4066 out_free_dmar:
4067 	intel_iommu_free_dmars();
4068 	up_write(&dmar_global_lock);
4069 	return ret;
4070 }
4071 
4072 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4073 {
4074 	struct device_domain_info *info = opaque;
4075 
4076 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4077 	return 0;
4078 }
4079 
4080 /*
4081  * NB - intel-iommu lacks any sort of reference counting for the users of
4082  * dependent devices.  If multiple endpoints have intersecting dependent
4083  * devices, unbinding the driver from any one of them will possibly leave
4084  * the others unable to operate.
4085  */
4086 static void domain_context_clear(struct device_domain_info *info)
4087 {
4088 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4089 		return;
4090 
4091 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4092 			       &domain_context_clear_one_cb, info);
4093 }
4094 
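/*
 * Tear down the translation state of one device: clear its PASID and
 * context table entries, disable the device IOTLB, unlink it from its
 * domain and drop the domain's reference on the IOMMU.
 */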
4095 static void dmar_remove_one_dev_info(struct device *dev)
4096 {
4097 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4098 	struct dmar_domain *domain = info->domain;
4099 	struct intel_iommu *iommu = info->iommu;
4100 	unsigned long flags;
4101 
4102 	if (!dev_is_real_dma_subdevice(info->dev)) {
4103 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4104 			intel_pasid_tear_down_entry(iommu, info->dev,
4105 					PASID_RID2PASID, false);
4106 
4107 		iommu_disable_dev_iotlb(info);
4108 		domain_context_clear(info);
4109 		intel_pasid_free_table(info->dev);
4110 	}
4111 
4112 	spin_lock_irqsave(&domain->lock, flags);
4113 	list_del(&info->link);
4114 	spin_unlock_irqrestore(&domain->lock, flags);
4115 
4116 	domain_detach_iommu(domain, iommu);
4117 	info->domain = NULL;
4118 }
4119 
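/*
 * Initialize a freshly allocated API domain: derive the AGAW from the
 * requested guest address width and allocate the top-level page directory.
 */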
4120 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4121 {
4122 	int adjust_width;
4123 
4124 	/* calculate AGAW */
4125 	domain->gaw = guest_width;
4126 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4127 	domain->agaw = width_to_agaw(adjust_width);
4128 
4129 	domain->iommu_coherency = false;
4130 	domain->iommu_superpage = 0;
4131 	domain->max_addr = 0;
4132 
4133 	/* always allocate the top pgd */
4134 	domain->pgd = alloc_pgtable_page(domain->nid);
4135 	if (!domain->pgd)
4136 		return -ENOMEM;
4137 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4138 	return 0;
4139 }
4140 
4141 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4142 {
4143 	struct dmar_domain *dmar_domain;
4144 	struct iommu_domain *domain;
4145 
4146 	switch (type) {
4147 	case IOMMU_DOMAIN_DMA:
4148 	case IOMMU_DOMAIN_DMA_FQ:
4149 	case IOMMU_DOMAIN_UNMANAGED:
4150 		dmar_domain = alloc_domain(type);
4151 		if (!dmar_domain) {
4152 			pr_err("Can't allocate dmar_domain\n");
4153 			return NULL;
4154 		}
4155 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4156 			pr_err("Domain initialization failed\n");
4157 			domain_exit(dmar_domain);
4158 			return NULL;
4159 		}
4160 
4161 		domain = &dmar_domain->domain;
4162 		domain->geometry.aperture_start = 0;
4163 		domain->geometry.aperture_end   =
4164 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4165 		domain->geometry.force_aperture = true;
4166 
4167 		return domain;
4168 	case IOMMU_DOMAIN_IDENTITY:
4169 		return &si_domain->domain;
4170 	default:
4171 		return NULL;
4172 	}
4173 
4174 	return NULL;
4175 }
4176 
4177 static void intel_iommu_domain_free(struct iommu_domain *domain)
4178 {
4179 	if (domain != &si_domain->domain)
4180 		domain_exit(to_dmar_domain(domain));
4181 }
4182 
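/*
 * Before an attach, make sure the domain fits the IOMMU that serves @dev:
 * fail if addresses already mapped exceed the supported width, and trim
 * unused upper page-table levels until the domain's AGAW is no larger
 * than the IOMMU's.
 */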
4183 static int prepare_domain_attach_device(struct iommu_domain *domain,
4184 					struct device *dev)
4185 {
4186 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4187 	struct intel_iommu *iommu;
4188 	int addr_width;
4189 
4190 	iommu = device_to_iommu(dev, NULL, NULL);
4191 	if (!iommu)
4192 		return -ENODEV;
4193 
4194 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4195 		return -EOPNOTSUPP;
4196 
4197 	/* check if this iommu agaw is sufficient for max mapped address */
4198 	addr_width = agaw_to_width(iommu->agaw);
4199 	if (addr_width > cap_mgaw(iommu->cap))
4200 		addr_width = cap_mgaw(iommu->cap);
4201 
4202 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4203 		dev_err(dev, "%s: iommu width (%d) is not "
4204 		dev_err(dev,
4205 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4206 			__func__, addr_width, dmar_domain->max_addr);
4207 	}
4208 	dmar_domain->gaw = addr_width;
4209 
4210 	/*
4211 	 * Knock out extra levels of page tables if necessary
4212 	 */
4213 	while (iommu->agaw < dmar_domain->agaw) {
4214 		struct dma_pte *pte;
4215 
4216 		pte = dmar_domain->pgd;
4217 		if (dma_pte_present(pte)) {
4218 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4219 			free_pgtable_page(pte);
4220 		}
4221 		dmar_domain->agaw--;
4222 	}
4223 
4224 	return 0;
4225 }
4226 
4227 static int intel_iommu_attach_device(struct iommu_domain *domain,
4228 				     struct device *dev)
4229 {
4230 	int ret;
4231 
4232 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4233 	    device_is_rmrr_locked(dev)) {
4234 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4235 		return -EPERM;
4236 	}
4237 
4238 	/* normally dev is not mapped */
4239 	if (unlikely(domain_context_mapped(dev))) {
4240 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4241 
4242 		if (info->domain)
4243 			dmar_remove_one_dev_info(dev);
4244 	}
4245 
4246 	ret = prepare_domain_attach_device(domain, dev);
4247 	if (ret)
4248 		return ret;
4249 
4250 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4251 }
4252 
4253 static void intel_iommu_detach_device(struct iommu_domain *domain,
4254 				      struct device *dev)
4255 {
4256 	dmar_remove_one_dev_info(dev);
4257 }
4258 
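/*
 * Map a contiguous physical range into the domain.  The IOMMU_* protection
 * flags are translated to DMA PTE bits (plus the snoop bit when the domain
 * demands it) and the domain's maximum mapped address is grown as needed,
 * as long as it still fits within the domain's address width.
 */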
4259 static int intel_iommu_map(struct iommu_domain *domain,
4260 			   unsigned long iova, phys_addr_t hpa,
4261 			   size_t size, int iommu_prot, gfp_t gfp)
4262 {
4263 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4264 	u64 max_addr;
4265 	int prot = 0;
4266 
4267 	if (iommu_prot & IOMMU_READ)
4268 		prot |= DMA_PTE_READ;
4269 	if (iommu_prot & IOMMU_WRITE)
4270 		prot |= DMA_PTE_WRITE;
4271 	if (dmar_domain->set_pte_snp)
4272 		prot |= DMA_PTE_SNP;
4273 
4274 	max_addr = iova + size;
4275 	if (dmar_domain->max_addr < max_addr) {
4276 		u64 end;
4277 
4278 		/* check if minimum agaw is sufficient for mapped address */
4279 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4280 		if (end < max_addr) {
4281 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4282 			       __func__, dmar_domain->gaw,
4283 			       max_addr);
4284 			return -EFAULT;
4285 		}
4286 		dmar_domain->max_addr = max_addr;
4287 	}
4288 	/* Round up size to next multiple of PAGE_SIZE, if it and
4289 	   the low bits of hpa would take us onto the next page */
4290 	size = aligned_nrpages(hpa, size);
4291 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4292 				hpa >> VTD_PAGE_SHIFT, size, prot);
4293 }
4294 
4295 static int intel_iommu_map_pages(struct iommu_domain *domain,
4296 				 unsigned long iova, phys_addr_t paddr,
4297 				 size_t pgsize, size_t pgcount,
4298 				 int prot, gfp_t gfp, size_t *mapped)
4299 {
4300 	unsigned long pgshift = __ffs(pgsize);
4301 	size_t size = pgcount << pgshift;
4302 	int ret;
4303 
4304 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4305 		return -EINVAL;
4306 
4307 	if (!IS_ALIGNED(iova | paddr, pgsize))
4308 		return -EINVAL;
4309 
4310 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4311 	if (!ret && mapped)
4312 		*mapped = size;
4313 
4314 	return ret;
4315 }
4316 
4317 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4318 				unsigned long iova, size_t size,
4319 				struct iommu_iotlb_gather *gather)
4320 {
4321 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4322 	unsigned long start_pfn, last_pfn;
4323 	int level = 0;
4324 
4325 	/* Cope with horrid API which requires us to unmap more than the
4326 	   size argument if it happens to be a large-page mapping. */
4327 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4328 
4329 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4330 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4331 
4332 	start_pfn = iova >> VTD_PAGE_SHIFT;
4333 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4334 
4335 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4336 
4337 	if (dmar_domain->max_addr == iova + size)
4338 		dmar_domain->max_addr = iova;
4339 
4340 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4341 
4342 	return size;
4343 }
4344 
4345 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4346 				      unsigned long iova,
4347 				      size_t pgsize, size_t pgcount,
4348 				      struct iommu_iotlb_gather *gather)
4349 {
4350 	unsigned long pgshift = __ffs(pgsize);
4351 	size_t size = pgcount << pgshift;
4352 
4353 	return intel_iommu_unmap(domain, iova, size, gather);
4354 }
4355 
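/*
 * Flush the IOTLB on every IOMMU this domain is attached to for the range
 * accumulated in the gather structure, then release the page-table pages
 * collected while unmapping.
 */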
4356 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4357 				 struct iommu_iotlb_gather *gather)
4358 {
4359 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4360 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4361 	size_t size = gather->end - gather->start;
4362 	struct iommu_domain_info *info;
4363 	unsigned long start_pfn;
4364 	unsigned long nrpages;
4365 	unsigned long i;
4366 
4367 	nrpages = aligned_nrpages(gather->start, size);
4368 	start_pfn = mm_to_dma_pfn(iova_pfn);
4369 
4370 	xa_for_each(&dmar_domain->iommu_array, i, info)
4371 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4372 				      start_pfn, nrpages,
4373 				      list_empty(&gather->freelist), 0);
4374 
4375 	put_pages_list(&gather->freelist);
4376 }
4377 
4378 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4379 					    dma_addr_t iova)
4380 {
4381 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4382 	struct dma_pte *pte;
4383 	int level = 0;
4384 	u64 phys = 0;
4385 
4386 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4387 	if (pte && dma_pte_present(pte))
4388 		phys = dma_pte_addr(pte) +
4389 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4390 						VTD_PAGE_SHIFT) - 1));
4391 
4392 	return phys;
4393 }
4394 
4395 static bool domain_support_force_snooping(struct dmar_domain *domain)
4396 {
4397 	struct device_domain_info *info;
4398 	bool support = true;
4399 
4400 	assert_spin_locked(&domain->lock);
4401 	list_for_each_entry(info, &domain->devices, link) {
4402 		if (!ecap_sc_support(info->iommu->ecap)) {
4403 			support = false;
4404 			break;
4405 		}
4406 	}
4407 
4408 	return support;
4409 }
4410 
4411 static void domain_set_force_snooping(struct dmar_domain *domain)
4412 {
4413 	struct device_domain_info *info;
4414 
4415 	assert_spin_locked(&domain->lock);
4416 	/*
4417 	 * Second level page table supports per-PTE snoop control. The
4418 	 * iommu_map() interface will handle this by setting SNP bit.
4419 	 */
4420 	if (!domain_use_first_level(domain)) {
4421 		domain->set_pte_snp = true;
4422 		return;
4423 	}
4424 
4425 	list_for_each_entry(info, &domain->devices, link)
4426 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4427 						     PASID_RID2PASID);
4428 }
4429 
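/*
 * Enforce cache coherency (snooping) for all DMA through this domain.
 * This succeeds only if every IOMMU currently serving the domain supports
 * snoop control; once set, force_snooping is never cleared again.
 */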
4430 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4431 {
4432 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4433 	unsigned long flags;
4434 
4435 	if (dmar_domain->force_snooping)
4436 		return true;
4437 
4438 	spin_lock_irqsave(&dmar_domain->lock, flags);
4439 	if (!domain_support_force_snooping(dmar_domain)) {
4440 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4441 		return false;
4442 	}
4443 
4444 	domain_set_force_snooping(dmar_domain);
4445 	dmar_domain->force_snooping = true;
4446 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4447 
4448 	return true;
4449 }
4450 
4451 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4452 {
4453 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4454 		return true;
4455 	if (cap == IOMMU_CAP_INTR_REMAP)
4456 		return irq_remapping_enabled == 1;
4457 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4458 		return dmar_platform_optin();
4459 
4460 	return false;
4461 }
4462 
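/*
 * Called by the IOMMU core for each new device: find the IOMMU that serves
 * it, record its bus/devfn/segment, and probe optional PCI capabilities
 * (ATS, PASID, PRI) before handing the iommu_device back to the core.
 */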
4463 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4464 {
4465 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4466 	struct device_domain_info *info;
4467 	struct intel_iommu *iommu;
4468 	u8 bus, devfn;
4469 
4470 	iommu = device_to_iommu(dev, &bus, &devfn);
4471 	if (!iommu || !iommu->iommu.ops)
4472 		return ERR_PTR(-ENODEV);
4473 
4474 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4475 	if (!info)
4476 		return ERR_PTR(-ENOMEM);
4477 
4478 	if (dev_is_real_dma_subdevice(dev)) {
4479 		info->bus = pdev->bus->number;
4480 		info->devfn = pdev->devfn;
4481 		info->segment = pci_domain_nr(pdev->bus);
4482 	} else {
4483 		info->bus = bus;
4484 		info->devfn = devfn;
4485 		info->segment = iommu->segment;
4486 	}
4487 
4488 	info->dev = dev;
4489 	info->iommu = iommu;
4490 	if (dev_is_pci(dev)) {
4491 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4492 		    pci_ats_supported(pdev) &&
4493 		    dmar_ats_supported(pdev, iommu))
4494 			info->ats_supported = 1;
4495 
4496 		if (sm_supported(iommu)) {
4497 			if (pasid_supported(iommu)) {
4498 				int features = pci_pasid_features(pdev);
4499 
4500 				if (features >= 0)
4501 					info->pasid_supported = features | 1;
4502 			}
4503 
4504 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4505 			    pci_pri_supported(pdev))
4506 				info->pri_supported = 1;
4507 		}
4508 	}
4509 
4510 	dev_iommu_priv_set(dev, info);
4511 
4512 	return &iommu->iommu;
4513 }
4514 
4515 static void intel_iommu_release_device(struct device *dev)
4516 {
4517 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4518 
4519 	dmar_remove_one_dev_info(dev);
4520 	dev_iommu_priv_set(dev, NULL);
4521 	kfree(info);
4522 	set_dma_ops(dev, NULL);
4523 }
4524 
4525 static void intel_iommu_probe_finalize(struct device *dev)
4526 {
4527 	set_dma_ops(dev, NULL);
4528 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4529 }
4530 
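/*
 * Report the reserved regions of a device: any RMRR that targets it
 * (direct or relaxable), a relaxable direct mapping of the first 16MB for
 * ISA bridges when the floppy workaround is configured, and the
 * IOAPIC/MSI range.
 */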
4531 static void intel_iommu_get_resv_regions(struct device *device,
4532 					 struct list_head *head)
4533 {
4534 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4535 	struct iommu_resv_region *reg;
4536 	struct dmar_rmrr_unit *rmrr;
4537 	struct device *i_dev;
4538 	int i;
4539 
4540 	rcu_read_lock();
4541 	for_each_rmrr_units(rmrr) {
4542 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4543 					  i, i_dev) {
4544 			struct iommu_resv_region *resv;
4545 			enum iommu_resv_type type;
4546 			size_t length;
4547 
4548 			if (i_dev != device &&
4549 			    !is_downstream_to_pci_bridge(device, i_dev))
4550 				continue;
4551 
4552 			length = rmrr->end_address - rmrr->base_address + 1;
4553 
4554 			type = device_rmrr_is_relaxable(device) ?
4555 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4556 
4557 			resv = iommu_alloc_resv_region(rmrr->base_address,
4558 						       length, prot, type,
4559 						       GFP_ATOMIC);
4560 			if (!resv)
4561 				break;
4562 
4563 			list_add_tail(&resv->list, head);
4564 		}
4565 	}
4566 	rcu_read_unlock();
4567 
4568 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4569 	if (dev_is_pci(device)) {
4570 		struct pci_dev *pdev = to_pci_dev(device);
4571 
4572 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4573 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4574 					IOMMU_RESV_DIRECT_RELAXABLE,
4575 					GFP_KERNEL);
4576 			if (reg)
4577 				list_add_tail(&reg->list, head);
4578 		}
4579 	}
4580 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4581 
4582 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4583 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4584 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4585 	if (!reg)
4586 		return;
4587 	list_add_tail(&reg->list, head);
4588 }
4589 
4590 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4591 {
4592 	if (dev_is_pci(dev))
4593 		return pci_device_group(dev);
4594 	return generic_device_group(dev);
4595 }
4596 
4597 static int intel_iommu_enable_sva(struct device *dev)
4598 {
4599 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4600 	struct intel_iommu *iommu;
4601 	int ret;
4602 
4603 	if (!info || dmar_disabled)
4604 		return -EINVAL;
4605 
4606 	iommu = info->iommu;
4607 	if (!iommu)
4608 		return -EINVAL;
4609 
4610 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4611 		return -ENODEV;
4612 
4613 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4614 		return -EINVAL;
4615 
4616 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4617 	if (!ret)
4618 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4619 
4620 	return ret;
4621 }
4622 
4623 static int intel_iommu_disable_sva(struct device *dev)
4624 {
4625 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4626 	struct intel_iommu *iommu = info->iommu;
4627 	int ret;
4628 
4629 	ret = iommu_unregister_device_fault_handler(dev);
4630 	if (!ret)
4631 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4632 
4633 	return ret;
4634 }
4635 
4636 static int intel_iommu_enable_iopf(struct device *dev)
4637 {
4638 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4639 
4640 	if (info && info->pri_supported)
4641 		return 0;
4642 
4643 	return -ENODEV;
4644 }
4645 
4646 static int
4647 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4648 {
4649 	switch (feat) {
4650 	case IOMMU_DEV_FEAT_IOPF:
4651 		return intel_iommu_enable_iopf(dev);
4652 
4653 	case IOMMU_DEV_FEAT_SVA:
4654 		return intel_iommu_enable_sva(dev);
4655 
4656 	default:
4657 		return -ENODEV;
4658 	}
4659 }
4660 
4661 static int
4662 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4663 {
4664 	switch (feat) {
4665 	case IOMMU_DEV_FEAT_IOPF:
4666 		return 0;
4667 
4668 	case IOMMU_DEV_FEAT_SVA:
4669 		return intel_iommu_disable_sva(dev);
4670 
4671 	default:
4672 		return -ENODEV;
4673 	}
4674 }
4675 
4676 static bool intel_iommu_is_attach_deferred(struct device *dev)
4677 {
4678 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4679 
4680 	return translation_pre_enabled(info->iommu) && !info->domain;
4681 }
4682 
4683 /*
4684  * Check that the device does not live on an external facing PCI port that is
4685  * marked as untrusted. Such devices should not be able to apply quirks and
4686  * thus not be able to bypass the IOMMU restrictions.
4687  */
4688 static bool risky_device(struct pci_dev *pdev)
4689 {
4690 	if (pdev->untrusted) {
4691 		pci_info(pdev,
4692 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4693 			 pdev->vendor, pdev->device);
4694 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4695 		return true;
4696 	}
4697 	return false;
4698 }
4699 
4700 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4701 				       unsigned long iova, size_t size)
4702 {
4703 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4704 	unsigned long pages = aligned_nrpages(iova, size);
4705 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4706 	struct iommu_domain_info *info;
4707 	unsigned long i;
4708 
4709 	xa_for_each(&dmar_domain->iommu_array, i, info)
4710 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4711 }
4712 
4713 const struct iommu_ops intel_iommu_ops = {
4714 	.capable		= intel_iommu_capable,
4715 	.domain_alloc		= intel_iommu_domain_alloc,
4716 	.probe_device		= intel_iommu_probe_device,
4717 	.probe_finalize		= intel_iommu_probe_finalize,
4718 	.release_device		= intel_iommu_release_device,
4719 	.get_resv_regions	= intel_iommu_get_resv_regions,
4720 	.device_group		= intel_iommu_device_group,
4721 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4722 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4723 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4724 	.def_domain_type	= device_def_domain_type,
4725 	.pgsize_bitmap		= SZ_4K,
4726 #ifdef CONFIG_INTEL_IOMMU_SVM
4727 	.sva_bind		= intel_svm_bind,
4728 	.sva_unbind		= intel_svm_unbind,
4729 	.sva_get_pasid		= intel_svm_get_pasid,
4730 	.page_response		= intel_svm_page_response,
4731 #endif
4732 	.default_domain_ops = &(const struct iommu_domain_ops) {
4733 		.attach_dev		= intel_iommu_attach_device,
4734 		.detach_dev		= intel_iommu_detach_device,
4735 		.map_pages		= intel_iommu_map_pages,
4736 		.unmap_pages		= intel_iommu_unmap_pages,
4737 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4738 		.flush_iotlb_all        = intel_flush_iotlb_all,
4739 		.iotlb_sync		= intel_iommu_tlb_sync,
4740 		.iova_to_phys		= intel_iommu_iova_to_phys,
4741 		.free			= intel_iommu_domain_free,
4742 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4743 	}
4744 };
4745 
4746 static void quirk_iommu_igfx(struct pci_dev *dev)
4747 {
4748 	if (risky_device(dev))
4749 		return;
4750 
4751 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4752 	dmar_map_gfx = 0;
4753 }
4754 
4755 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4756 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4757 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4758 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4759 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4760 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4763 
4764 /* Broadwell igfx malfunctions with dmar */
4765 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4766 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4774 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4789 
4790 static void quirk_iommu_rwbf(struct pci_dev *dev)
4791 {
4792 	if (risky_device(dev))
4793 		return;
4794 
4795 	/*
4796 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4797 	 * but needs it. Same seems to hold for the desktop versions.
4798 	 */
4799 	pci_info(dev, "Forcing write-buffer flush capability\n");
4800 	rwbf_quirk = 1;
4801 }
4802 
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4810 
4811 #define GGC 0x52
4812 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4813 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4814 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4815 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4816 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4817 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4818 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4819 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4820 
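/*
 * On the Calpella platform (Ironlake graphics) the BIOS must have
 * allocated memory for a shadow GTT (reported in the GGC register above)
 * before graphics DMA can be remapped.  If it has not, keep the IOMMU
 * disabled for graphics; otherwise force strict IOTLB flushing so the
 * graphics device is idle before any flush.
 */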
4821 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4822 {
4823 	unsigned short ggc;
4824 
4825 	if (risky_device(dev))
4826 		return;
4827 
4828 	if (pci_read_config_word(dev, GGC, &ggc))
4829 		return;
4830 
4831 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4832 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4833 		dmar_map_gfx = 0;
4834 	} else if (dmar_map_gfx) {
4835 		/* we have to ensure the gfx device is idle before we flush */
4836 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4837 		iommu_set_dma_strict();
4838 	}
4839 }
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4844 
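/*
 * For the integrated graphics device IDs matched below, skip disabling
 * translation (the TE bit) for graphics: set iommu_skip_te_disable, which
 * is honoured when translation would otherwise be switched off.
 */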
4845 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4846 {
4847 	unsigned short ver;
4848 
4849 	if (!IS_GFX_DEVICE(dev))
4850 		return;
4851 
4852 	ver = (dev->device >> 8) & 0xff;
4853 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4854 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4855 	    ver != 0x9a && ver != 0xa7)
4856 		return;
4857 
4858 	if (risky_device(dev))
4859 		return;
4860 
4861 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4862 	iommu_skip_te_disable = 1;
4863 }
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4865 
4866 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4867    ISOCH DMAR unit for the Azalia sound device, but not give it any
4868    TLB entries, which causes it to deadlock. Check for that.  We do
4869    this in a function called from init_dmars(), instead of in a PCI
4870    quirk, because we don't want to print the obnoxious "BIOS broken"
4871    message if VT-d is actually disabled.
4872 */
4873 static void __init check_tylersburg_isoch(void)
4874 {
4875 	struct pci_dev *pdev;
4876 	uint32_t vtisochctrl;
4877 
4878 	/* If there's no Azalia in the system anyway, forget it. */
4879 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4880 	if (!pdev)
4881 		return;
4882 
4883 	if (risky_device(pdev)) {
4884 		pci_dev_put(pdev);
4885 		return;
4886 	}
4887 
4888 	pci_dev_put(pdev);
4889 
4890 	/* System Management Registers. Might be hidden, in which case
4891 	   we can't do the sanity check. But that's OK, because the
4892 	   known-broken BIOSes _don't_ actually hide it, so far. */
4893 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4894 	if (!pdev)
4895 		return;
4896 
4897 	if (risky_device(pdev)) {
4898 		pci_dev_put(pdev);
4899 		return;
4900 	}
4901 
4902 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4903 		pci_dev_put(pdev);
4904 		return;
4905 	}
4906 
4907 	pci_dev_put(pdev);
4908 
4909 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4910 	if (vtisochctrl & 1)
4911 		return;
4912 
4913 	/* Drop all bits other than the number of TLB entries */
4914 	vtisochctrl &= 0x1c;
4915 
4916 	/* If we have the recommended number of TLB entries (16), fine. */
4917 	if (vtisochctrl == 0x10)
4918 		return;
4919 
4920 	/* Zero TLB entries? You get to ride the short bus to school. */
4921 	if (!vtisochctrl) {
4922 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4923 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4924 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4925 		     dmi_get_system_info(DMI_BIOS_VERSION),
4926 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4927 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4928 		return;
4929 	}
4930 
4931 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4932 	       vtisochctrl);
4933 }
4934