xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision f0f217ba)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
238 /*
239  * This domain is a statically identity mapping domain.
240  *	1. This domain creats a static 1:1 mapping to all usable memory.
241  * 	2. It maps to each iommu if successful.
242  *	3. Each iommu mapps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void dmar_remove_one_dev_info(struct device *dev);
281 
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284 
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287 
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292 
293 #define IDENTMAP_GFX		2
294 #define IDENTMAP_AZALIA		4
295 
296 const struct iommu_ops intel_iommu_ops;
297 
298 static bool translation_pre_enabled(struct intel_iommu *iommu)
299 {
300 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
301 }
302 
303 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
306 }
307 
308 static void init_translation_status(struct intel_iommu *iommu)
309 {
310 	u32 gsts;
311 
312 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
313 	if (gsts & DMA_GSTS_TES)
314 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
315 }
316 
317 static int __init intel_iommu_setup(char *str)
318 {
319 	if (!str)
320 		return -EINVAL;
321 
322 	while (*str) {
323 		if (!strncmp(str, "on", 2)) {
324 			dmar_disabled = 0;
325 			pr_info("IOMMU enabled\n");
326 		} else if (!strncmp(str, "off", 3)) {
327 			dmar_disabled = 1;
328 			no_platform_optin = 1;
329 			pr_info("IOMMU disabled\n");
330 		} else if (!strncmp(str, "igfx_off", 8)) {
331 			dmar_map_gfx = 0;
332 			pr_info("Disable GFX device mapping\n");
333 		} else if (!strncmp(str, "forcedac", 8)) {
334 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335 			iommu_dma_forcedac = true;
336 		} else if (!strncmp(str, "strict", 6)) {
337 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338 			iommu_set_dma_strict();
339 		} else if (!strncmp(str, "sp_off", 6)) {
340 			pr_info("Disable supported super page\n");
341 			intel_iommu_superpage = 0;
342 		} else if (!strncmp(str, "sm_on", 5)) {
343 			pr_info("Enable scalable mode if hardware supports\n");
344 			intel_iommu_sm = 1;
345 		} else if (!strncmp(str, "sm_off", 6)) {
346 			pr_info("Scalable mode is disallowed\n");
347 			intel_iommu_sm = 0;
348 		} else if (!strncmp(str, "tboot_noforce", 13)) {
349 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350 			intel_iommu_tboot_noforce = 1;
351 		} else {
352 			pr_notice("Unknown option - '%s'\n", str);
353 		}
354 
355 		str += strcspn(str, ",");
356 		while (*str == ',')
357 			str++;
358 	}
359 
360 	return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
363 
364 void *alloc_pgtable_page(int node)
365 {
366 	struct page *page;
367 	void *vaddr = NULL;
368 
369 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
370 	if (page)
371 		vaddr = page_address(page);
372 	return vaddr;
373 }
374 
375 void free_pgtable_page(void *vaddr)
376 {
377 	free_page((unsigned long)vaddr);
378 }
379 
380 static inline int domain_type_is_si(struct dmar_domain *domain)
381 {
382 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
383 }
384 
385 static inline bool domain_use_first_level(struct dmar_domain *domain)
386 {
387 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
388 }
389 
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 				       unsigned long pfn)
392 {
393 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394 
395 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397 
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 	unsigned long fl_sagaw, sl_sagaw;
406 
407 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 	sl_sagaw = cap_sagaw(iommu->cap);
409 
410 	/* Second level only. */
411 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 		return sl_sagaw;
413 
414 	/* First level only. */
415 	if (!ecap_slts(iommu->ecap))
416 		return fl_sagaw;
417 
418 	return fl_sagaw & sl_sagaw;
419 }
420 
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 	unsigned long sagaw;
424 	int agaw;
425 
426 	sagaw = __iommu_calculate_sagaw(iommu);
427 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 		if (test_bit(agaw, &sagaw))
429 			break;
430 	}
431 
432 	return agaw;
433 }
434 
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442 
443 /*
444  * calculate agaw for each iommu.
445  * "SAGAW" may be different across iommus, use a default agaw, and
446  * get a supported less agaw for iommus that don't support the default agaw.
447  */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452 
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 	return sm_supported(iommu) ?
456 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458 
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 	struct iommu_domain_info *info;
462 	struct dmar_drhd_unit *drhd;
463 	struct intel_iommu *iommu;
464 	bool found = false;
465 	unsigned long i;
466 
467 	domain->iommu_coherency = true;
468 	xa_for_each(&domain->iommu_array, i, info) {
469 		found = true;
470 		if (!iommu_paging_structure_coherency(info->iommu)) {
471 			domain->iommu_coherency = false;
472 			break;
473 		}
474 	}
475 	if (found)
476 		return;
477 
478 	/* No hardware attached; use lowest common denominator */
479 	rcu_read_lock();
480 	for_each_active_iommu(iommu, drhd) {
481 		if (!iommu_paging_structure_coherency(iommu)) {
482 			domain->iommu_coherency = false;
483 			break;
484 		}
485 	}
486 	rcu_read_unlock();
487 }
488 
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 					 struct intel_iommu *skip)
491 {
492 	struct dmar_drhd_unit *drhd;
493 	struct intel_iommu *iommu;
494 	int mask = 0x3;
495 
496 	if (!intel_iommu_superpage)
497 		return 0;
498 
499 	/* set iommu_superpage to the smallest common denominator */
500 	rcu_read_lock();
501 	for_each_active_iommu(iommu, drhd) {
502 		if (iommu != skip) {
503 			if (domain && domain_use_first_level(domain)) {
504 				if (!cap_fl1gp_support(iommu->cap))
505 					mask = 0x1;
506 			} else {
507 				mask &= cap_super_page_val(iommu->cap);
508 			}
509 
510 			if (!mask)
511 				break;
512 		}
513 	}
514 	rcu_read_unlock();
515 
516 	return fls(mask);
517 }
518 
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 	struct device_domain_info *info;
522 	int nid = NUMA_NO_NODE;
523 	unsigned long flags;
524 
525 	spin_lock_irqsave(&domain->lock, flags);
526 	list_for_each_entry(info, &domain->devices, link) {
527 		/*
528 		 * There could possibly be multiple device numa nodes as devices
529 		 * within the same domain may sit behind different IOMMUs. There
530 		 * isn't perfect answer in such situation, so we select first
531 		 * come first served policy.
532 		 */
533 		nid = dev_to_node(info->dev);
534 		if (nid != NUMA_NO_NODE)
535 			break;
536 	}
537 	spin_unlock_irqrestore(&domain->lock, flags);
538 
539 	return nid;
540 }
541 
542 static void domain_update_iotlb(struct dmar_domain *domain);
543 
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 	unsigned long bitmap = 0;
548 
549 	/*
550 	 * 1-level super page supports page size of 2MiB, 2-level super page
551 	 * supports page size of both 2MiB and 1GiB.
552 	 */
553 	if (domain->iommu_superpage == 1)
554 		bitmap |= SZ_2M;
555 	else if (domain->iommu_superpage == 2)
556 		bitmap |= SZ_2M | SZ_1G;
557 
558 	return bitmap;
559 }
560 
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 	domain_update_iommu_coherency(domain);
565 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566 
567 	/*
568 	 * If RHSA is missing, we should default to the device numa domain
569 	 * as fall back.
570 	 */
571 	if (domain->nid == NUMA_NO_NODE)
572 		domain->nid = domain_update_device_node(domain);
573 
574 	/*
575 	 * First-level translation restricts the input-address to a
576 	 * canonical address (i.e., address bits 63:N have the same
577 	 * value as address bit [N-1], where N is 48-bits with 4-level
578 	 * paging and 57-bits with 5-level paging). Hence, skip bit
579 	 * [N-1].
580 	 */
581 	if (domain_use_first_level(domain))
582 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 	else
584 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585 
586 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 	domain_update_iotlb(domain);
588 }
589 
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 					 u8 devfn, int alloc)
592 {
593 	struct root_entry *root = &iommu->root_entry[bus];
594 	struct context_entry *context;
595 	u64 *entry;
596 
597 	/*
598 	 * Except that the caller requested to allocate a new entry,
599 	 * returning a copied context entry makes no sense.
600 	 */
601 	if (!alloc && context_copied(iommu, bus, devfn))
602 		return NULL;
603 
604 	entry = &root->lo;
605 	if (sm_supported(iommu)) {
606 		if (devfn >= 0x80) {
607 			devfn -= 0x80;
608 			entry = &root->hi;
609 		}
610 		devfn *= 2;
611 	}
612 	if (*entry & 1)
613 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 	else {
615 		unsigned long phy_addr;
616 		if (!alloc)
617 			return NULL;
618 
619 		context = alloc_pgtable_page(iommu->node);
620 		if (!context)
621 			return NULL;
622 
623 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 		phy_addr = virt_to_phys((void *)context);
625 		*entry = phy_addr | 1;
626 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
627 	}
628 	return &context[devfn];
629 }
630 
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *				 sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 	struct pci_dev *pdev, *pbridge;
643 
644 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 		return false;
646 
647 	pdev = to_pci_dev(dev);
648 	pbridge = to_pci_dev(bridge);
649 
650 	if (pbridge->subordinate &&
651 	    pbridge->subordinate->number <= pdev->bus->number &&
652 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 		return true;
654 
655 	return false;
656 }
657 
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 	struct dmar_drhd_unit *drhd;
661 	u32 vtbar;
662 	int rc;
663 
664 	/* We know that this device on this chipset has its own IOMMU.
665 	 * If we find it under a different IOMMU, then the BIOS is lying
666 	 * to us. Hope that the IOMMU for this device is actually
667 	 * disabled, and it needs no translation...
668 	 */
669 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 	if (rc) {
671 		/* "can't" happen */
672 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 		return false;
674 	}
675 	vtbar &= 0xffff0000;
676 
677 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
678 	drhd = dmar_find_matched_drhd_unit(pdev);
679 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 		return true;
683 	}
684 
685 	return false;
686 }
687 
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 	if (!iommu || iommu->drhd->ignored)
691 		return true;
692 
693 	if (dev_is_pci(dev)) {
694 		struct pci_dev *pdev = to_pci_dev(dev);
695 
696 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 		    quirk_ioat_snb_local_iommu(pdev))
699 			return true;
700 	}
701 
702 	return false;
703 }
704 
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 	struct dmar_drhd_unit *drhd = NULL;
708 	struct pci_dev *pdev = NULL;
709 	struct intel_iommu *iommu;
710 	struct device *tmp;
711 	u16 segment = 0;
712 	int i;
713 
714 	if (!dev)
715 		return NULL;
716 
717 	if (dev_is_pci(dev)) {
718 		struct pci_dev *pf_pdev;
719 
720 		pdev = pci_real_dma_dev(to_pci_dev(dev));
721 
722 		/* VFs aren't listed in scope tables; we need to look up
723 		 * the PF instead to find the IOMMU. */
724 		pf_pdev = pci_physfn(pdev);
725 		dev = &pf_pdev->dev;
726 		segment = pci_domain_nr(pdev->bus);
727 	} else if (has_acpi_companion(dev))
728 		dev = &ACPI_COMPANION(dev)->dev;
729 
730 	rcu_read_lock();
731 	for_each_iommu(iommu, drhd) {
732 		if (pdev && segment != drhd->segment)
733 			continue;
734 
735 		for_each_active_dev_scope(drhd->devices,
736 					  drhd->devices_cnt, i, tmp) {
737 			if (tmp == dev) {
738 				/* For a VF use its original BDF# not that of the PF
739 				 * which we used for the IOMMU lookup. Strictly speaking
740 				 * we could do this for all PCI devices; we only need to
741 				 * get the BDF# from the scope table for ACPI matches. */
742 				if (pdev && pdev->is_virtfn)
743 					goto got_pdev;
744 
745 				if (bus && devfn) {
746 					*bus = drhd->devices[i].bus;
747 					*devfn = drhd->devices[i].devfn;
748 				}
749 				goto out;
750 			}
751 
752 			if (is_downstream_to_pci_bridge(dev, tmp))
753 				goto got_pdev;
754 		}
755 
756 		if (pdev && drhd->include_all) {
757 got_pdev:
758 			if (bus && devfn) {
759 				*bus = pdev->bus->number;
760 				*devfn = pdev->devfn;
761 			}
762 			goto out;
763 		}
764 	}
765 	iommu = NULL;
766 out:
767 	if (iommu_is_dummy(iommu, dev))
768 		iommu = NULL;
769 
770 	rcu_read_unlock();
771 
772 	return iommu;
773 }
774 
775 static void domain_flush_cache(struct dmar_domain *domain,
776 			       void *addr, int size)
777 {
778 	if (!domain->iommu_coherency)
779 		clflush_cache_range(addr, size);
780 }
781 
782 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784 	struct context_entry *context;
785 	int ret = 0;
786 
787 	spin_lock(&iommu->lock);
788 	context = iommu_context_addr(iommu, bus, devfn, 0);
789 	if (context)
790 		ret = context_present(context);
791 	spin_unlock(&iommu->lock);
792 	return ret;
793 }
794 
795 static void free_context_table(struct intel_iommu *iommu)
796 {
797 	struct context_entry *context;
798 	int i;
799 
800 	if (!iommu->root_entry)
801 		return;
802 
803 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
804 		context = iommu_context_addr(iommu, i, 0, 0);
805 		if (context)
806 			free_pgtable_page(context);
807 
808 		if (!sm_supported(iommu))
809 			continue;
810 
811 		context = iommu_context_addr(iommu, i, 0x80, 0);
812 		if (context)
813 			free_pgtable_page(context);
814 	}
815 
816 	free_pgtable_page(iommu->root_entry);
817 	iommu->root_entry = NULL;
818 }
819 
820 #ifdef CONFIG_DMAR_DEBUG
821 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
822 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824 	struct dma_pte *pte;
825 	int offset;
826 
827 	while (1) {
828 		offset = pfn_level_offset(pfn, level);
829 		pte = &parent[offset];
830 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831 			pr_info("PTE not present at level %d\n", level);
832 			break;
833 		}
834 
835 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836 
837 		if (level == 1)
838 			break;
839 
840 		parent = phys_to_virt(dma_pte_addr(pte));
841 		level--;
842 	}
843 }
844 
845 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
846 			  unsigned long long addr, u32 pasid)
847 {
848 	struct pasid_dir_entry *dir, *pde;
849 	struct pasid_entry *entries, *pte;
850 	struct context_entry *ctx_entry;
851 	struct root_entry *rt_entry;
852 	int i, dir_index, index, level;
853 	u8 devfn = source_id & 0xff;
854 	u8 bus = source_id >> 8;
855 	struct dma_pte *pgtable;
856 
857 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858 
859 	/* root entry dump */
860 	rt_entry = &iommu->root_entry[bus];
861 	if (!rt_entry) {
862 		pr_info("root table entry is not present\n");
863 		return;
864 	}
865 
866 	if (sm_supported(iommu))
867 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868 			rt_entry->hi, rt_entry->lo);
869 	else
870 		pr_info("root entry: 0x%016llx", rt_entry->lo);
871 
872 	/* context entry dump */
873 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874 	if (!ctx_entry) {
875 		pr_info("context table entry is not present\n");
876 		return;
877 	}
878 
879 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880 		ctx_entry->hi, ctx_entry->lo);
881 
882 	/* legacy mode does not require PASID entries */
883 	if (!sm_supported(iommu)) {
884 		level = agaw_to_level(ctx_entry->hi & 7);
885 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886 		goto pgtable_walk;
887 	}
888 
889 	/* get the pointer to pasid directory entry */
890 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891 	if (!dir) {
892 		pr_info("pasid directory entry is not present\n");
893 		return;
894 	}
895 	/* For request-without-pasid, get the pasid from context entry */
896 	if (intel_iommu_sm && pasid == INVALID_IOASID)
897 		pasid = PASID_RID2PASID;
898 
899 	dir_index = pasid >> PASID_PDE_SHIFT;
900 	pde = &dir[dir_index];
901 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902 
903 	/* get the pointer to the pasid table entry */
904 	entries = get_pasid_table_from_pde(pde);
905 	if (!entries) {
906 		pr_info("pasid table entry is not present\n");
907 		return;
908 	}
909 	index = pasid & PASID_PTE_MASK;
910 	pte = &entries[index];
911 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 				      unsigned long pfn, int *target_level)
929 {
930 	struct dma_pte *parent, *pte;
931 	int level = agaw_to_level(domain->agaw);
932 	int offset;
933 
934 	BUG_ON(!domain->pgd);
935 
936 	if (!domain_pfn_supported(domain, pfn))
937 		/* Address beyond IOMMU's addressing capabilities. */
938 		return NULL;
939 
940 	parent = domain->pgd;
941 
942 	while (1) {
943 		void *tmp_page;
944 
945 		offset = pfn_level_offset(pfn, level);
946 		pte = &parent[offset];
947 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948 			break;
949 		if (level == *target_level)
950 			break;
951 
952 		if (!dma_pte_present(pte)) {
953 			uint64_t pteval;
954 
955 			tmp_page = alloc_pgtable_page(domain->nid);
956 
957 			if (!tmp_page)
958 				return NULL;
959 
960 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962 			if (domain_use_first_level(domain)) {
963 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
964 				if (iommu_is_dma_domain(&domain->domain))
965 					pteval |= DMA_FL_PTE_ACCESS;
966 			}
967 			if (cmpxchg64(&pte->val, 0ULL, pteval))
968 				/* Someone else set it while we were thinking; use theirs. */
969 				free_pgtable_page(tmp_page);
970 			else
971 				domain_flush_cache(domain, pte, sizeof(*pte));
972 		}
973 		if (level == 1)
974 			break;
975 
976 		parent = phys_to_virt(dma_pte_addr(pte));
977 		level--;
978 	}
979 
980 	if (!*target_level)
981 		*target_level = level;
982 
983 	return pte;
984 }
985 
986 /* return address's pte at specific level */
987 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
988 					 unsigned long pfn,
989 					 int level, int *large_page)
990 {
991 	struct dma_pte *parent, *pte;
992 	int total = agaw_to_level(domain->agaw);
993 	int offset;
994 
995 	parent = domain->pgd;
996 	while (level <= total) {
997 		offset = pfn_level_offset(pfn, total);
998 		pte = &parent[offset];
999 		if (level == total)
1000 			return pte;
1001 
1002 		if (!dma_pte_present(pte)) {
1003 			*large_page = total;
1004 			break;
1005 		}
1006 
1007 		if (dma_pte_superpage(pte)) {
1008 			*large_page = total;
1009 			return pte;
1010 		}
1011 
1012 		parent = phys_to_virt(dma_pte_addr(pte));
1013 		total--;
1014 	}
1015 	return NULL;
1016 }
1017 
1018 /* clear last level pte, a tlb flush should be followed */
1019 static void dma_pte_clear_range(struct dmar_domain *domain,
1020 				unsigned long start_pfn,
1021 				unsigned long last_pfn)
1022 {
1023 	unsigned int large_page;
1024 	struct dma_pte *first_pte, *pte;
1025 
1026 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1027 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1028 	BUG_ON(start_pfn > last_pfn);
1029 
1030 	/* we don't need lock here; nobody else touches the iova range */
1031 	do {
1032 		large_page = 1;
1033 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1034 		if (!pte) {
1035 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1036 			continue;
1037 		}
1038 		do {
1039 			dma_clear_pte(pte);
1040 			start_pfn += lvl_to_nr_pages(large_page);
1041 			pte++;
1042 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1043 
1044 		domain_flush_cache(domain, first_pte,
1045 				   (void *)pte - (void *)first_pte);
1046 
1047 	} while (start_pfn && start_pfn <= last_pfn);
1048 }
1049 
1050 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1051 			       int retain_level, struct dma_pte *pte,
1052 			       unsigned long pfn, unsigned long start_pfn,
1053 			       unsigned long last_pfn)
1054 {
1055 	pfn = max(start_pfn, pfn);
1056 	pte = &pte[pfn_level_offset(pfn, level)];
1057 
1058 	do {
1059 		unsigned long level_pfn;
1060 		struct dma_pte *level_pte;
1061 
1062 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1063 			goto next;
1064 
1065 		level_pfn = pfn & level_mask(level);
1066 		level_pte = phys_to_virt(dma_pte_addr(pte));
1067 
1068 		if (level > 2) {
1069 			dma_pte_free_level(domain, level - 1, retain_level,
1070 					   level_pte, level_pfn, start_pfn,
1071 					   last_pfn);
1072 		}
1073 
1074 		/*
1075 		 * Free the page table if we're below the level we want to
1076 		 * retain and the range covers the entire table.
1077 		 */
1078 		if (level < retain_level && !(start_pfn > level_pfn ||
1079 		      last_pfn < level_pfn + level_size(level) - 1)) {
1080 			dma_clear_pte(pte);
1081 			domain_flush_cache(domain, pte, sizeof(*pte));
1082 			free_pgtable_page(level_pte);
1083 		}
1084 next:
1085 		pfn += level_size(level);
1086 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1087 }
1088 
1089 /*
1090  * clear last level (leaf) ptes and free page table pages below the
1091  * level we wish to keep intact.
1092  */
1093 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1094 				   unsigned long start_pfn,
1095 				   unsigned long last_pfn,
1096 				   int retain_level)
1097 {
1098 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1099 
1100 	/* We don't need lock here; nobody else touches the iova range */
1101 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1102 			   domain->pgd, 0, start_pfn, last_pfn);
1103 
1104 	/* free pgd */
1105 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1106 		free_pgtable_page(domain->pgd);
1107 		domain->pgd = NULL;
1108 	}
1109 }
1110 
1111 /* When a page at a given level is being unlinked from its parent, we don't
1112    need to *modify* it at all. All we need to do is make a list of all the
1113    pages which can be freed just as soon as we've flushed the IOTLB and we
1114    know the hardware page-walk will no longer touch them.
1115    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1116    be freed. */
1117 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1118 				    int level, struct dma_pte *pte,
1119 				    struct list_head *freelist)
1120 {
1121 	struct page *pg;
1122 
1123 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1124 	list_add_tail(&pg->lru, freelist);
1125 
1126 	if (level == 1)
1127 		return;
1128 
1129 	pte = page_address(pg);
1130 	do {
1131 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1132 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1133 		pte++;
1134 	} while (!first_pte_in_page(pte));
1135 }
1136 
1137 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1138 				struct dma_pte *pte, unsigned long pfn,
1139 				unsigned long start_pfn, unsigned long last_pfn,
1140 				struct list_head *freelist)
1141 {
1142 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1143 
1144 	pfn = max(start_pfn, pfn);
1145 	pte = &pte[pfn_level_offset(pfn, level)];
1146 
1147 	do {
1148 		unsigned long level_pfn = pfn & level_mask(level);
1149 
1150 		if (!dma_pte_present(pte))
1151 			goto next;
1152 
1153 		/* If range covers entire pagetable, free it */
1154 		if (start_pfn <= level_pfn &&
1155 		    last_pfn >= level_pfn + level_size(level) - 1) {
1156 			/* These suborbinate page tables are going away entirely. Don't
1157 			   bother to clear them; we're just going to *free* them. */
1158 			if (level > 1 && !dma_pte_superpage(pte))
1159 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1160 
1161 			dma_clear_pte(pte);
1162 			if (!first_pte)
1163 				first_pte = pte;
1164 			last_pte = pte;
1165 		} else if (level > 1) {
1166 			/* Recurse down into a level that isn't *entirely* obsolete */
1167 			dma_pte_clear_level(domain, level - 1,
1168 					    phys_to_virt(dma_pte_addr(pte)),
1169 					    level_pfn, start_pfn, last_pfn,
1170 					    freelist);
1171 		}
1172 next:
1173 		pfn = level_pfn + level_size(level);
1174 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1175 
1176 	if (first_pte)
1177 		domain_flush_cache(domain, first_pte,
1178 				   (void *)++last_pte - (void *)first_pte);
1179 }
1180 
1181 /* We can't just free the pages because the IOMMU may still be walking
1182    the page tables, and may have cached the intermediate levels. The
1183    pages can only be freed after the IOTLB flush has been done. */
1184 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1185 			 unsigned long last_pfn, struct list_head *freelist)
1186 {
1187 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1188 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1189 	BUG_ON(start_pfn > last_pfn);
1190 
1191 	/* we don't need lock here; nobody else touches the iova range */
1192 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1193 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1194 
1195 	/* free pgd */
1196 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1197 		struct page *pgd_page = virt_to_page(domain->pgd);
1198 		list_add_tail(&pgd_page->lru, freelist);
1199 		domain->pgd = NULL;
1200 	}
1201 }
1202 
1203 /* iommu handling */
1204 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1205 {
1206 	struct root_entry *root;
1207 
1208 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1209 	if (!root) {
1210 		pr_err("Allocating root entry for %s failed\n",
1211 			iommu->name);
1212 		return -ENOMEM;
1213 	}
1214 
1215 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1216 	iommu->root_entry = root;
1217 
1218 	return 0;
1219 }
1220 
1221 static void iommu_set_root_entry(struct intel_iommu *iommu)
1222 {
1223 	u64 addr;
1224 	u32 sts;
1225 	unsigned long flag;
1226 
1227 	addr = virt_to_phys(iommu->root_entry);
1228 	if (sm_supported(iommu))
1229 		addr |= DMA_RTADDR_SMT;
1230 
1231 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1233 
1234 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1235 
1236 	/* Make sure hardware complete it */
1237 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238 		      readl, (sts & DMA_GSTS_RTPS), sts);
1239 
1240 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241 
1242 	/*
1243 	 * Hardware invalidates all DMA remapping hardware translation
1244 	 * caches as part of SRTP flow.
1245 	 */
1246 	if (cap_esrtps(iommu->cap))
1247 		return;
1248 
1249 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1250 	if (sm_supported(iommu))
1251 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1252 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1253 }
1254 
1255 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1256 {
1257 	u32 val;
1258 	unsigned long flag;
1259 
1260 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1261 		return;
1262 
1263 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1264 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1265 
1266 	/* Make sure hardware complete it */
1267 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1268 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1269 
1270 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1271 }
1272 
1273 /* return value determine if we need a write buffer flush */
1274 static void __iommu_flush_context(struct intel_iommu *iommu,
1275 				  u16 did, u16 source_id, u8 function_mask,
1276 				  u64 type)
1277 {
1278 	u64 val = 0;
1279 	unsigned long flag;
1280 
1281 	switch (type) {
1282 	case DMA_CCMD_GLOBAL_INVL:
1283 		val = DMA_CCMD_GLOBAL_INVL;
1284 		break;
1285 	case DMA_CCMD_DOMAIN_INVL:
1286 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1287 		break;
1288 	case DMA_CCMD_DEVICE_INVL:
1289 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1290 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1291 		break;
1292 	default:
1293 		BUG();
1294 	}
1295 	val |= DMA_CCMD_ICC;
1296 
1297 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1298 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1299 
1300 	/* Make sure hardware complete it */
1301 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1302 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1303 
1304 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1305 }
1306 
1307 /* return value determine if we need a write buffer flush */
1308 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1309 				u64 addr, unsigned int size_order, u64 type)
1310 {
1311 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1312 	u64 val = 0, val_iva = 0;
1313 	unsigned long flag;
1314 
1315 	switch (type) {
1316 	case DMA_TLB_GLOBAL_FLUSH:
1317 		/* global flush doesn't need set IVA_REG */
1318 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1319 		break;
1320 	case DMA_TLB_DSI_FLUSH:
1321 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1322 		break;
1323 	case DMA_TLB_PSI_FLUSH:
1324 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1325 		/* IH bit is passed in as part of address */
1326 		val_iva = size_order | addr;
1327 		break;
1328 	default:
1329 		BUG();
1330 	}
1331 	/* Note: set drain read/write */
1332 #if 0
1333 	/*
1334 	 * This is probably to be super secure.. Looks like we can
1335 	 * ignore it without any impact.
1336 	 */
1337 	if (cap_read_drain(iommu->cap))
1338 		val |= DMA_TLB_READ_DRAIN;
1339 #endif
1340 	if (cap_write_drain(iommu->cap))
1341 		val |= DMA_TLB_WRITE_DRAIN;
1342 
1343 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1344 	/* Note: Only uses first TLB reg currently */
1345 	if (val_iva)
1346 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1347 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1348 
1349 	/* Make sure hardware complete it */
1350 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1351 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1352 
1353 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1354 
1355 	/* check IOTLB invalidation granularity */
1356 	if (DMA_TLB_IAIG(val) == 0)
1357 		pr_err("Flush IOTLB failed\n");
1358 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1359 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1360 			(unsigned long long)DMA_TLB_IIRG(type),
1361 			(unsigned long long)DMA_TLB_IAIG(val));
1362 }
1363 
1364 static struct device_domain_info *
1365 domain_lookup_dev_info(struct dmar_domain *domain,
1366 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1367 {
1368 	struct device_domain_info *info;
1369 	unsigned long flags;
1370 
1371 	spin_lock_irqsave(&domain->lock, flags);
1372 	list_for_each_entry(info, &domain->devices, link) {
1373 		if (info->iommu == iommu && info->bus == bus &&
1374 		    info->devfn == devfn) {
1375 			spin_unlock_irqrestore(&domain->lock, flags);
1376 			return info;
1377 		}
1378 	}
1379 	spin_unlock_irqrestore(&domain->lock, flags);
1380 
1381 	return NULL;
1382 }
1383 
1384 static void domain_update_iotlb(struct dmar_domain *domain)
1385 {
1386 	struct device_domain_info *info;
1387 	bool has_iotlb_device = false;
1388 	unsigned long flags;
1389 
1390 	spin_lock_irqsave(&domain->lock, flags);
1391 	list_for_each_entry(info, &domain->devices, link) {
1392 		if (info->ats_enabled) {
1393 			has_iotlb_device = true;
1394 			break;
1395 		}
1396 	}
1397 	domain->has_iotlb_device = has_iotlb_device;
1398 	spin_unlock_irqrestore(&domain->lock, flags);
1399 }
1400 
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1402 {
1403 	struct pci_dev *pdev;
1404 
1405 	if (!info || !dev_is_pci(info->dev))
1406 		return;
1407 
1408 	pdev = to_pci_dev(info->dev);
1409 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1410 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1411 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1412 	 * reserved, which should be set to 0.
1413 	 */
1414 	if (!ecap_dit(info->iommu->ecap))
1415 		info->pfsid = 0;
1416 	else {
1417 		struct pci_dev *pf_pdev;
1418 
1419 		/* pdev will be returned if device is not a vf */
1420 		pf_pdev = pci_physfn(pdev);
1421 		info->pfsid = pci_dev_id(pf_pdev);
1422 	}
1423 
1424 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1425 	   the device if you enable PASID support after ATS support is
1426 	   undefined. So always enable PASID support on devices which
1427 	   have it, even if we can't yet know if we're ever going to
1428 	   use it. */
1429 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430 		info->pasid_enabled = 1;
1431 
1432 	if (info->pri_supported &&
1433 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1434 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435 		info->pri_enabled = 1;
1436 
1437 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439 		info->ats_enabled = 1;
1440 		domain_update_iotlb(info->domain);
1441 		info->ats_qdep = pci_ats_queue_depth(pdev);
1442 	}
1443 }
1444 
1445 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1446 {
1447 	struct pci_dev *pdev;
1448 
1449 	if (!dev_is_pci(info->dev))
1450 		return;
1451 
1452 	pdev = to_pci_dev(info->dev);
1453 
1454 	if (info->ats_enabled) {
1455 		pci_disable_ats(pdev);
1456 		info->ats_enabled = 0;
1457 		domain_update_iotlb(info->domain);
1458 	}
1459 
1460 	if (info->pri_enabled) {
1461 		pci_disable_pri(pdev);
1462 		info->pri_enabled = 0;
1463 	}
1464 
1465 	if (info->pasid_enabled) {
1466 		pci_disable_pasid(pdev);
1467 		info->pasid_enabled = 0;
1468 	}
1469 }
1470 
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472 				    u64 addr, unsigned int mask)
1473 {
1474 	u16 sid, qdep;
1475 
1476 	if (!info || !info->ats_enabled)
1477 		return;
1478 
1479 	sid = info->bus << 8 | info->devfn;
1480 	qdep = info->ats_qdep;
1481 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1482 			   qdep, addr, mask);
1483 }
1484 
1485 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 				  u64 addr, unsigned mask)
1487 {
1488 	struct device_domain_info *info;
1489 	unsigned long flags;
1490 
1491 	if (!domain->has_iotlb_device)
1492 		return;
1493 
1494 	spin_lock_irqsave(&domain->lock, flags);
1495 	list_for_each_entry(info, &domain->devices, link)
1496 		__iommu_flush_dev_iotlb(info, addr, mask);
1497 	spin_unlock_irqrestore(&domain->lock, flags);
1498 }
1499 
1500 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 				  struct dmar_domain *domain,
1502 				  unsigned long pfn, unsigned int pages,
1503 				  int ih, int map)
1504 {
1505 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 	unsigned int mask = ilog2(aligned_pages);
1507 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 	u16 did = domain_id_iommu(domain, iommu);
1509 
1510 	BUG_ON(pages == 0);
1511 
1512 	if (ih)
1513 		ih = 1 << 6;
1514 
1515 	if (domain_use_first_level(domain)) {
1516 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 	} else {
1518 		unsigned long bitmask = aligned_pages - 1;
1519 
1520 		/*
1521 		 * PSI masks the low order bits of the base address. If the
1522 		 * address isn't aligned to the mask, then compute a mask value
1523 		 * needed to ensure the target range is flushed.
1524 		 */
1525 		if (unlikely(bitmask & pfn)) {
1526 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527 
1528 			/*
1529 			 * Since end_pfn <= pfn + bitmask, the only way bits
1530 			 * higher than bitmask can differ in pfn and end_pfn is
1531 			 * by carrying. This means after masking out bitmask,
1532 			 * high bits starting with the first set bit in
1533 			 * shared_bits are all equal in both pfn and end_pfn.
1534 			 */
1535 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 		}
1538 
1539 		/*
1540 		 * Fallback to domain selective flush if no PSI support or
1541 		 * the size is too big.
1542 		 */
1543 		if (!cap_pgsel_inv(iommu->cap) ||
1544 		    mask > cap_max_amask_val(iommu->cap))
1545 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 							DMA_TLB_DSI_FLUSH);
1547 		else
1548 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 							DMA_TLB_PSI_FLUSH);
1550 	}
1551 
1552 	/*
1553 	 * In caching mode, changes of pages from non-present to present require
1554 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1555 	 */
1556 	if (!cap_caching_mode(iommu->cap) || !map)
1557 		iommu_flush_dev_iotlb(domain, addr, mask);
1558 }
1559 
1560 /* Notification for newly created mappings */
1561 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 					struct dmar_domain *domain,
1563 					unsigned long pfn, unsigned int pages)
1564 {
1565 	/*
1566 	 * It's a non-present to present mapping. Only flush if caching mode
1567 	 * and second level.
1568 	 */
1569 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1570 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 	else
1572 		iommu_flush_write_buffer(iommu);
1573 }
1574 
1575 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576 {
1577 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 	struct iommu_domain_info *info;
1579 	unsigned long idx;
1580 
1581 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 		struct intel_iommu *iommu = info->iommu;
1583 		u16 did = domain_id_iommu(dmar_domain, iommu);
1584 
1585 		if (domain_use_first_level(dmar_domain))
1586 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 		else
1588 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 						 DMA_TLB_DSI_FLUSH);
1590 
1591 		if (!cap_caching_mode(iommu->cap))
1592 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 	}
1594 }
1595 
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598 	u32 pmen;
1599 	unsigned long flags;
1600 
1601 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 		return;
1603 
1604 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 	pmen &= ~DMA_PMEN_EPM;
1607 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608 
1609 	/* wait for the protected region status bit to clear */
1610 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1612 
1613 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615 
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618 	u32 sts;
1619 	unsigned long flags;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 	iommu->gcmd |= DMA_GCMD_TE;
1623 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624 
1625 	/* Make sure hardware complete it */
1626 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 		      readl, (sts & DMA_GSTS_TES), sts);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631 
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634 	u32 sts;
1635 	unsigned long flag;
1636 
1637 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 		return;
1640 
1641 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 	iommu->gcmd &= ~DMA_GCMD_TE;
1643 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644 
1645 	/* Make sure hardware complete it */
1646 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1648 
1649 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 }
1651 
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654 	u32 ndomains;
1655 
1656 	ndomains = cap_ndoms(iommu->cap);
1657 	pr_debug("%s: Number of Domains supported <%d>\n",
1658 		 iommu->name, ndomains);
1659 
1660 	spin_lock_init(&iommu->lock);
1661 
1662 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 	if (!iommu->domain_ids)
1664 		return -ENOMEM;
1665 
1666 	/*
1667 	 * If Caching mode is set, then invalid translations are tagged
1668 	 * with domain-id 0, hence we need to pre-allocate it. We also
1669 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 	 * make sure it is not used for a real domain.
1671 	 */
1672 	set_bit(0, iommu->domain_ids);
1673 
1674 	/*
1675 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676 	 * entry for first-level or pass-through translation modes should
1677 	 * be programmed with a domain id different from those used for
1678 	 * second-level or nested translation. We reserve a domain id for
1679 	 * this purpose.
1680 	 */
1681 	if (sm_supported(iommu))
1682 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683 
1684 	return 0;
1685 }
1686 
1687 static void disable_dmar_iommu(struct intel_iommu *iommu)
1688 {
1689 	if (!iommu->domain_ids)
1690 		return;
1691 
1692 	/*
1693 	 * All iommu domains must have been detached from the devices,
1694 	 * hence there should be no domain IDs in use.
1695 	 */
1696 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 		    > NUM_RESERVED_DID))
1698 		return;
1699 
1700 	if (iommu->gcmd & DMA_GCMD_TE)
1701 		iommu_disable_translation(iommu);
1702 }
1703 
1704 static void free_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706 	if (iommu->domain_ids) {
1707 		bitmap_free(iommu->domain_ids);
1708 		iommu->domain_ids = NULL;
1709 	}
1710 
1711 	if (iommu->copied_tables) {
1712 		bitmap_free(iommu->copied_tables);
1713 		iommu->copied_tables = NULL;
1714 	}
1715 
1716 	/* free context mapping */
1717 	free_context_table(iommu);
1718 
1719 #ifdef CONFIG_INTEL_IOMMU_SVM
1720 	if (pasid_supported(iommu)) {
1721 		if (ecap_prs(iommu->ecap))
1722 			intel_svm_finish_prq(iommu);
1723 	}
1724 	if (vccap_pasid(iommu->vccap))
1725 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1726 
1727 #endif
1728 }
1729 
1730 /*
1731  * Check and return whether first level is used by default for
1732  * DMA translation.
1733  */
1734 static bool first_level_by_default(unsigned int type)
1735 {
1736 	/* Only SL is available in legacy mode */
1737 	if (!scalable_mode_support())
1738 		return false;
1739 
1740 	/* Only level (either FL or SL) is available, just use it */
1741 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 		return intel_cap_flts_sanity();
1743 
1744 	/* Both levels are available, decide it based on domain type */
1745 	return type != IOMMU_DOMAIN_UNMANAGED;
1746 }
1747 
1748 static struct dmar_domain *alloc_domain(unsigned int type)
1749 {
1750 	struct dmar_domain *domain;
1751 
1752 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 	if (!domain)
1754 		return NULL;
1755 
1756 	domain->nid = NUMA_NO_NODE;
1757 	if (first_level_by_default(type))
1758 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1759 	domain->has_iotlb_device = false;
1760 	INIT_LIST_HEAD(&domain->devices);
1761 	spin_lock_init(&domain->lock);
1762 	xa_init(&domain->iommu_array);
1763 
1764 	return domain;
1765 }
1766 
1767 static int domain_attach_iommu(struct dmar_domain *domain,
1768 			       struct intel_iommu *iommu)
1769 {
1770 	struct iommu_domain_info *info, *curr;
1771 	unsigned long ndomains;
1772 	int num, ret = -ENOSPC;
1773 
1774 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 	if (!info)
1776 		return -ENOMEM;
1777 
1778 	spin_lock(&iommu->lock);
1779 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 	if (curr) {
1781 		curr->refcnt++;
1782 		spin_unlock(&iommu->lock);
1783 		kfree(info);
1784 		return 0;
1785 	}
1786 
1787 	ndomains = cap_ndoms(iommu->cap);
1788 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 	if (num >= ndomains) {
1790 		pr_err("%s: No free domain ids\n", iommu->name);
1791 		goto err_unlock;
1792 	}
1793 
1794 	set_bit(num, iommu->domain_ids);
1795 	info->refcnt	= 1;
1796 	info->did	= num;
1797 	info->iommu	= iommu;
1798 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 			  NULL, info, GFP_ATOMIC);
1800 	if (curr) {
1801 		ret = xa_err(curr) ? : -EBUSY;
1802 		goto err_clear;
1803 	}
1804 	domain_update_iommu_cap(domain);
1805 
1806 	spin_unlock(&iommu->lock);
1807 	return 0;
1808 
1809 err_clear:
1810 	clear_bit(info->did, iommu->domain_ids);
1811 err_unlock:
1812 	spin_unlock(&iommu->lock);
1813 	kfree(info);
1814 	return ret;
1815 }
1816 
1817 static void domain_detach_iommu(struct dmar_domain *domain,
1818 				struct intel_iommu *iommu)
1819 {
1820 	struct iommu_domain_info *info;
1821 
1822 	spin_lock(&iommu->lock);
1823 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 	if (--info->refcnt == 0) {
1825 		clear_bit(info->did, iommu->domain_ids);
1826 		xa_erase(&domain->iommu_array, iommu->seq_id);
1827 		domain->nid = NUMA_NO_NODE;
1828 		domain_update_iommu_cap(domain);
1829 		kfree(info);
1830 	}
1831 	spin_unlock(&iommu->lock);
1832 }
1833 
1834 static inline int guestwidth_to_adjustwidth(int gaw)
1835 {
1836 	int agaw;
1837 	int r = (gaw - 12) % 9;
1838 
1839 	if (r == 0)
1840 		agaw = gaw;
1841 	else
1842 		agaw = gaw + 9 - r;
1843 	if (agaw > 64)
1844 		agaw = 64;
1845 	return agaw;
1846 }
1847 
1848 static void domain_exit(struct dmar_domain *domain)
1849 {
1850 	if (domain->pgd) {
1851 		LIST_HEAD(freelist);
1852 
1853 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 		put_pages_list(&freelist);
1855 	}
1856 
1857 	if (WARN_ON(!list_empty(&domain->devices)))
1858 		return;
1859 
1860 	kfree(domain);
1861 }
1862 
1863 /*
1864  * Get the PASID directory size for scalable mode context entry.
1865  * Value of X in the PDTS field of a scalable mode context entry
1866  * indicates PASID directory with 2^(X + 7) entries.
1867  */
1868 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869 {
1870 	unsigned long pds, max_pde;
1871 
1872 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 	if (pds < 7)
1875 		return 0;
1876 
1877 	return pds - 7;
1878 }
1879 
1880 /*
1881  * Set the RID_PASID field of a scalable mode context entry. The
1882  * IOMMU hardware will use the PASID value set in this field for
1883  * DMA translations of DMA requests without PASID.
1884  */
1885 static inline void
1886 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887 {
1888 	context->hi |= pasid & ((1 << 20) - 1);
1889 }
1890 
1891 /*
1892  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893  * entry.
1894  */
1895 static inline void context_set_sm_dte(struct context_entry *context)
1896 {
1897 	context->lo |= (1 << 2);
1898 }
1899 
1900 /*
1901  * Set the PRE(Page Request Enable) field of a scalable mode context
1902  * entry.
1903  */
1904 static inline void context_set_sm_pre(struct context_entry *context)
1905 {
1906 	context->lo |= (1 << 4);
1907 }
1908 
1909 /* Convert value to context PASID directory size field coding. */
1910 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1911 
1912 static int domain_context_mapping_one(struct dmar_domain *domain,
1913 				      struct intel_iommu *iommu,
1914 				      struct pasid_table *table,
1915 				      u8 bus, u8 devfn)
1916 {
1917 	struct device_domain_info *info =
1918 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 	u16 did = domain_id_iommu(domain, iommu);
1920 	int translation = CONTEXT_TT_MULTI_LEVEL;
1921 	struct context_entry *context;
1922 	int ret;
1923 
1924 	WARN_ON(did == 0);
1925 
1926 	if (hw_pass_through && domain_type_is_si(domain))
1927 		translation = CONTEXT_TT_PASS_THROUGH;
1928 
1929 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931 
1932 	BUG_ON(!domain->pgd);
1933 
1934 	spin_lock(&iommu->lock);
1935 	ret = -ENOMEM;
1936 	context = iommu_context_addr(iommu, bus, devfn, 1);
1937 	if (!context)
1938 		goto out_unlock;
1939 
1940 	ret = 0;
1941 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 		goto out_unlock;
1943 
1944 	/*
1945 	 * For kdump cases, old valid entries may be cached due to the
1946 	 * in-flight DMA and copied pgtable, but there is no unmapping
1947 	 * behaviour for them, thus we need an explicit cache flush for
1948 	 * the newly-mapped device. For kdump, at this point, the device
1949 	 * is supposed to finish reset at its driver probe stage, so no
1950 	 * in-flight DMA will exist, and we don't need to worry anymore
1951 	 * hereafter.
1952 	 */
1953 	if (context_copied(iommu, bus, devfn)) {
1954 		u16 did_old = context_domain_id(context);
1955 
1956 		if (did_old < cap_ndoms(iommu->cap)) {
1957 			iommu->flush.flush_context(iommu, did_old,
1958 						   (((u16)bus) << 8) | devfn,
1959 						   DMA_CCMD_MASK_NOBIT,
1960 						   DMA_CCMD_DEVICE_INVL);
1961 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 						 DMA_TLB_DSI_FLUSH);
1963 		}
1964 
1965 		clear_context_copied(iommu, bus, devfn);
1966 	}
1967 
1968 	context_clear_entry(context);
1969 
1970 	if (sm_supported(iommu)) {
1971 		unsigned long pds;
1972 
1973 		WARN_ON(!table);
1974 
1975 		/* Setup the PASID DIR pointer: */
1976 		pds = context_get_sm_pds(table);
1977 		context->lo = (u64)virt_to_phys(table->table) |
1978 				context_pdts(pds);
1979 
1980 		/* Setup the RID_PASID field: */
1981 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982 
1983 		/*
1984 		 * Setup the Device-TLB enable bit and Page request
1985 		 * Enable bit:
1986 		 */
1987 		if (info && info->ats_supported)
1988 			context_set_sm_dte(context);
1989 		if (info && info->pri_supported)
1990 			context_set_sm_pre(context);
1991 		if (info && info->pasid_supported)
1992 			context_set_pasid(context);
1993 	} else {
1994 		struct dma_pte *pgd = domain->pgd;
1995 		int agaw;
1996 
1997 		context_set_domain_id(context, did);
1998 
1999 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 			/*
2001 			 * Skip top levels of page tables for iommu which has
2002 			 * less agaw than default. Unnecessary for PT mode.
2003 			 */
2004 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 				ret = -ENOMEM;
2006 				pgd = phys_to_virt(dma_pte_addr(pgd));
2007 				if (!dma_pte_present(pgd))
2008 					goto out_unlock;
2009 			}
2010 
2011 			if (info && info->ats_supported)
2012 				translation = CONTEXT_TT_DEV_IOTLB;
2013 			else
2014 				translation = CONTEXT_TT_MULTI_LEVEL;
2015 
2016 			context_set_address_root(context, virt_to_phys(pgd));
2017 			context_set_address_width(context, agaw);
2018 		} else {
2019 			/*
2020 			 * In pass through mode, AW must be programmed to
2021 			 * indicate the largest AGAW value supported by
2022 			 * hardware. And ASR is ignored by hardware.
2023 			 */
2024 			context_set_address_width(context, iommu->msagaw);
2025 		}
2026 
2027 		context_set_translation_type(context, translation);
2028 	}
2029 
2030 	context_set_fault_enable(context);
2031 	context_set_present(context);
2032 	if (!ecap_coherent(iommu->ecap))
2033 		clflush_cache_range(context, sizeof(*context));
2034 
2035 	/*
2036 	 * It's a non-present to present mapping. If hardware doesn't cache
2037 	 * non-present entry we only need to flush the write-buffer. If the
2038 	 * _does_ cache non-present entries, then it does so in the special
2039 	 * domain #0, which we have to flush:
2040 	 */
2041 	if (cap_caching_mode(iommu->cap)) {
2042 		iommu->flush.flush_context(iommu, 0,
2043 					   (((u16)bus) << 8) | devfn,
2044 					   DMA_CCMD_MASK_NOBIT,
2045 					   DMA_CCMD_DEVICE_INVL);
2046 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 	} else {
2048 		iommu_flush_write_buffer(iommu);
2049 	}
2050 	iommu_enable_pci_caps(info);
2051 
2052 	ret = 0;
2053 
2054 out_unlock:
2055 	spin_unlock(&iommu->lock);
2056 
2057 	return ret;
2058 }
2059 
2060 struct domain_context_mapping_data {
2061 	struct dmar_domain *domain;
2062 	struct intel_iommu *iommu;
2063 	struct pasid_table *table;
2064 };
2065 
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067 				     u16 alias, void *opaque)
2068 {
2069 	struct domain_context_mapping_data *data = opaque;
2070 
2071 	return domain_context_mapping_one(data->domain, data->iommu,
2072 					  data->table, PCI_BUS_NUM(alias),
2073 					  alias & 0xff);
2074 }
2075 
2076 static int
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2078 {
2079 	struct domain_context_mapping_data data;
2080 	struct pasid_table *table;
2081 	struct intel_iommu *iommu;
2082 	u8 bus, devfn;
2083 
2084 	iommu = device_to_iommu(dev, &bus, &devfn);
2085 	if (!iommu)
2086 		return -ENODEV;
2087 
2088 	table = intel_pasid_get_table(dev);
2089 
2090 	if (!dev_is_pci(dev))
2091 		return domain_context_mapping_one(domain, iommu, table,
2092 						  bus, devfn);
2093 
2094 	data.domain = domain;
2095 	data.iommu = iommu;
2096 	data.table = table;
2097 
2098 	return pci_for_each_dma_alias(to_pci_dev(dev),
2099 				      &domain_context_mapping_cb, &data);
2100 }
2101 
2102 static int domain_context_mapped_cb(struct pci_dev *pdev,
2103 				    u16 alias, void *opaque)
2104 {
2105 	struct intel_iommu *iommu = opaque;
2106 
2107 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2108 }
2109 
2110 static int domain_context_mapped(struct device *dev)
2111 {
2112 	struct intel_iommu *iommu;
2113 	u8 bus, devfn;
2114 
2115 	iommu = device_to_iommu(dev, &bus, &devfn);
2116 	if (!iommu)
2117 		return -ENODEV;
2118 
2119 	if (!dev_is_pci(dev))
2120 		return device_context_mapped(iommu, bus, devfn);
2121 
2122 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2123 				       domain_context_mapped_cb, iommu);
2124 }
2125 
2126 /* Returns a number of VTD pages, but aligned to MM page size */
2127 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2128 					    size_t size)
2129 {
2130 	host_addr &= ~PAGE_MASK;
2131 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2132 }
2133 
2134 /* Return largest possible superpage level for a given mapping */
2135 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2136 					  unsigned long iov_pfn,
2137 					  unsigned long phy_pfn,
2138 					  unsigned long pages)
2139 {
2140 	int support, level = 1;
2141 	unsigned long pfnmerge;
2142 
2143 	support = domain->iommu_superpage;
2144 
2145 	/* To use a large page, the virtual *and* physical addresses
2146 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2147 	   of them will mean we have to use smaller pages. So just
2148 	   merge them and check both at once. */
2149 	pfnmerge = iov_pfn | phy_pfn;
2150 
2151 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2152 		pages >>= VTD_STRIDE_SHIFT;
2153 		if (!pages)
2154 			break;
2155 		pfnmerge >>= VTD_STRIDE_SHIFT;
2156 		level++;
2157 		support--;
2158 	}
2159 	return level;
2160 }
2161 
2162 /*
2163  * Ensure that old small page tables are removed to make room for superpage(s).
2164  * We're going to add new large pages, so make sure we don't remove their parent
2165  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2166  */
2167 static void switch_to_super_page(struct dmar_domain *domain,
2168 				 unsigned long start_pfn,
2169 				 unsigned long end_pfn, int level)
2170 {
2171 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2172 	struct iommu_domain_info *info;
2173 	struct dma_pte *pte = NULL;
2174 	unsigned long i;
2175 
2176 	while (start_pfn <= end_pfn) {
2177 		if (!pte)
2178 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2179 
2180 		if (dma_pte_present(pte)) {
2181 			dma_pte_free_pagetable(domain, start_pfn,
2182 					       start_pfn + lvl_pages - 1,
2183 					       level + 1);
2184 
2185 			xa_for_each(&domain->iommu_array, i, info)
2186 				iommu_flush_iotlb_psi(info->iommu, domain,
2187 						      start_pfn, lvl_pages,
2188 						      0, 0);
2189 		}
2190 
2191 		pte++;
2192 		start_pfn += lvl_pages;
2193 		if (first_pte_in_page(pte))
2194 			pte = NULL;
2195 	}
2196 }
2197 
2198 static int
2199 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2200 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2201 {
2202 	struct dma_pte *first_pte = NULL, *pte = NULL;
2203 	unsigned int largepage_lvl = 0;
2204 	unsigned long lvl_pages = 0;
2205 	phys_addr_t pteval;
2206 	u64 attr;
2207 
2208 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2209 
2210 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2211 		return -EINVAL;
2212 
2213 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2214 	attr |= DMA_FL_PTE_PRESENT;
2215 	if (domain_use_first_level(domain)) {
2216 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2217 		if (prot & DMA_PTE_WRITE)
2218 			attr |= DMA_FL_PTE_DIRTY;
2219 	}
2220 
2221 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2222 
2223 	while (nr_pages > 0) {
2224 		uint64_t tmp;
2225 
2226 		if (!pte) {
2227 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2228 					phys_pfn, nr_pages);
2229 
2230 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2231 			if (!pte)
2232 				return -ENOMEM;
2233 			first_pte = pte;
2234 
2235 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2236 
2237 			/* It is large page*/
2238 			if (largepage_lvl > 1) {
2239 				unsigned long end_pfn;
2240 				unsigned long pages_to_remove;
2241 
2242 				pteval |= DMA_PTE_LARGE_PAGE;
2243 				pages_to_remove = min_t(unsigned long, nr_pages,
2244 							nr_pte_to_next_page(pte) * lvl_pages);
2245 				end_pfn = iov_pfn + pages_to_remove - 1;
2246 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2247 			} else {
2248 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2249 			}
2250 
2251 		}
2252 		/* We don't need lock here, nobody else
2253 		 * touches the iova range
2254 		 */
2255 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2256 		if (tmp) {
2257 			static int dumps = 5;
2258 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2259 				iov_pfn, tmp, (unsigned long long)pteval);
2260 			if (dumps) {
2261 				dumps--;
2262 				debug_dma_dump_mappings(NULL);
2263 			}
2264 			WARN_ON(1);
2265 		}
2266 
2267 		nr_pages -= lvl_pages;
2268 		iov_pfn += lvl_pages;
2269 		phys_pfn += lvl_pages;
2270 		pteval += lvl_pages * VTD_PAGE_SIZE;
2271 
2272 		/* If the next PTE would be the first in a new page, then we
2273 		 * need to flush the cache on the entries we've just written.
2274 		 * And then we'll need to recalculate 'pte', so clear it and
2275 		 * let it get set again in the if (!pte) block above.
2276 		 *
2277 		 * If we're done (!nr_pages) we need to flush the cache too.
2278 		 *
2279 		 * Also if we've been setting superpages, we may need to
2280 		 * recalculate 'pte' and switch back to smaller pages for the
2281 		 * end of the mapping, if the trailing size is not enough to
2282 		 * use another superpage (i.e. nr_pages < lvl_pages).
2283 		 */
2284 		pte++;
2285 		if (!nr_pages || first_pte_in_page(pte) ||
2286 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2287 			domain_flush_cache(domain, first_pte,
2288 					   (void *)pte - (void *)first_pte);
2289 			pte = NULL;
2290 		}
2291 	}
2292 
2293 	return 0;
2294 }
2295 
2296 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2297 {
2298 	struct intel_iommu *iommu = info->iommu;
2299 	struct context_entry *context;
2300 	u16 did_old;
2301 
2302 	if (!iommu)
2303 		return;
2304 
2305 	spin_lock(&iommu->lock);
2306 	context = iommu_context_addr(iommu, bus, devfn, 0);
2307 	if (!context) {
2308 		spin_unlock(&iommu->lock);
2309 		return;
2310 	}
2311 
2312 	if (sm_supported(iommu)) {
2313 		if (hw_pass_through && domain_type_is_si(info->domain))
2314 			did_old = FLPT_DEFAULT_DID;
2315 		else
2316 			did_old = domain_id_iommu(info->domain, iommu);
2317 	} else {
2318 		did_old = context_domain_id(context);
2319 	}
2320 
2321 	context_clear_entry(context);
2322 	__iommu_flush_cache(iommu, context, sizeof(*context));
2323 	spin_unlock(&iommu->lock);
2324 	iommu->flush.flush_context(iommu,
2325 				   did_old,
2326 				   (((u16)bus) << 8) | devfn,
2327 				   DMA_CCMD_MASK_NOBIT,
2328 				   DMA_CCMD_DEVICE_INVL);
2329 
2330 	if (sm_supported(iommu))
2331 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2332 
2333 	iommu->flush.flush_iotlb(iommu,
2334 				 did_old,
2335 				 0,
2336 				 0,
2337 				 DMA_TLB_DSI_FLUSH);
2338 
2339 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2340 }
2341 
2342 static int domain_setup_first_level(struct intel_iommu *iommu,
2343 				    struct dmar_domain *domain,
2344 				    struct device *dev,
2345 				    u32 pasid)
2346 {
2347 	struct dma_pte *pgd = domain->pgd;
2348 	int agaw, level;
2349 	int flags = 0;
2350 
2351 	/*
2352 	 * Skip top levels of page tables for iommu which has
2353 	 * less agaw than default. Unnecessary for PT mode.
2354 	 */
2355 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2356 		pgd = phys_to_virt(dma_pte_addr(pgd));
2357 		if (!dma_pte_present(pgd))
2358 			return -ENOMEM;
2359 	}
2360 
2361 	level = agaw_to_level(agaw);
2362 	if (level != 4 && level != 5)
2363 		return -EINVAL;
2364 
2365 	if (pasid != PASID_RID2PASID)
2366 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2367 	if (level == 5)
2368 		flags |= PASID_FLAG_FL5LP;
2369 
2370 	if (domain->force_snooping)
2371 		flags |= PASID_FLAG_PAGE_SNOOP;
2372 
2373 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2374 					     domain_id_iommu(domain, iommu),
2375 					     flags);
2376 }
2377 
2378 static bool dev_is_real_dma_subdevice(struct device *dev)
2379 {
2380 	return dev && dev_is_pci(dev) &&
2381 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2382 }
2383 
2384 static int iommu_domain_identity_map(struct dmar_domain *domain,
2385 				     unsigned long first_vpfn,
2386 				     unsigned long last_vpfn)
2387 {
2388 	/*
2389 	 * RMRR range might have overlap with physical memory range,
2390 	 * clear it first
2391 	 */
2392 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2393 
2394 	return __domain_mapping(domain, first_vpfn,
2395 				first_vpfn, last_vpfn - first_vpfn + 1,
2396 				DMA_PTE_READ|DMA_PTE_WRITE);
2397 }
2398 
2399 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2400 
2401 static int __init si_domain_init(int hw)
2402 {
2403 	struct dmar_rmrr_unit *rmrr;
2404 	struct device *dev;
2405 	int i, nid, ret;
2406 
2407 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2408 	if (!si_domain)
2409 		return -EFAULT;
2410 
2411 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2412 		domain_exit(si_domain);
2413 		return -EFAULT;
2414 	}
2415 
2416 	if (hw)
2417 		return 0;
2418 
2419 	for_each_online_node(nid) {
2420 		unsigned long start_pfn, end_pfn;
2421 		int i;
2422 
2423 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2424 			ret = iommu_domain_identity_map(si_domain,
2425 					mm_to_dma_pfn(start_pfn),
2426 					mm_to_dma_pfn(end_pfn));
2427 			if (ret)
2428 				return ret;
2429 		}
2430 	}
2431 
2432 	/*
2433 	 * Identity map the RMRRs so that devices with RMRRs could also use
2434 	 * the si_domain.
2435 	 */
2436 	for_each_rmrr_units(rmrr) {
2437 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2438 					  i, dev) {
2439 			unsigned long long start = rmrr->base_address;
2440 			unsigned long long end = rmrr->end_address;
2441 
2442 			if (WARN_ON(end < start ||
2443 				    end >> agaw_to_width(si_domain->agaw)))
2444 				continue;
2445 
2446 			ret = iommu_domain_identity_map(si_domain,
2447 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2448 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2449 			if (ret)
2450 				return ret;
2451 		}
2452 	}
2453 
2454 	return 0;
2455 }
2456 
2457 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2458 {
2459 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2460 	struct intel_iommu *iommu;
2461 	unsigned long flags;
2462 	u8 bus, devfn;
2463 	int ret;
2464 
2465 	iommu = device_to_iommu(dev, &bus, &devfn);
2466 	if (!iommu)
2467 		return -ENODEV;
2468 
2469 	ret = domain_attach_iommu(domain, iommu);
2470 	if (ret)
2471 		return ret;
2472 	info->domain = domain;
2473 	spin_lock_irqsave(&domain->lock, flags);
2474 	list_add(&info->link, &domain->devices);
2475 	spin_unlock_irqrestore(&domain->lock, flags);
2476 
2477 	/* PASID table is mandatory for a PCI device in scalable mode. */
2478 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2479 		ret = intel_pasid_alloc_table(dev);
2480 		if (ret) {
2481 			dev_err(dev, "PASID table allocation failed\n");
2482 			dmar_remove_one_dev_info(dev);
2483 			return ret;
2484 		}
2485 
2486 		/* Setup the PASID entry for requests without PASID: */
2487 		if (hw_pass_through && domain_type_is_si(domain))
2488 			ret = intel_pasid_setup_pass_through(iommu, domain,
2489 					dev, PASID_RID2PASID);
2490 		else if (domain_use_first_level(domain))
2491 			ret = domain_setup_first_level(iommu, domain, dev,
2492 					PASID_RID2PASID);
2493 		else
2494 			ret = intel_pasid_setup_second_level(iommu, domain,
2495 					dev, PASID_RID2PASID);
2496 		if (ret) {
2497 			dev_err(dev, "Setup RID2PASID failed\n");
2498 			dmar_remove_one_dev_info(dev);
2499 			return ret;
2500 		}
2501 	}
2502 
2503 	ret = domain_context_mapping(domain, dev);
2504 	if (ret) {
2505 		dev_err(dev, "Domain context map failed\n");
2506 		dmar_remove_one_dev_info(dev);
2507 		return ret;
2508 	}
2509 
2510 	return 0;
2511 }
2512 
2513 static bool device_has_rmrr(struct device *dev)
2514 {
2515 	struct dmar_rmrr_unit *rmrr;
2516 	struct device *tmp;
2517 	int i;
2518 
2519 	rcu_read_lock();
2520 	for_each_rmrr_units(rmrr) {
2521 		/*
2522 		 * Return TRUE if this RMRR contains the device that
2523 		 * is passed in.
2524 		 */
2525 		for_each_active_dev_scope(rmrr->devices,
2526 					  rmrr->devices_cnt, i, tmp)
2527 			if (tmp == dev ||
2528 			    is_downstream_to_pci_bridge(dev, tmp)) {
2529 				rcu_read_unlock();
2530 				return true;
2531 			}
2532 	}
2533 	rcu_read_unlock();
2534 	return false;
2535 }
2536 
2537 /**
2538  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2539  * is relaxable (ie. is allowed to be not enforced under some conditions)
2540  * @dev: device handle
2541  *
2542  * We assume that PCI USB devices with RMRRs have them largely
2543  * for historical reasons and that the RMRR space is not actively used post
2544  * boot.  This exclusion may change if vendors begin to abuse it.
2545  *
2546  * The same exception is made for graphics devices, with the requirement that
2547  * any use of the RMRR regions will be torn down before assigning the device
2548  * to a guest.
2549  *
2550  * Return: true if the RMRR is relaxable, false otherwise
2551  */
2552 static bool device_rmrr_is_relaxable(struct device *dev)
2553 {
2554 	struct pci_dev *pdev;
2555 
2556 	if (!dev_is_pci(dev))
2557 		return false;
2558 
2559 	pdev = to_pci_dev(dev);
2560 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2561 		return true;
2562 	else
2563 		return false;
2564 }
2565 
2566 /*
2567  * There are a couple cases where we need to restrict the functionality of
2568  * devices associated with RMRRs.  The first is when evaluating a device for
2569  * identity mapping because problems exist when devices are moved in and out
2570  * of domains and their respective RMRR information is lost.  This means that
2571  * a device with associated RMRRs will never be in a "passthrough" domain.
2572  * The second is use of the device through the IOMMU API.  This interface
2573  * expects to have full control of the IOVA space for the device.  We cannot
2574  * satisfy both the requirement that RMRR access is maintained and have an
2575  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2576  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2577  * We therefore prevent devices associated with an RMRR from participating in
2578  * the IOMMU API, which eliminates them from device assignment.
2579  *
2580  * In both cases, devices which have relaxable RMRRs are not concerned by this
2581  * restriction. See device_rmrr_is_relaxable comment.
2582  */
2583 static bool device_is_rmrr_locked(struct device *dev)
2584 {
2585 	if (!device_has_rmrr(dev))
2586 		return false;
2587 
2588 	if (device_rmrr_is_relaxable(dev))
2589 		return false;
2590 
2591 	return true;
2592 }
2593 
2594 /*
2595  * Return the required default domain type for a specific device.
2596  *
2597  * @dev: the device in query
2598  * @startup: true if this is during early boot
2599  *
2600  * Returns:
2601  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2602  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2603  *  - 0: both identity and dynamic domains work for this device
2604  */
2605 static int device_def_domain_type(struct device *dev)
2606 {
2607 	if (dev_is_pci(dev)) {
2608 		struct pci_dev *pdev = to_pci_dev(dev);
2609 
2610 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2611 			return IOMMU_DOMAIN_IDENTITY;
2612 
2613 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2614 			return IOMMU_DOMAIN_IDENTITY;
2615 	}
2616 
2617 	return 0;
2618 }
2619 
2620 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2621 {
2622 	/*
2623 	 * Start from the sane iommu hardware state.
2624 	 * If the queued invalidation is already initialized by us
2625 	 * (for example, while enabling interrupt-remapping) then
2626 	 * we got the things already rolling from a sane state.
2627 	 */
2628 	if (!iommu->qi) {
2629 		/*
2630 		 * Clear any previous faults.
2631 		 */
2632 		dmar_fault(-1, iommu);
2633 		/*
2634 		 * Disable queued invalidation if supported and already enabled
2635 		 * before OS handover.
2636 		 */
2637 		dmar_disable_qi(iommu);
2638 	}
2639 
2640 	if (dmar_enable_qi(iommu)) {
2641 		/*
2642 		 * Queued Invalidate not enabled, use Register Based Invalidate
2643 		 */
2644 		iommu->flush.flush_context = __iommu_flush_context;
2645 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2646 		pr_info("%s: Using Register based invalidation\n",
2647 			iommu->name);
2648 	} else {
2649 		iommu->flush.flush_context = qi_flush_context;
2650 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2651 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2652 	}
2653 }
2654 
2655 static int copy_context_table(struct intel_iommu *iommu,
2656 			      struct root_entry *old_re,
2657 			      struct context_entry **tbl,
2658 			      int bus, bool ext)
2659 {
2660 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2661 	struct context_entry *new_ce = NULL, ce;
2662 	struct context_entry *old_ce = NULL;
2663 	struct root_entry re;
2664 	phys_addr_t old_ce_phys;
2665 
2666 	tbl_idx = ext ? bus * 2 : bus;
2667 	memcpy(&re, old_re, sizeof(re));
2668 
2669 	for (devfn = 0; devfn < 256; devfn++) {
2670 		/* First calculate the correct index */
2671 		idx = (ext ? devfn * 2 : devfn) % 256;
2672 
2673 		if (idx == 0) {
2674 			/* First save what we may have and clean up */
2675 			if (new_ce) {
2676 				tbl[tbl_idx] = new_ce;
2677 				__iommu_flush_cache(iommu, new_ce,
2678 						    VTD_PAGE_SIZE);
2679 				pos = 1;
2680 			}
2681 
2682 			if (old_ce)
2683 				memunmap(old_ce);
2684 
2685 			ret = 0;
2686 			if (devfn < 0x80)
2687 				old_ce_phys = root_entry_lctp(&re);
2688 			else
2689 				old_ce_phys = root_entry_uctp(&re);
2690 
2691 			if (!old_ce_phys) {
2692 				if (ext && devfn == 0) {
2693 					/* No LCTP, try UCTP */
2694 					devfn = 0x7f;
2695 					continue;
2696 				} else {
2697 					goto out;
2698 				}
2699 			}
2700 
2701 			ret = -ENOMEM;
2702 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2703 					MEMREMAP_WB);
2704 			if (!old_ce)
2705 				goto out;
2706 
2707 			new_ce = alloc_pgtable_page(iommu->node);
2708 			if (!new_ce)
2709 				goto out_unmap;
2710 
2711 			ret = 0;
2712 		}
2713 
2714 		/* Now copy the context entry */
2715 		memcpy(&ce, old_ce + idx, sizeof(ce));
2716 
2717 		if (!context_present(&ce))
2718 			continue;
2719 
2720 		did = context_domain_id(&ce);
2721 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2722 			set_bit(did, iommu->domain_ids);
2723 
2724 		set_context_copied(iommu, bus, devfn);
2725 		new_ce[idx] = ce;
2726 	}
2727 
2728 	tbl[tbl_idx + pos] = new_ce;
2729 
2730 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2731 
2732 out_unmap:
2733 	memunmap(old_ce);
2734 
2735 out:
2736 	return ret;
2737 }
2738 
2739 static int copy_translation_tables(struct intel_iommu *iommu)
2740 {
2741 	struct context_entry **ctxt_tbls;
2742 	struct root_entry *old_rt;
2743 	phys_addr_t old_rt_phys;
2744 	int ctxt_table_entries;
2745 	u64 rtaddr_reg;
2746 	int bus, ret;
2747 	bool new_ext, ext;
2748 
2749 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2750 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2751 	new_ext    = !!sm_supported(iommu);
2752 
2753 	/*
2754 	 * The RTT bit can only be changed when translation is disabled,
2755 	 * but disabling translation means to open a window for data
2756 	 * corruption. So bail out and don't copy anything if we would
2757 	 * have to change the bit.
2758 	 */
2759 	if (new_ext != ext)
2760 		return -EINVAL;
2761 
2762 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2763 	if (!iommu->copied_tables)
2764 		return -ENOMEM;
2765 
2766 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2767 	if (!old_rt_phys)
2768 		return -EINVAL;
2769 
2770 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2771 	if (!old_rt)
2772 		return -ENOMEM;
2773 
2774 	/* This is too big for the stack - allocate it from slab */
2775 	ctxt_table_entries = ext ? 512 : 256;
2776 	ret = -ENOMEM;
2777 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2778 	if (!ctxt_tbls)
2779 		goto out_unmap;
2780 
2781 	for (bus = 0; bus < 256; bus++) {
2782 		ret = copy_context_table(iommu, &old_rt[bus],
2783 					 ctxt_tbls, bus, ext);
2784 		if (ret) {
2785 			pr_err("%s: Failed to copy context table for bus %d\n",
2786 				iommu->name, bus);
2787 			continue;
2788 		}
2789 	}
2790 
2791 	spin_lock(&iommu->lock);
2792 
2793 	/* Context tables are copied, now write them to the root_entry table */
2794 	for (bus = 0; bus < 256; bus++) {
2795 		int idx = ext ? bus * 2 : bus;
2796 		u64 val;
2797 
2798 		if (ctxt_tbls[idx]) {
2799 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2800 			iommu->root_entry[bus].lo = val;
2801 		}
2802 
2803 		if (!ext || !ctxt_tbls[idx + 1])
2804 			continue;
2805 
2806 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2807 		iommu->root_entry[bus].hi = val;
2808 	}
2809 
2810 	spin_unlock(&iommu->lock);
2811 
2812 	kfree(ctxt_tbls);
2813 
2814 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2815 
2816 	ret = 0;
2817 
2818 out_unmap:
2819 	memunmap(old_rt);
2820 
2821 	return ret;
2822 }
2823 
2824 #ifdef CONFIG_INTEL_IOMMU_SVM
2825 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2826 {
2827 	struct intel_iommu *iommu = data;
2828 	ioasid_t ioasid;
2829 
2830 	if (!iommu)
2831 		return INVALID_IOASID;
2832 	/*
2833 	 * VT-d virtual command interface always uses the full 20 bit
2834 	 * PASID range. Host can partition guest PASID range based on
2835 	 * policies but it is out of guest's control.
2836 	 */
2837 	if (min < PASID_MIN || max > intel_pasid_max_id)
2838 		return INVALID_IOASID;
2839 
2840 	if (vcmd_alloc_pasid(iommu, &ioasid))
2841 		return INVALID_IOASID;
2842 
2843 	return ioasid;
2844 }
2845 
2846 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2847 {
2848 	struct intel_iommu *iommu = data;
2849 
2850 	if (!iommu)
2851 		return;
2852 	/*
2853 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
2854 	 * We can only free the PASID when all the devices are unbound.
2855 	 */
2856 	if (ioasid_find(NULL, ioasid, NULL)) {
2857 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2858 		return;
2859 	}
2860 	vcmd_free_pasid(iommu, ioasid);
2861 }
2862 
2863 static void register_pasid_allocator(struct intel_iommu *iommu)
2864 {
2865 	/*
2866 	 * If we are running in the host, no need for custom allocator
2867 	 * in that PASIDs are allocated from the host system-wide.
2868 	 */
2869 	if (!cap_caching_mode(iommu->cap))
2870 		return;
2871 
2872 	if (!sm_supported(iommu)) {
2873 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2874 		return;
2875 	}
2876 
2877 	/*
2878 	 * Register a custom PASID allocator if we are running in a guest,
2879 	 * guest PASID must be obtained via virtual command interface.
2880 	 * There can be multiple vIOMMUs in each guest but only one allocator
2881 	 * is active. All vIOMMU allocators will eventually be calling the same
2882 	 * host allocator.
2883 	 */
2884 	if (!vccap_pasid(iommu->vccap))
2885 		return;
2886 
2887 	pr_info("Register custom PASID allocator\n");
2888 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2889 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2890 	iommu->pasid_allocator.pdata = (void *)iommu;
2891 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2892 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2893 		/*
2894 		 * Disable scalable mode on this IOMMU if there
2895 		 * is no custom allocator. Mixing SM capable vIOMMU
2896 		 * and non-SM vIOMMU are not supported.
2897 		 */
2898 		intel_iommu_sm = 0;
2899 	}
2900 }
2901 #endif
2902 
2903 static int __init init_dmars(void)
2904 {
2905 	struct dmar_drhd_unit *drhd;
2906 	struct intel_iommu *iommu;
2907 	int ret;
2908 
2909 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2910 	if (ret)
2911 		goto free_iommu;
2912 
2913 	for_each_iommu(iommu, drhd) {
2914 		if (drhd->ignored) {
2915 			iommu_disable_translation(iommu);
2916 			continue;
2917 		}
2918 
2919 		/*
2920 		 * Find the max pasid size of all IOMMU's in the system.
2921 		 * We need to ensure the system pasid table is no bigger
2922 		 * than the smallest supported.
2923 		 */
2924 		if (pasid_supported(iommu)) {
2925 			u32 temp = 2 << ecap_pss(iommu->ecap);
2926 
2927 			intel_pasid_max_id = min_t(u32, temp,
2928 						   intel_pasid_max_id);
2929 		}
2930 
2931 		intel_iommu_init_qi(iommu);
2932 
2933 		ret = iommu_init_domains(iommu);
2934 		if (ret)
2935 			goto free_iommu;
2936 
2937 		init_translation_status(iommu);
2938 
2939 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2940 			iommu_disable_translation(iommu);
2941 			clear_translation_pre_enabled(iommu);
2942 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2943 				iommu->name);
2944 		}
2945 
2946 		/*
2947 		 * TBD:
2948 		 * we could share the same root & context tables
2949 		 * among all IOMMU's. Need to Split it later.
2950 		 */
2951 		ret = iommu_alloc_root_entry(iommu);
2952 		if (ret)
2953 			goto free_iommu;
2954 
2955 		if (translation_pre_enabled(iommu)) {
2956 			pr_info("Translation already enabled - trying to copy translation structures\n");
2957 
2958 			ret = copy_translation_tables(iommu);
2959 			if (ret) {
2960 				/*
2961 				 * We found the IOMMU with translation
2962 				 * enabled - but failed to copy over the
2963 				 * old root-entry table. Try to proceed
2964 				 * by disabling translation now and
2965 				 * allocating a clean root-entry table.
2966 				 * This might cause DMAR faults, but
2967 				 * probably the dump will still succeed.
2968 				 */
2969 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2970 				       iommu->name);
2971 				iommu_disable_translation(iommu);
2972 				clear_translation_pre_enabled(iommu);
2973 			} else {
2974 				pr_info("Copied translation tables from previous kernel for %s\n",
2975 					iommu->name);
2976 			}
2977 		}
2978 
2979 		if (!ecap_pass_through(iommu->ecap))
2980 			hw_pass_through = 0;
2981 		intel_svm_check(iommu);
2982 	}
2983 
2984 	/*
2985 	 * Now that qi is enabled on all iommus, set the root entry and flush
2986 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2987 	 * flush_context function will loop forever and the boot hangs.
2988 	 */
2989 	for_each_active_iommu(iommu, drhd) {
2990 		iommu_flush_write_buffer(iommu);
2991 #ifdef CONFIG_INTEL_IOMMU_SVM
2992 		register_pasid_allocator(iommu);
2993 #endif
2994 		iommu_set_root_entry(iommu);
2995 	}
2996 
2997 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2998 	dmar_map_gfx = 0;
2999 #endif
3000 
3001 	if (!dmar_map_gfx)
3002 		iommu_identity_mapping |= IDENTMAP_GFX;
3003 
3004 	check_tylersburg_isoch();
3005 
3006 	ret = si_domain_init(hw_pass_through);
3007 	if (ret)
3008 		goto free_iommu;
3009 
3010 	/*
3011 	 * for each drhd
3012 	 *   enable fault log
3013 	 *   global invalidate context cache
3014 	 *   global invalidate iotlb
3015 	 *   enable translation
3016 	 */
3017 	for_each_iommu(iommu, drhd) {
3018 		if (drhd->ignored) {
3019 			/*
3020 			 * we always have to disable PMRs or DMA may fail on
3021 			 * this device
3022 			 */
3023 			if (force_on)
3024 				iommu_disable_protect_mem_regions(iommu);
3025 			continue;
3026 		}
3027 
3028 		iommu_flush_write_buffer(iommu);
3029 
3030 #ifdef CONFIG_INTEL_IOMMU_SVM
3031 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3032 			/*
3033 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3034 			 * could cause possible lock race condition.
3035 			 */
3036 			up_write(&dmar_global_lock);
3037 			ret = intel_svm_enable_prq(iommu);
3038 			down_write(&dmar_global_lock);
3039 			if (ret)
3040 				goto free_iommu;
3041 		}
3042 #endif
3043 		ret = dmar_set_interrupt(iommu);
3044 		if (ret)
3045 			goto free_iommu;
3046 	}
3047 
3048 	return 0;
3049 
3050 free_iommu:
3051 	for_each_active_iommu(iommu, drhd) {
3052 		disable_dmar_iommu(iommu);
3053 		free_dmar_iommu(iommu);
3054 	}
3055 
3056 	return ret;
3057 }
3058 
3059 static void __init init_no_remapping_devices(void)
3060 {
3061 	struct dmar_drhd_unit *drhd;
3062 	struct device *dev;
3063 	int i;
3064 
3065 	for_each_drhd_unit(drhd) {
3066 		if (!drhd->include_all) {
3067 			for_each_active_dev_scope(drhd->devices,
3068 						  drhd->devices_cnt, i, dev)
3069 				break;
3070 			/* ignore DMAR unit if no devices exist */
3071 			if (i == drhd->devices_cnt)
3072 				drhd->ignored = 1;
3073 		}
3074 	}
3075 
3076 	for_each_active_drhd_unit(drhd) {
3077 		if (drhd->include_all)
3078 			continue;
3079 
3080 		for_each_active_dev_scope(drhd->devices,
3081 					  drhd->devices_cnt, i, dev)
3082 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3083 				break;
3084 		if (i < drhd->devices_cnt)
3085 			continue;
3086 
3087 		/* This IOMMU has *only* gfx devices. Either bypass it or
3088 		   set the gfx_mapped flag, as appropriate */
3089 		drhd->gfx_dedicated = 1;
3090 		if (!dmar_map_gfx)
3091 			drhd->ignored = 1;
3092 	}
3093 }
3094 
3095 #ifdef CONFIG_SUSPEND
3096 static int init_iommu_hw(void)
3097 {
3098 	struct dmar_drhd_unit *drhd;
3099 	struct intel_iommu *iommu = NULL;
3100 
3101 	for_each_active_iommu(iommu, drhd)
3102 		if (iommu->qi)
3103 			dmar_reenable_qi(iommu);
3104 
3105 	for_each_iommu(iommu, drhd) {
3106 		if (drhd->ignored) {
3107 			/*
3108 			 * we always have to disable PMRs or DMA may fail on
3109 			 * this device
3110 			 */
3111 			if (force_on)
3112 				iommu_disable_protect_mem_regions(iommu);
3113 			continue;
3114 		}
3115 
3116 		iommu_flush_write_buffer(iommu);
3117 		iommu_set_root_entry(iommu);
3118 		iommu_enable_translation(iommu);
3119 		iommu_disable_protect_mem_regions(iommu);
3120 	}
3121 
3122 	return 0;
3123 }
3124 
3125 static void iommu_flush_all(void)
3126 {
3127 	struct dmar_drhd_unit *drhd;
3128 	struct intel_iommu *iommu;
3129 
3130 	for_each_active_iommu(iommu, drhd) {
3131 		iommu->flush.flush_context(iommu, 0, 0, 0,
3132 					   DMA_CCMD_GLOBAL_INVL);
3133 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134 					 DMA_TLB_GLOBAL_FLUSH);
3135 	}
3136 }
3137 
3138 static int iommu_suspend(void)
3139 {
3140 	struct dmar_drhd_unit *drhd;
3141 	struct intel_iommu *iommu = NULL;
3142 	unsigned long flag;
3143 
3144 	for_each_active_iommu(iommu, drhd) {
3145 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3146 					     GFP_KERNEL);
3147 		if (!iommu->iommu_state)
3148 			goto nomem;
3149 	}
3150 
3151 	iommu_flush_all();
3152 
3153 	for_each_active_iommu(iommu, drhd) {
3154 		iommu_disable_translation(iommu);
3155 
3156 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3157 
3158 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3159 			readl(iommu->reg + DMAR_FECTL_REG);
3160 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3161 			readl(iommu->reg + DMAR_FEDATA_REG);
3162 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3163 			readl(iommu->reg + DMAR_FEADDR_REG);
3164 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3165 			readl(iommu->reg + DMAR_FEUADDR_REG);
3166 
3167 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3168 	}
3169 	return 0;
3170 
3171 nomem:
3172 	for_each_active_iommu(iommu, drhd)
3173 		kfree(iommu->iommu_state);
3174 
3175 	return -ENOMEM;
3176 }
3177 
3178 static void iommu_resume(void)
3179 {
3180 	struct dmar_drhd_unit *drhd;
3181 	struct intel_iommu *iommu = NULL;
3182 	unsigned long flag;
3183 
3184 	if (init_iommu_hw()) {
3185 		if (force_on)
3186 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3187 		else
3188 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3189 		return;
3190 	}
3191 
3192 	for_each_active_iommu(iommu, drhd) {
3193 
3194 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3195 
3196 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3197 			iommu->reg + DMAR_FECTL_REG);
3198 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3199 			iommu->reg + DMAR_FEDATA_REG);
3200 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3201 			iommu->reg + DMAR_FEADDR_REG);
3202 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3203 			iommu->reg + DMAR_FEUADDR_REG);
3204 
3205 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3206 	}
3207 
3208 	for_each_active_iommu(iommu, drhd)
3209 		kfree(iommu->iommu_state);
3210 }
3211 
3212 static struct syscore_ops iommu_syscore_ops = {
3213 	.resume		= iommu_resume,
3214 	.suspend	= iommu_suspend,
3215 };
3216 
3217 static void __init init_iommu_pm_ops(void)
3218 {
3219 	register_syscore_ops(&iommu_syscore_ops);
3220 }
3221 
3222 #else
3223 static inline void init_iommu_pm_ops(void) {}
3224 #endif	/* CONFIG_PM */
3225 
3226 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3227 {
3228 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3229 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3230 	    rmrr->end_address <= rmrr->base_address ||
3231 	    arch_rmrr_sanity_check(rmrr))
3232 		return -EINVAL;
3233 
3234 	return 0;
3235 }
3236 
3237 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3238 {
3239 	struct acpi_dmar_reserved_memory *rmrr;
3240 	struct dmar_rmrr_unit *rmrru;
3241 
3242 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3243 	if (rmrr_sanity_check(rmrr)) {
3244 		pr_warn(FW_BUG
3245 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3246 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3247 			   rmrr->base_address, rmrr->end_address,
3248 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3249 			   dmi_get_system_info(DMI_BIOS_VERSION),
3250 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3251 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3252 	}
3253 
3254 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3255 	if (!rmrru)
3256 		goto out;
3257 
3258 	rmrru->hdr = header;
3259 
3260 	rmrru->base_address = rmrr->base_address;
3261 	rmrru->end_address = rmrr->end_address;
3262 
3263 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3264 				((void *)rmrr) + rmrr->header.length,
3265 				&rmrru->devices_cnt);
3266 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3267 		goto free_rmrru;
3268 
3269 	list_add(&rmrru->list, &dmar_rmrr_units);
3270 
3271 	return 0;
3272 free_rmrru:
3273 	kfree(rmrru);
3274 out:
3275 	return -ENOMEM;
3276 }
3277 
3278 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3279 {
3280 	struct dmar_atsr_unit *atsru;
3281 	struct acpi_dmar_atsr *tmp;
3282 
3283 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3284 				dmar_rcu_check()) {
3285 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3286 		if (atsr->segment != tmp->segment)
3287 			continue;
3288 		if (atsr->header.length != tmp->header.length)
3289 			continue;
3290 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3291 			return atsru;
3292 	}
3293 
3294 	return NULL;
3295 }
3296 
3297 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3298 {
3299 	struct acpi_dmar_atsr *atsr;
3300 	struct dmar_atsr_unit *atsru;
3301 
3302 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3303 		return 0;
3304 
3305 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3306 	atsru = dmar_find_atsr(atsr);
3307 	if (atsru)
3308 		return 0;
3309 
3310 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3311 	if (!atsru)
3312 		return -ENOMEM;
3313 
3314 	/*
3315 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3316 	 * copy the memory content because the memory buffer will be freed
3317 	 * on return.
3318 	 */
3319 	atsru->hdr = (void *)(atsru + 1);
3320 	memcpy(atsru->hdr, hdr, hdr->length);
3321 	atsru->include_all = atsr->flags & 0x1;
3322 	if (!atsru->include_all) {
3323 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3324 				(void *)atsr + atsr->header.length,
3325 				&atsru->devices_cnt);
3326 		if (atsru->devices_cnt && atsru->devices == NULL) {
3327 			kfree(atsru);
3328 			return -ENOMEM;
3329 		}
3330 	}
3331 
3332 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3333 
3334 	return 0;
3335 }
3336 
3337 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3338 {
3339 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3340 	kfree(atsru);
3341 }
3342 
3343 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3344 {
3345 	struct acpi_dmar_atsr *atsr;
3346 	struct dmar_atsr_unit *atsru;
3347 
3348 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3349 	atsru = dmar_find_atsr(atsr);
3350 	if (atsru) {
3351 		list_del_rcu(&atsru->list);
3352 		synchronize_rcu();
3353 		intel_iommu_free_atsr(atsru);
3354 	}
3355 
3356 	return 0;
3357 }
3358 
3359 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3360 {
3361 	int i;
3362 	struct device *dev;
3363 	struct acpi_dmar_atsr *atsr;
3364 	struct dmar_atsr_unit *atsru;
3365 
3366 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3367 	atsru = dmar_find_atsr(atsr);
3368 	if (!atsru)
3369 		return 0;
3370 
3371 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3372 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3373 					  i, dev)
3374 			return -EBUSY;
3375 	}
3376 
3377 	return 0;
3378 }
3379 
3380 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3381 {
3382 	struct dmar_satc_unit *satcu;
3383 	struct acpi_dmar_satc *tmp;
3384 
3385 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3386 				dmar_rcu_check()) {
3387 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3388 		if (satc->segment != tmp->segment)
3389 			continue;
3390 		if (satc->header.length != tmp->header.length)
3391 			continue;
3392 		if (memcmp(satc, tmp, satc->header.length) == 0)
3393 			return satcu;
3394 	}
3395 
3396 	return NULL;
3397 }
3398 
3399 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3400 {
3401 	struct acpi_dmar_satc *satc;
3402 	struct dmar_satc_unit *satcu;
3403 
3404 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3405 		return 0;
3406 
3407 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3408 	satcu = dmar_find_satc(satc);
3409 	if (satcu)
3410 		return 0;
3411 
3412 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3413 	if (!satcu)
3414 		return -ENOMEM;
3415 
3416 	satcu->hdr = (void *)(satcu + 1);
3417 	memcpy(satcu->hdr, hdr, hdr->length);
3418 	satcu->atc_required = satc->flags & 0x1;
3419 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3420 					      (void *)satc + satc->header.length,
3421 					      &satcu->devices_cnt);
3422 	if (satcu->devices_cnt && !satcu->devices) {
3423 		kfree(satcu);
3424 		return -ENOMEM;
3425 	}
3426 	list_add_rcu(&satcu->list, &dmar_satc_units);
3427 
3428 	return 0;
3429 }
3430 
3431 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3432 {
3433 	int sp, ret;
3434 	struct intel_iommu *iommu = dmaru->iommu;
3435 
3436 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3437 	if (ret)
3438 		goto out;
3439 
3440 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3441 		pr_warn("%s: Doesn't support hardware pass through.\n",
3442 			iommu->name);
3443 		return -ENXIO;
3444 	}
3445 
3446 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3447 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3448 		pr_warn("%s: Doesn't support large page.\n",
3449 			iommu->name);
3450 		return -ENXIO;
3451 	}
3452 
3453 	/*
3454 	 * Disable translation if already enabled prior to OS handover.
3455 	 */
3456 	if (iommu->gcmd & DMA_GCMD_TE)
3457 		iommu_disable_translation(iommu);
3458 
3459 	ret = iommu_init_domains(iommu);
3460 	if (ret == 0)
3461 		ret = iommu_alloc_root_entry(iommu);
3462 	if (ret)
3463 		goto out;
3464 
3465 	intel_svm_check(iommu);
3466 
3467 	if (dmaru->ignored) {
3468 		/*
3469 		 * we always have to disable PMRs or DMA may fail on this device
3470 		 */
3471 		if (force_on)
3472 			iommu_disable_protect_mem_regions(iommu);
3473 		return 0;
3474 	}
3475 
3476 	intel_iommu_init_qi(iommu);
3477 	iommu_flush_write_buffer(iommu);
3478 
3479 #ifdef CONFIG_INTEL_IOMMU_SVM
3480 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3481 		ret = intel_svm_enable_prq(iommu);
3482 		if (ret)
3483 			goto disable_iommu;
3484 	}
3485 #endif
3486 	ret = dmar_set_interrupt(iommu);
3487 	if (ret)
3488 		goto disable_iommu;
3489 
3490 	iommu_set_root_entry(iommu);
3491 	iommu_enable_translation(iommu);
3492 
3493 	iommu_disable_protect_mem_regions(iommu);
3494 	return 0;
3495 
3496 disable_iommu:
3497 	disable_dmar_iommu(iommu);
3498 out:
3499 	free_dmar_iommu(iommu);
3500 	return ret;
3501 }
3502 
3503 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3504 {
3505 	int ret = 0;
3506 	struct intel_iommu *iommu = dmaru->iommu;
3507 
3508 	if (!intel_iommu_enabled)
3509 		return 0;
3510 	if (iommu == NULL)
3511 		return -EINVAL;
3512 
3513 	if (insert) {
3514 		ret = intel_iommu_add(dmaru);
3515 	} else {
3516 		disable_dmar_iommu(iommu);
3517 		free_dmar_iommu(iommu);
3518 	}
3519 
3520 	return ret;
3521 }
3522 
3523 static void intel_iommu_free_dmars(void)
3524 {
3525 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3526 	struct dmar_atsr_unit *atsru, *atsr_n;
3527 	struct dmar_satc_unit *satcu, *satc_n;
3528 
3529 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3530 		list_del(&rmrru->list);
3531 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3532 		kfree(rmrru);
3533 	}
3534 
3535 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3536 		list_del(&atsru->list);
3537 		intel_iommu_free_atsr(atsru);
3538 	}
3539 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3540 		list_del(&satcu->list);
3541 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3542 		kfree(satcu);
3543 	}
3544 }
3545 
3546 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3547 {
3548 	struct dmar_satc_unit *satcu;
3549 	struct acpi_dmar_satc *satc;
3550 	struct device *tmp;
3551 	int i;
3552 
3553 	dev = pci_physfn(dev);
3554 	rcu_read_lock();
3555 
3556 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3557 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3558 		if (satc->segment != pci_domain_nr(dev->bus))
3559 			continue;
3560 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3561 			if (to_pci_dev(tmp) == dev)
3562 				goto out;
3563 	}
3564 	satcu = NULL;
3565 out:
3566 	rcu_read_unlock();
3567 	return satcu;
3568 }
3569 
3570 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3571 {
3572 	int i, ret = 1;
3573 	struct pci_bus *bus;
3574 	struct pci_dev *bridge = NULL;
3575 	struct device *tmp;
3576 	struct acpi_dmar_atsr *atsr;
3577 	struct dmar_atsr_unit *atsru;
3578 	struct dmar_satc_unit *satcu;
3579 
3580 	dev = pci_physfn(dev);
3581 	satcu = dmar_find_matched_satc_unit(dev);
3582 	if (satcu)
3583 		/*
3584 		 * This device supports ATS as it is in SATC table.
3585 		 * When IOMMU is in legacy mode, enabling ATS is done
3586 		 * automatically by HW for the device that requires
3587 		 * ATS, hence OS should not enable this device ATS
3588 		 * to avoid duplicated TLB invalidation.
3589 		 */
3590 		return !(satcu->atc_required && !sm_supported(iommu));
3591 
3592 	for (bus = dev->bus; bus; bus = bus->parent) {
3593 		bridge = bus->self;
3594 		/* If it's an integrated device, allow ATS */
3595 		if (!bridge)
3596 			return 1;
3597 		/* Connected via non-PCIe: no ATS */
3598 		if (!pci_is_pcie(bridge) ||
3599 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3600 			return 0;
3601 		/* If we found the root port, look it up in the ATSR */
3602 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3603 			break;
3604 	}
3605 
3606 	rcu_read_lock();
3607 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3608 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3609 		if (atsr->segment != pci_domain_nr(dev->bus))
3610 			continue;
3611 
3612 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3613 			if (tmp == &bridge->dev)
3614 				goto out;
3615 
3616 		if (atsru->include_all)
3617 			goto out;
3618 	}
3619 	ret = 0;
3620 out:
3621 	rcu_read_unlock();
3622 
3623 	return ret;
3624 }
3625 
3626 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3627 {
3628 	int ret;
3629 	struct dmar_rmrr_unit *rmrru;
3630 	struct dmar_atsr_unit *atsru;
3631 	struct dmar_satc_unit *satcu;
3632 	struct acpi_dmar_atsr *atsr;
3633 	struct acpi_dmar_reserved_memory *rmrr;
3634 	struct acpi_dmar_satc *satc;
3635 
3636 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3637 		return 0;
3638 
3639 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3640 		rmrr = container_of(rmrru->hdr,
3641 				    struct acpi_dmar_reserved_memory, header);
3642 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3643 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3644 				((void *)rmrr) + rmrr->header.length,
3645 				rmrr->segment, rmrru->devices,
3646 				rmrru->devices_cnt);
3647 			if (ret < 0)
3648 				return ret;
3649 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3650 			dmar_remove_dev_scope(info, rmrr->segment,
3651 				rmrru->devices, rmrru->devices_cnt);
3652 		}
3653 	}
3654 
3655 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3656 		if (atsru->include_all)
3657 			continue;
3658 
3659 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3660 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3661 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3662 					(void *)atsr + atsr->header.length,
3663 					atsr->segment, atsru->devices,
3664 					atsru->devices_cnt);
3665 			if (ret > 0)
3666 				break;
3667 			else if (ret < 0)
3668 				return ret;
3669 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3670 			if (dmar_remove_dev_scope(info, atsr->segment,
3671 					atsru->devices, atsru->devices_cnt))
3672 				break;
3673 		}
3674 	}
3675 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3676 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3677 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3678 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3679 					(void *)satc + satc->header.length,
3680 					satc->segment, satcu->devices,
3681 					satcu->devices_cnt);
3682 			if (ret > 0)
3683 				break;
3684 			else if (ret < 0)
3685 				return ret;
3686 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3687 			if (dmar_remove_dev_scope(info, satc->segment,
3688 					satcu->devices, satcu->devices_cnt))
3689 				break;
3690 		}
3691 	}
3692 
3693 	return 0;
3694 }
3695 
3696 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3697 				       unsigned long val, void *v)
3698 {
3699 	struct memory_notify *mhp = v;
3700 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3701 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3702 			mhp->nr_pages - 1);
3703 
3704 	switch (val) {
3705 	case MEM_GOING_ONLINE:
3706 		if (iommu_domain_identity_map(si_domain,
3707 					      start_vpfn, last_vpfn)) {
3708 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3709 				start_vpfn, last_vpfn);
3710 			return NOTIFY_BAD;
3711 		}
3712 		break;
3713 
3714 	case MEM_OFFLINE:
3715 	case MEM_CANCEL_ONLINE:
3716 		{
3717 			struct dmar_drhd_unit *drhd;
3718 			struct intel_iommu *iommu;
3719 			LIST_HEAD(freelist);
3720 
3721 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3722 
3723 			rcu_read_lock();
3724 			for_each_active_iommu(iommu, drhd)
3725 				iommu_flush_iotlb_psi(iommu, si_domain,
3726 					start_vpfn, mhp->nr_pages,
3727 					list_empty(&freelist), 0);
3728 			rcu_read_unlock();
3729 			put_pages_list(&freelist);
3730 		}
3731 		break;
3732 	}
3733 
3734 	return NOTIFY_OK;
3735 }
3736 
3737 static struct notifier_block intel_iommu_memory_nb = {
3738 	.notifier_call = intel_iommu_memory_notifier,
3739 	.priority = 0
3740 };
3741 
3742 static void intel_disable_iommus(void)
3743 {
3744 	struct intel_iommu *iommu = NULL;
3745 	struct dmar_drhd_unit *drhd;
3746 
3747 	for_each_iommu(iommu, drhd)
3748 		iommu_disable_translation(iommu);
3749 }
3750 
3751 void intel_iommu_shutdown(void)
3752 {
3753 	struct dmar_drhd_unit *drhd;
3754 	struct intel_iommu *iommu = NULL;
3755 
3756 	if (no_iommu || dmar_disabled)
3757 		return;
3758 
3759 	down_write(&dmar_global_lock);
3760 
3761 	/* Disable PMRs explicitly here. */
3762 	for_each_iommu(iommu, drhd)
3763 		iommu_disable_protect_mem_regions(iommu);
3764 
3765 	/* Make sure the IOMMUs are switched off */
3766 	intel_disable_iommus();
3767 
3768 	up_write(&dmar_global_lock);
3769 }
3770 
3771 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3772 {
3773 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3774 
3775 	return container_of(iommu_dev, struct intel_iommu, iommu);
3776 }
3777 
3778 static ssize_t version_show(struct device *dev,
3779 			    struct device_attribute *attr, char *buf)
3780 {
3781 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3782 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3783 	return sprintf(buf, "%d:%d\n",
3784 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3785 }
3786 static DEVICE_ATTR_RO(version);
3787 
3788 static ssize_t address_show(struct device *dev,
3789 			    struct device_attribute *attr, char *buf)
3790 {
3791 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3793 }
3794 static DEVICE_ATTR_RO(address);
3795 
3796 static ssize_t cap_show(struct device *dev,
3797 			struct device_attribute *attr, char *buf)
3798 {
3799 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 	return sprintf(buf, "%llx\n", iommu->cap);
3801 }
3802 static DEVICE_ATTR_RO(cap);
3803 
3804 static ssize_t ecap_show(struct device *dev,
3805 			 struct device_attribute *attr, char *buf)
3806 {
3807 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3808 	return sprintf(buf, "%llx\n", iommu->ecap);
3809 }
3810 static DEVICE_ATTR_RO(ecap);
3811 
3812 static ssize_t domains_supported_show(struct device *dev,
3813 				      struct device_attribute *attr, char *buf)
3814 {
3815 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3816 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3817 }
3818 static DEVICE_ATTR_RO(domains_supported);
3819 
3820 static ssize_t domains_used_show(struct device *dev,
3821 				 struct device_attribute *attr, char *buf)
3822 {
3823 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3824 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3825 						  cap_ndoms(iommu->cap)));
3826 }
3827 static DEVICE_ATTR_RO(domains_used);
3828 
3829 static struct attribute *intel_iommu_attrs[] = {
3830 	&dev_attr_version.attr,
3831 	&dev_attr_address.attr,
3832 	&dev_attr_cap.attr,
3833 	&dev_attr_ecap.attr,
3834 	&dev_attr_domains_supported.attr,
3835 	&dev_attr_domains_used.attr,
3836 	NULL,
3837 };
3838 
3839 static struct attribute_group intel_iommu_group = {
3840 	.name = "intel-iommu",
3841 	.attrs = intel_iommu_attrs,
3842 };
3843 
3844 const struct attribute_group *intel_iommu_groups[] = {
3845 	&intel_iommu_group,
3846 	NULL,
3847 };
3848 
3849 static inline bool has_external_pci(void)
3850 {
3851 	struct pci_dev *pdev = NULL;
3852 
3853 	for_each_pci_dev(pdev)
3854 		if (pdev->external_facing)
3855 			return true;
3856 
3857 	return false;
3858 }
3859 
3860 static int __init platform_optin_force_iommu(void)
3861 {
3862 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3863 		return 0;
3864 
3865 	if (no_iommu || dmar_disabled)
3866 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3867 
3868 	/*
3869 	 * If Intel-IOMMU is disabled by default, we will apply identity
3870 	 * map for all devices except those marked as being untrusted.
3871 	 */
3872 	if (dmar_disabled)
3873 		iommu_set_default_passthrough(false);
3874 
3875 	dmar_disabled = 0;
3876 	no_iommu = 0;
3877 
3878 	return 1;
3879 }
3880 
3881 static int __init probe_acpi_namespace_devices(void)
3882 {
3883 	struct dmar_drhd_unit *drhd;
3884 	/* To avoid a -Wunused-but-set-variable warning. */
3885 	struct intel_iommu *iommu __maybe_unused;
3886 	struct device *dev;
3887 	int i, ret = 0;
3888 
3889 	for_each_active_iommu(iommu, drhd) {
3890 		for_each_active_dev_scope(drhd->devices,
3891 					  drhd->devices_cnt, i, dev) {
3892 			struct acpi_device_physical_node *pn;
3893 			struct iommu_group *group;
3894 			struct acpi_device *adev;
3895 
3896 			if (dev->bus != &acpi_bus_type)
3897 				continue;
3898 
3899 			adev = to_acpi_device(dev);
3900 			mutex_lock(&adev->physical_node_lock);
3901 			list_for_each_entry(pn,
3902 					    &adev->physical_node_list, node) {
3903 				group = iommu_group_get(pn->dev);
3904 				if (group) {
3905 					iommu_group_put(group);
3906 					continue;
3907 				}
3908 
3909 				ret = iommu_probe_device(pn->dev);
3910 				if (ret)
3911 					break;
3912 			}
3913 			mutex_unlock(&adev->physical_node_lock);
3914 
3915 			if (ret)
3916 				return ret;
3917 		}
3918 	}
3919 
3920 	return 0;
3921 }
3922 
3923 static __init int tboot_force_iommu(void)
3924 {
3925 	if (!tboot_enabled())
3926 		return 0;
3927 
3928 	if (no_iommu || dmar_disabled)
3929 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3930 
3931 	dmar_disabled = 0;
3932 	no_iommu = 0;
3933 
3934 	return 1;
3935 }
3936 
3937 int __init intel_iommu_init(void)
3938 {
3939 	int ret = -ENODEV;
3940 	struct dmar_drhd_unit *drhd;
3941 	struct intel_iommu *iommu;
3942 
3943 	/*
3944 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3945 	 * opt in, so enforce that.
3946 	 */
3947 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3948 		    platform_optin_force_iommu();
3949 
3950 	down_write(&dmar_global_lock);
3951 	if (dmar_table_init()) {
3952 		if (force_on)
3953 			panic("tboot: Failed to initialize DMAR table\n");
3954 		goto out_free_dmar;
3955 	}
3956 
3957 	if (dmar_dev_scope_init() < 0) {
3958 		if (force_on)
3959 			panic("tboot: Failed to initialize DMAR device scope\n");
3960 		goto out_free_dmar;
3961 	}
3962 
3963 	up_write(&dmar_global_lock);
3964 
3965 	/*
3966 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3967 	 * complain later when we register it under the lock.
3968 	 */
3969 	dmar_register_bus_notifier();
3970 
3971 	down_write(&dmar_global_lock);
3972 
3973 	if (!no_iommu)
3974 		intel_iommu_debugfs_init();
3975 
3976 	if (no_iommu || dmar_disabled) {
3977 		/*
3978 		 * We exit the function here to ensure IOMMU's remapping and
3979 		 * mempool aren't setup, which means that the IOMMU's PMRs
3980 		 * won't be disabled via the call to init_dmars(). So disable
3981 		 * it explicitly here. The PMRs were setup by tboot prior to
3982 		 * calling SENTER, but the kernel is expected to reset/tear
3983 		 * down the PMRs.
3984 		 */
3985 		if (intel_iommu_tboot_noforce) {
3986 			for_each_iommu(iommu, drhd)
3987 				iommu_disable_protect_mem_regions(iommu);
3988 		}
3989 
3990 		/*
3991 		 * Make sure the IOMMUs are switched off, even when we
3992 		 * boot into a kexec kernel and the previous kernel left
3993 		 * them enabled
3994 		 */
3995 		intel_disable_iommus();
3996 		goto out_free_dmar;
3997 	}
3998 
3999 	if (list_empty(&dmar_rmrr_units))
4000 		pr_info("No RMRR found\n");
4001 
4002 	if (list_empty(&dmar_atsr_units))
4003 		pr_info("No ATSR found\n");
4004 
4005 	if (list_empty(&dmar_satc_units))
4006 		pr_info("No SATC found\n");
4007 
4008 	init_no_remapping_devices();
4009 
4010 	ret = init_dmars();
4011 	if (ret) {
4012 		if (force_on)
4013 			panic("tboot: Failed to initialize DMARs\n");
4014 		pr_err("Initialization failed\n");
4015 		goto out_free_dmar;
4016 	}
4017 	up_write(&dmar_global_lock);
4018 
4019 	init_iommu_pm_ops();
4020 
4021 	down_read(&dmar_global_lock);
4022 	for_each_active_iommu(iommu, drhd) {
4023 		/*
4024 		 * The flush queue implementation does not perform
4025 		 * page-selective invalidations that are required for efficient
4026 		 * TLB flushes in virtual environments.  The benefit of batching
4027 		 * is likely to be much lower than the overhead of synchronizing
4028 		 * the virtual and physical IOMMU page-tables.
4029 		 */
4030 		if (cap_caching_mode(iommu->cap)) {
4031 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4032 			iommu_set_dma_strict();
4033 		}
4034 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4035 				       intel_iommu_groups,
4036 				       "%s", iommu->name);
4037 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4038 	}
4039 	up_read(&dmar_global_lock);
4040 
4041 	if (si_domain && !hw_pass_through)
4042 		register_memory_notifier(&intel_iommu_memory_nb);
4043 
4044 	down_read(&dmar_global_lock);
4045 	if (probe_acpi_namespace_devices())
4046 		pr_warn("ACPI name space devices didn't probe correctly\n");
4047 
4048 	/* Finally, we enable the DMA remapping hardware. */
4049 	for_each_iommu(iommu, drhd) {
4050 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4051 			iommu_enable_translation(iommu);
4052 
4053 		iommu_disable_protect_mem_regions(iommu);
4054 	}
4055 	up_read(&dmar_global_lock);
4056 
4057 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4058 
4059 	intel_iommu_enabled = 1;
4060 
4061 	return 0;
4062 
4063 out_free_dmar:
4064 	intel_iommu_free_dmars();
4065 	up_write(&dmar_global_lock);
4066 	return ret;
4067 }
4068 
4069 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4070 {
4071 	struct device_domain_info *info = opaque;
4072 
4073 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4074 	return 0;
4075 }
4076 
4077 /*
4078  * NB - intel-iommu lacks any sort of reference counting for the users of
4079  * dependent devices.  If multiple endpoints have intersecting dependent
4080  * devices, unbinding the driver from any one of them will possibly leave
4081  * the others unable to operate.
4082  */
4083 static void domain_context_clear(struct device_domain_info *info)
4084 {
4085 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4086 		return;
4087 
4088 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4089 			       &domain_context_clear_one_cb, info);
4090 }
4091 
4092 static void dmar_remove_one_dev_info(struct device *dev)
4093 {
4094 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4095 	struct dmar_domain *domain = info->domain;
4096 	struct intel_iommu *iommu = info->iommu;
4097 	unsigned long flags;
4098 
4099 	if (!dev_is_real_dma_subdevice(info->dev)) {
4100 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4101 			intel_pasid_tear_down_entry(iommu, info->dev,
4102 					PASID_RID2PASID, false);
4103 
4104 		iommu_disable_dev_iotlb(info);
4105 		domain_context_clear(info);
4106 		intel_pasid_free_table(info->dev);
4107 	}
4108 
4109 	spin_lock_irqsave(&domain->lock, flags);
4110 	list_del(&info->link);
4111 	spin_unlock_irqrestore(&domain->lock, flags);
4112 
4113 	domain_detach_iommu(domain, iommu);
4114 	info->domain = NULL;
4115 }
4116 
4117 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4118 {
4119 	int adjust_width;
4120 
4121 	/* calculate AGAW */
4122 	domain->gaw = guest_width;
4123 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4124 	domain->agaw = width_to_agaw(adjust_width);
4125 
4126 	domain->iommu_coherency = false;
4127 	domain->iommu_superpage = 0;
4128 	domain->max_addr = 0;
4129 
4130 	/* always allocate the top pgd */
4131 	domain->pgd = alloc_pgtable_page(domain->nid);
4132 	if (!domain->pgd)
4133 		return -ENOMEM;
4134 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4135 	return 0;
4136 }
4137 
4138 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4139 {
4140 	struct dmar_domain *dmar_domain;
4141 	struct iommu_domain *domain;
4142 
4143 	switch (type) {
4144 	case IOMMU_DOMAIN_DMA:
4145 	case IOMMU_DOMAIN_DMA_FQ:
4146 	case IOMMU_DOMAIN_UNMANAGED:
4147 		dmar_domain = alloc_domain(type);
4148 		if (!dmar_domain) {
4149 			pr_err("Can't allocate dmar_domain\n");
4150 			return NULL;
4151 		}
4152 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4153 			pr_err("Domain initialization failed\n");
4154 			domain_exit(dmar_domain);
4155 			return NULL;
4156 		}
4157 
4158 		domain = &dmar_domain->domain;
4159 		domain->geometry.aperture_start = 0;
4160 		domain->geometry.aperture_end   =
4161 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4162 		domain->geometry.force_aperture = true;
4163 
4164 		return domain;
4165 	case IOMMU_DOMAIN_IDENTITY:
4166 		return &si_domain->domain;
4167 	default:
4168 		return NULL;
4169 	}
4170 
4171 	return NULL;
4172 }
4173 
4174 static void intel_iommu_domain_free(struct iommu_domain *domain)
4175 {
4176 	if (domain != &si_domain->domain)
4177 		domain_exit(to_dmar_domain(domain));
4178 }
4179 
4180 static int prepare_domain_attach_device(struct iommu_domain *domain,
4181 					struct device *dev)
4182 {
4183 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4184 	struct intel_iommu *iommu;
4185 	int addr_width;
4186 
4187 	iommu = device_to_iommu(dev, NULL, NULL);
4188 	if (!iommu)
4189 		return -ENODEV;
4190 
4191 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4192 		return -EOPNOTSUPP;
4193 
4194 	/* check if this iommu agaw is sufficient for max mapped address */
4195 	addr_width = agaw_to_width(iommu->agaw);
4196 	if (addr_width > cap_mgaw(iommu->cap))
4197 		addr_width = cap_mgaw(iommu->cap);
4198 
4199 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4200 		dev_err(dev, "%s: iommu width (%d) is not "
4201 		        "sufficient for the mapped address (%llx)\n",
4202 		        __func__, addr_width, dmar_domain->max_addr);
4203 		return -EFAULT;
4204 	}
4205 	dmar_domain->gaw = addr_width;
4206 
4207 	/*
4208 	 * Knock out extra levels of page tables if necessary
4209 	 */
4210 	while (iommu->agaw < dmar_domain->agaw) {
4211 		struct dma_pte *pte;
4212 
4213 		pte = dmar_domain->pgd;
4214 		if (dma_pte_present(pte)) {
4215 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4216 			free_pgtable_page(pte);
4217 		}
4218 		dmar_domain->agaw--;
4219 	}
4220 
4221 	return 0;
4222 }
4223 
4224 static int intel_iommu_attach_device(struct iommu_domain *domain,
4225 				     struct device *dev)
4226 {
4227 	int ret;
4228 
4229 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4230 	    device_is_rmrr_locked(dev)) {
4231 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4232 		return -EPERM;
4233 	}
4234 
4235 	/* normally dev is not mapped */
4236 	if (unlikely(domain_context_mapped(dev))) {
4237 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4238 
4239 		if (info->domain)
4240 			dmar_remove_one_dev_info(dev);
4241 	}
4242 
4243 	ret = prepare_domain_attach_device(domain, dev);
4244 	if (ret)
4245 		return ret;
4246 
4247 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4248 }
4249 
4250 static void intel_iommu_detach_device(struct iommu_domain *domain,
4251 				      struct device *dev)
4252 {
4253 	dmar_remove_one_dev_info(dev);
4254 }
4255 
4256 static int intel_iommu_map(struct iommu_domain *domain,
4257 			   unsigned long iova, phys_addr_t hpa,
4258 			   size_t size, int iommu_prot, gfp_t gfp)
4259 {
4260 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4261 	u64 max_addr;
4262 	int prot = 0;
4263 
4264 	if (iommu_prot & IOMMU_READ)
4265 		prot |= DMA_PTE_READ;
4266 	if (iommu_prot & IOMMU_WRITE)
4267 		prot |= DMA_PTE_WRITE;
4268 	if (dmar_domain->set_pte_snp)
4269 		prot |= DMA_PTE_SNP;
4270 
4271 	max_addr = iova + size;
4272 	if (dmar_domain->max_addr < max_addr) {
4273 		u64 end;
4274 
4275 		/* check if minimum agaw is sufficient for mapped address */
4276 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4277 		if (end < max_addr) {
4278 			pr_err("%s: iommu width (%d) is not "
4279 			       "sufficient for the mapped address (%llx)\n",
4280 			       __func__, dmar_domain->gaw, max_addr);
4281 			return -EFAULT;
4282 		}
4283 		dmar_domain->max_addr = max_addr;
4284 	}
4285 	/* Round up size to next multiple of PAGE_SIZE, if it and
4286 	   the low bits of hpa would take us onto the next page */
4287 	size = aligned_nrpages(hpa, size);
4288 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4289 				hpa >> VTD_PAGE_SHIFT, size, prot);
4290 }
4291 
4292 static int intel_iommu_map_pages(struct iommu_domain *domain,
4293 				 unsigned long iova, phys_addr_t paddr,
4294 				 size_t pgsize, size_t pgcount,
4295 				 int prot, gfp_t gfp, size_t *mapped)
4296 {
4297 	unsigned long pgshift = __ffs(pgsize);
4298 	size_t size = pgcount << pgshift;
4299 	int ret;
4300 
4301 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4302 		return -EINVAL;
4303 
4304 	if (!IS_ALIGNED(iova | paddr, pgsize))
4305 		return -EINVAL;
4306 
4307 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4308 	if (!ret && mapped)
4309 		*mapped = size;
4310 
4311 	return ret;
4312 }
4313 
4314 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4315 				unsigned long iova, size_t size,
4316 				struct iommu_iotlb_gather *gather)
4317 {
4318 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4319 	unsigned long start_pfn, last_pfn;
4320 	int level = 0;
4321 
4322 	/* Cope with horrid API which requires us to unmap more than the
4323 	   size argument if it happens to be a large-page mapping. */
4324 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4325 
4326 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4327 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4328 
4329 	start_pfn = iova >> VTD_PAGE_SHIFT;
4330 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4331 
4332 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4333 
4334 	if (dmar_domain->max_addr == iova + size)
4335 		dmar_domain->max_addr = iova;
4336 
4337 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4338 
4339 	return size;
4340 }
4341 
4342 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4343 				      unsigned long iova,
4344 				      size_t pgsize, size_t pgcount,
4345 				      struct iommu_iotlb_gather *gather)
4346 {
4347 	unsigned long pgshift = __ffs(pgsize);
4348 	size_t size = pgcount << pgshift;
4349 
4350 	return intel_iommu_unmap(domain, iova, size, gather);
4351 }
4352 
4353 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4354 				 struct iommu_iotlb_gather *gather)
4355 {
4356 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4357 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4358 	size_t size = gather->end - gather->start;
4359 	struct iommu_domain_info *info;
4360 	unsigned long start_pfn;
4361 	unsigned long nrpages;
4362 	unsigned long i;
4363 
4364 	nrpages = aligned_nrpages(gather->start, size);
4365 	start_pfn = mm_to_dma_pfn(iova_pfn);
4366 
4367 	xa_for_each(&dmar_domain->iommu_array, i, info)
4368 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4369 				      start_pfn, nrpages,
4370 				      list_empty(&gather->freelist), 0);
4371 
4372 	put_pages_list(&gather->freelist);
4373 }
4374 
4375 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4376 					    dma_addr_t iova)
4377 {
4378 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4379 	struct dma_pte *pte;
4380 	int level = 0;
4381 	u64 phys = 0;
4382 
4383 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4384 	if (pte && dma_pte_present(pte))
4385 		phys = dma_pte_addr(pte) +
4386 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4387 						VTD_PAGE_SHIFT) - 1));
4388 
4389 	return phys;
4390 }
4391 
4392 static bool domain_support_force_snooping(struct dmar_domain *domain)
4393 {
4394 	struct device_domain_info *info;
4395 	bool support = true;
4396 
4397 	assert_spin_locked(&domain->lock);
4398 	list_for_each_entry(info, &domain->devices, link) {
4399 		if (!ecap_sc_support(info->iommu->ecap)) {
4400 			support = false;
4401 			break;
4402 		}
4403 	}
4404 
4405 	return support;
4406 }
4407 
4408 static void domain_set_force_snooping(struct dmar_domain *domain)
4409 {
4410 	struct device_domain_info *info;
4411 
4412 	assert_spin_locked(&domain->lock);
4413 	/*
4414 	 * Second level page table supports per-PTE snoop control. The
4415 	 * iommu_map() interface will handle this by setting SNP bit.
4416 	 */
4417 	if (!domain_use_first_level(domain)) {
4418 		domain->set_pte_snp = true;
4419 		return;
4420 	}
4421 
4422 	list_for_each_entry(info, &domain->devices, link)
4423 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4424 						     PASID_RID2PASID);
4425 }
4426 
4427 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4428 {
4429 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4430 	unsigned long flags;
4431 
4432 	if (dmar_domain->force_snooping)
4433 		return true;
4434 
4435 	spin_lock_irqsave(&dmar_domain->lock, flags);
4436 	if (!domain_support_force_snooping(dmar_domain)) {
4437 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4438 		return false;
4439 	}
4440 
4441 	domain_set_force_snooping(dmar_domain);
4442 	dmar_domain->force_snooping = true;
4443 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4444 
4445 	return true;
4446 }
4447 
4448 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4449 {
4450 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4451 		return true;
4452 	if (cap == IOMMU_CAP_INTR_REMAP)
4453 		return irq_remapping_enabled == 1;
4454 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4455 		return dmar_platform_optin();
4456 
4457 	return false;
4458 }
4459 
4460 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4461 {
4462 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4463 	struct device_domain_info *info;
4464 	struct intel_iommu *iommu;
4465 	u8 bus, devfn;
4466 
4467 	iommu = device_to_iommu(dev, &bus, &devfn);
4468 	if (!iommu || !iommu->iommu.ops)
4469 		return ERR_PTR(-ENODEV);
4470 
4471 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4472 	if (!info)
4473 		return ERR_PTR(-ENOMEM);
4474 
4475 	if (dev_is_real_dma_subdevice(dev)) {
4476 		info->bus = pdev->bus->number;
4477 		info->devfn = pdev->devfn;
4478 		info->segment = pci_domain_nr(pdev->bus);
4479 	} else {
4480 		info->bus = bus;
4481 		info->devfn = devfn;
4482 		info->segment = iommu->segment;
4483 	}
4484 
4485 	info->dev = dev;
4486 	info->iommu = iommu;
4487 	if (dev_is_pci(dev)) {
4488 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4489 		    pci_ats_supported(pdev) &&
4490 		    dmar_ats_supported(pdev, iommu))
4491 			info->ats_supported = 1;
4492 
4493 		if (sm_supported(iommu)) {
4494 			if (pasid_supported(iommu)) {
4495 				int features = pci_pasid_features(pdev);
4496 
4497 				if (features >= 0)
4498 					info->pasid_supported = features | 1;
4499 			}
4500 
4501 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4502 			    pci_pri_supported(pdev))
4503 				info->pri_supported = 1;
4504 		}
4505 	}
4506 
4507 	dev_iommu_priv_set(dev, info);
4508 
4509 	return &iommu->iommu;
4510 }
4511 
4512 static void intel_iommu_release_device(struct device *dev)
4513 {
4514 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4515 
4516 	dmar_remove_one_dev_info(dev);
4517 	dev_iommu_priv_set(dev, NULL);
4518 	kfree(info);
4519 	set_dma_ops(dev, NULL);
4520 }
4521 
4522 static void intel_iommu_probe_finalize(struct device *dev)
4523 {
4524 	set_dma_ops(dev, NULL);
4525 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4526 }
4527 
4528 static void intel_iommu_get_resv_regions(struct device *device,
4529 					 struct list_head *head)
4530 {
4531 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4532 	struct iommu_resv_region *reg;
4533 	struct dmar_rmrr_unit *rmrr;
4534 	struct device *i_dev;
4535 	int i;
4536 
4537 	down_read(&dmar_global_lock);
4538 	for_each_rmrr_units(rmrr) {
4539 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4540 					  i, i_dev) {
4541 			struct iommu_resv_region *resv;
4542 			enum iommu_resv_type type;
4543 			size_t length;
4544 
4545 			if (i_dev != device &&
4546 			    !is_downstream_to_pci_bridge(device, i_dev))
4547 				continue;
4548 
4549 			length = rmrr->end_address - rmrr->base_address + 1;
4550 
4551 			type = device_rmrr_is_relaxable(device) ?
4552 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4553 
4554 			resv = iommu_alloc_resv_region(rmrr->base_address,
4555 						       length, prot, type);
4556 			if (!resv)
4557 				break;
4558 
4559 			list_add_tail(&resv->list, head);
4560 		}
4561 	}
4562 	up_read(&dmar_global_lock);
4563 
4564 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4565 	if (dev_is_pci(device)) {
4566 		struct pci_dev *pdev = to_pci_dev(device);
4567 
4568 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4569 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4570 						   IOMMU_RESV_DIRECT_RELAXABLE);
4571 			if (reg)
4572 				list_add_tail(&reg->list, head);
4573 		}
4574 	}
4575 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4576 
4577 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4578 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4579 				      0, IOMMU_RESV_MSI);
4580 	if (!reg)
4581 		return;
4582 	list_add_tail(&reg->list, head);
4583 }
4584 
4585 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4586 {
4587 	if (dev_is_pci(dev))
4588 		return pci_device_group(dev);
4589 	return generic_device_group(dev);
4590 }
4591 
4592 static int intel_iommu_enable_sva(struct device *dev)
4593 {
4594 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4595 	struct intel_iommu *iommu;
4596 	int ret;
4597 
4598 	if (!info || dmar_disabled)
4599 		return -EINVAL;
4600 
4601 	iommu = info->iommu;
4602 	if (!iommu)
4603 		return -EINVAL;
4604 
4605 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4606 		return -ENODEV;
4607 
4608 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4609 		return -EINVAL;
4610 
4611 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4612 	if (!ret)
4613 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4614 
4615 	return ret;
4616 }
4617 
4618 static int intel_iommu_disable_sva(struct device *dev)
4619 {
4620 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4621 	struct intel_iommu *iommu = info->iommu;
4622 	int ret;
4623 
4624 	ret = iommu_unregister_device_fault_handler(dev);
4625 	if (!ret)
4626 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4627 
4628 	return ret;
4629 }
4630 
4631 static int intel_iommu_enable_iopf(struct device *dev)
4632 {
4633 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4634 
4635 	if (info && info->pri_supported)
4636 		return 0;
4637 
4638 	return -ENODEV;
4639 }
4640 
4641 static int
4642 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4643 {
4644 	switch (feat) {
4645 	case IOMMU_DEV_FEAT_IOPF:
4646 		return intel_iommu_enable_iopf(dev);
4647 
4648 	case IOMMU_DEV_FEAT_SVA:
4649 		return intel_iommu_enable_sva(dev);
4650 
4651 	default:
4652 		return -ENODEV;
4653 	}
4654 }
4655 
4656 static int
4657 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4658 {
4659 	switch (feat) {
4660 	case IOMMU_DEV_FEAT_IOPF:
4661 		return 0;
4662 
4663 	case IOMMU_DEV_FEAT_SVA:
4664 		return intel_iommu_disable_sva(dev);
4665 
4666 	default:
4667 		return -ENODEV;
4668 	}
4669 }
4670 
4671 static bool intel_iommu_is_attach_deferred(struct device *dev)
4672 {
4673 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4674 
4675 	return translation_pre_enabled(info->iommu) && !info->domain;
4676 }
4677 
4678 /*
4679  * Check that the device does not live on an external facing PCI port that is
4680  * marked as untrusted. Such devices should not be able to apply quirks and
4681  * thus not be able to bypass the IOMMU restrictions.
4682  */
4683 static bool risky_device(struct pci_dev *pdev)
4684 {
4685 	if (pdev->untrusted) {
4686 		pci_info(pdev,
4687 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4688 			 pdev->vendor, pdev->device);
4689 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4690 		return true;
4691 	}
4692 	return false;
4693 }
4694 
4695 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4696 				       unsigned long iova, size_t size)
4697 {
4698 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4699 	unsigned long pages = aligned_nrpages(iova, size);
4700 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4701 	struct iommu_domain_info *info;
4702 	unsigned long i;
4703 
4704 	xa_for_each(&dmar_domain->iommu_array, i, info)
4705 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4706 }
4707 
4708 const struct iommu_ops intel_iommu_ops = {
4709 	.capable		= intel_iommu_capable,
4710 	.domain_alloc		= intel_iommu_domain_alloc,
4711 	.probe_device		= intel_iommu_probe_device,
4712 	.probe_finalize		= intel_iommu_probe_finalize,
4713 	.release_device		= intel_iommu_release_device,
4714 	.get_resv_regions	= intel_iommu_get_resv_regions,
4715 	.device_group		= intel_iommu_device_group,
4716 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4717 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4718 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4719 	.def_domain_type	= device_def_domain_type,
4720 	.pgsize_bitmap		= SZ_4K,
4721 #ifdef CONFIG_INTEL_IOMMU_SVM
4722 	.sva_bind		= intel_svm_bind,
4723 	.sva_unbind		= intel_svm_unbind,
4724 	.sva_get_pasid		= intel_svm_get_pasid,
4725 	.page_response		= intel_svm_page_response,
4726 #endif
4727 	.default_domain_ops = &(const struct iommu_domain_ops) {
4728 		.attach_dev		= intel_iommu_attach_device,
4729 		.detach_dev		= intel_iommu_detach_device,
4730 		.map_pages		= intel_iommu_map_pages,
4731 		.unmap_pages		= intel_iommu_unmap_pages,
4732 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4733 		.flush_iotlb_all        = intel_flush_iotlb_all,
4734 		.iotlb_sync		= intel_iommu_tlb_sync,
4735 		.iova_to_phys		= intel_iommu_iova_to_phys,
4736 		.free			= intel_iommu_domain_free,
4737 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4738 	}
4739 };
4740 
4741 static void quirk_iommu_igfx(struct pci_dev *dev)
4742 {
4743 	if (risky_device(dev))
4744 		return;
4745 
4746 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4747 	dmar_map_gfx = 0;
4748 }
4749 
4750 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4756 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4757 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4758 
4759 /* Broadwell igfx malfunctions with dmar */
4760 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4763 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4764 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4765 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4766 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4774 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4784 
4785 static void quirk_iommu_rwbf(struct pci_dev *dev)
4786 {
4787 	if (risky_device(dev))
4788 		return;
4789 
4790 	/*
4791 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4792 	 * but needs it. Same seems to hold for the desktop versions.
4793 	 */
4794 	pci_info(dev, "Forcing write-buffer flush capability\n");
4795 	rwbf_quirk = 1;
4796 }
4797 
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4805 
4806 #define GGC 0x52
4807 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4808 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4809 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4810 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4811 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4812 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4813 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4814 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4815 
4816 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4817 {
4818 	unsigned short ggc;
4819 
4820 	if (risky_device(dev))
4821 		return;
4822 
4823 	if (pci_read_config_word(dev, GGC, &ggc))
4824 		return;
4825 
4826 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4827 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4828 		dmar_map_gfx = 0;
4829 	} else if (dmar_map_gfx) {
4830 		/* we have to ensure the gfx device is idle before we flush */
4831 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4832 		iommu_set_dma_strict();
4833 	}
4834 }
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4839 
4840 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4841 {
4842 	unsigned short ver;
4843 
4844 	if (!IS_GFX_DEVICE(dev))
4845 		return;
4846 
4847 	ver = (dev->device >> 8) & 0xff;
4848 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4849 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4850 	    ver != 0x9a && ver != 0xa7)
4851 		return;
4852 
4853 	if (risky_device(dev))
4854 		return;
4855 
4856 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4857 	iommu_skip_te_disable = 1;
4858 }
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4860 
4861 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4862    ISOCH DMAR unit for the Azalia sound device, but not give it any
4863    TLB entries, which causes it to deadlock. Check for that.  We do
4864    this in a function called from init_dmars(), instead of in a PCI
4865    quirk, because we don't want to print the obnoxious "BIOS broken"
4866    message if VT-d is actually disabled.
4867 */
4868 static void __init check_tylersburg_isoch(void)
4869 {
4870 	struct pci_dev *pdev;
4871 	uint32_t vtisochctrl;
4872 
4873 	/* If there's no Azalia in the system anyway, forget it. */
4874 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4875 	if (!pdev)
4876 		return;
4877 
4878 	if (risky_device(pdev)) {
4879 		pci_dev_put(pdev);
4880 		return;
4881 	}
4882 
4883 	pci_dev_put(pdev);
4884 
4885 	/* System Management Registers. Might be hidden, in which case
4886 	   we can't do the sanity check. But that's OK, because the
4887 	   known-broken BIOSes _don't_ actually hide it, so far. */
4888 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4889 	if (!pdev)
4890 		return;
4891 
4892 	if (risky_device(pdev)) {
4893 		pci_dev_put(pdev);
4894 		return;
4895 	}
4896 
4897 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4898 		pci_dev_put(pdev);
4899 		return;
4900 	}
4901 
4902 	pci_dev_put(pdev);
4903 
4904 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4905 	if (vtisochctrl & 1)
4906 		return;
4907 
4908 	/* Drop all bits other than the number of TLB entries */
4909 	vtisochctrl &= 0x1c;
4910 
4911 	/* If we have the recommended number of TLB entries (16), fine. */
4912 	if (vtisochctrl == 0x10)
4913 		return;
4914 
4915 	/* Zero TLB entries? You get to ride the short bus to school. */
4916 	if (!vtisochctrl) {
4917 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4918 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4919 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4920 		     dmi_get_system_info(DMI_BIOS_VERSION),
4921 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4922 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4923 		return;
4924 	}
4925 
4926 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4927 	       vtisochctrl);
4928 }
4929