xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 6c8c1406)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
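/*
 * Worked example of the limits above (a sketch, assuming VTD_PAGE_SHIFT == 12
 * and a 64-bit unsigned long; other configurations differ):
 *
 *   __DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *   __DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 *   DOMAIN_MAX_PFN(48)    == __DOMAIN_MAX_PFN(48)      (fits in unsigned long)
 *   DOMAIN_MAX_ADDR(48)   == ((1ULL << 36) - 1) << 12  (start of the last page)
 *
 * On a 32-bit kernel DOMAIN_MAX_PFN() would instead clamp to ULONG_MAX.
 */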
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
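/*
 * For illustration (a sketch of the helpers above, following the 9-bit
 * per-level layout encoded by LEVEL_STRIDE):
 *
 *   width 39 -> width_to_agaw() == 1 -> agaw_to_level() == 3 (3-level table)
 *   width 48 -> width_to_agaw() == 2 -> agaw_to_level() == 4 (4-level table)
 *   width 57 -> width_to_agaw() == 3 -> agaw_to_level() == 5 (5-level table)
 *
 * agaw_to_width() is the inverse, capped at MAX_AGAW_WIDTH (64).
 */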
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
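/*
 * A small worked example for the level helpers above (illustrative only,
 * assuming 4KiB VT-d pages):
 *
 *   pfn = 0x12345
 *     pfn_level_offset(pfn, 1) == 0x145   (bits 8:0 of the pfn)
 *     pfn_level_offset(pfn, 2) == 0x091   (bits 17:9)
 *     pfn_level_offset(pfn, 3) == 0x000   (bits 26:18)
 *
 *   level_size(2) == 512 pfns (a 2MiB region per level-2 entry), and
 *   align_to_level(0x12345, 2) rounds the pfn up to 0x12400.
 */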
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
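/*
 * Note on the conversions above: with the common 4KiB kernel page size,
 * PAGE_SHIFT == VTD_PAGE_SHIFT == 12 and mm_to_dma_pfn() is an identity.
 * The shift only matters if the kernel page size were larger than the
 * 4KiB VT-d page size; a (hypothetical here) 16KiB PAGE_SIZE would map
 * each MM pfn to four consecutive VT-d pfns.
 */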
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
134  * (used when the kernel is launched with TXT).

135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
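/*
 * Taken together, the helpers above encode the legacy context entry as
 * follows (a summary derived from the setters; see the VT-d spec for the
 * full layout):
 *
 *   lo[0]      present bit
 *   lo[1]      fault processing disable (cleared, i.e. faults enabled,
 *              by context_set_fault_enable())
 *   lo[3:2]    translation type
 *   lo[63:12]  page-table / pass-through address root
 *   hi[2:0]    address width (AGAW)
 *   hi[23:8]   domain id
 */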
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
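/*
 * The copied_tables bitmap above is indexed by the 16-bit PCI source-id
 * ((bus << 8) | devfn), i.e. one bit per possible BDF behind this IOMMU,
 * 64K bits in total. For example bus 0x3a, devfn 0x10 (device 2,
 * function 0) uses bit 0x3a10.
 */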
237 
238 /*
239  * This domain is a statically identity mapping domain.
240  *	1. This domain creates a static 1:1 mapping of all usable memory.
241  * 	2. It maps to each iommu if successful.
242  *	3. Each iommu maps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void dmar_remove_one_dev_info(struct device *dev);
281 
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284 
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287 
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292 
293 #define IDENTMAP_GFX		2
294 #define IDENTMAP_AZALIA		4
295 
296 const struct iommu_ops intel_iommu_ops;
297 
298 static bool translation_pre_enabled(struct intel_iommu *iommu)
299 {
300 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
301 }
302 
303 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
304 {
305 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
306 }
307 
308 static void init_translation_status(struct intel_iommu *iommu)
309 {
310 	u32 gsts;
311 
312 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
313 	if (gsts & DMA_GSTS_TES)
314 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
315 }
316 
317 static int __init intel_iommu_setup(char *str)
318 {
319 	if (!str)
320 		return -EINVAL;
321 
322 	while (*str) {
323 		if (!strncmp(str, "on", 2)) {
324 			dmar_disabled = 0;
325 			pr_info("IOMMU enabled\n");
326 		} else if (!strncmp(str, "off", 3)) {
327 			dmar_disabled = 1;
328 			no_platform_optin = 1;
329 			pr_info("IOMMU disabled\n");
330 		} else if (!strncmp(str, "igfx_off", 8)) {
331 			dmar_map_gfx = 0;
332 			pr_info("Disable GFX device mapping\n");
333 		} else if (!strncmp(str, "forcedac", 8)) {
334 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335 			iommu_dma_forcedac = true;
336 		} else if (!strncmp(str, "strict", 6)) {
337 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338 			iommu_set_dma_strict();
339 		} else if (!strncmp(str, "sp_off", 6)) {
340 			pr_info("Disable supported super page\n");
341 			intel_iommu_superpage = 0;
342 		} else if (!strncmp(str, "sm_on", 5)) {
343 			pr_info("Enable scalable mode if hardware supports\n");
344 			intel_iommu_sm = 1;
345 		} else if (!strncmp(str, "sm_off", 6)) {
346 			pr_info("Scalable mode is disallowed\n");
347 			intel_iommu_sm = 0;
348 		} else if (!strncmp(str, "tboot_noforce", 13)) {
349 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350 			intel_iommu_tboot_noforce = 1;
351 		} else {
352 			pr_notice("Unknown option - '%s'\n", str);
353 		}
354 
355 		str += strcspn(str, ",");
356 		while (*str == ',')
357 			str++;
358 	}
359 
360 	return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
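/*
 * The parser above accepts a comma-separated option list on the kernel
 * command line, for example:
 *
 *   intel_iommu=on,sm_on            enable the IOMMU and scalable mode
 *   intel_iommu=off                 disable DMA remapping entirely
 *   intel_iommu=on,igfx_off,sp_off  enable, but skip the GFX device and
 *                                   don't use super pages
 *
 * The deprecated forcedac and strict options still take effect, but warn
 * that the iommu.forcedac / iommu.strict parameters should be used instead.
 */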
363 
364 void *alloc_pgtable_page(int node)
365 {
366 	struct page *page;
367 	void *vaddr = NULL;
368 
369 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
370 	if (page)
371 		vaddr = page_address(page);
372 	return vaddr;
373 }
374 
375 void free_pgtable_page(void *vaddr)
376 {
377 	free_page((unsigned long)vaddr);
378 }
379 
380 static inline int domain_type_is_si(struct dmar_domain *domain)
381 {
382 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
383 }
384 
385 static inline bool domain_use_first_level(struct dmar_domain *domain)
386 {
387 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
388 }
389 
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391 				       unsigned long pfn)
392 {
393 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394 
395 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397 
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 	unsigned long fl_sagaw, sl_sagaw;
406 
407 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 	sl_sagaw = cap_sagaw(iommu->cap);
409 
410 	/* Second level only. */
411 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 		return sl_sagaw;
413 
414 	/* First level only. */
415 	if (!ecap_slts(iommu->ecap))
416 		return fl_sagaw;
417 
418 	return fl_sagaw & sl_sagaw;
419 }
420 
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 	unsigned long sagaw;
424 	int agaw;
425 
426 	sagaw = __iommu_calculate_sagaw(iommu);
427 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 		if (test_bit(agaw, &sagaw))
429 			break;
430 	}
431 
432 	return agaw;
433 }
434 
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442 
443 /*
444  * Calculate agaw for each iommu.
445  * "SAGAW" may be different across iommus; use a default agaw, and fall
446  * back to a smaller supported agaw for iommus that don't support it.
447  */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
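/*
 * Example of the AGAW selection above (illustrative; SAGAW bit numbers
 * follow section 11.4.2 of the VT-d spec, where bit 2 means 4-level/48-bit
 * and bit 3 means 5-level/57-bit support):
 *
 *   A scalable-mode IOMMU whose cap_sagaw() is BIT(2) and which lacks
 *   5-level first-level support yields __iommu_calculate_sagaw() == BIT(2).
 *   iommu_calculate_agaw() then starts at width_to_agaw(57) == 3, finds
 *   bit 3 clear, and settles on agaw 2, i.e. a 48-bit, 4-level page table.
 */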
452 
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455 	return sm_supported(iommu) ?
456 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458 
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 	struct iommu_domain_info *info;
462 	struct dmar_drhd_unit *drhd;
463 	struct intel_iommu *iommu;
464 	bool found = false;
465 	unsigned long i;
466 
467 	domain->iommu_coherency = true;
468 	xa_for_each(&domain->iommu_array, i, info) {
469 		found = true;
470 		if (!iommu_paging_structure_coherency(info->iommu)) {
471 			domain->iommu_coherency = false;
472 			break;
473 		}
474 	}
475 	if (found)
476 		return;
477 
478 	/* No hardware attached; use lowest common denominator */
479 	rcu_read_lock();
480 	for_each_active_iommu(iommu, drhd) {
481 		if (!iommu_paging_structure_coherency(iommu)) {
482 			domain->iommu_coherency = false;
483 			break;
484 		}
485 	}
486 	rcu_read_unlock();
487 }
488 
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490 					 struct intel_iommu *skip)
491 {
492 	struct dmar_drhd_unit *drhd;
493 	struct intel_iommu *iommu;
494 	int mask = 0x3;
495 
496 	if (!intel_iommu_superpage)
497 		return 0;
498 
499 	/* set iommu_superpage to the smallest common denominator */
500 	rcu_read_lock();
501 	for_each_active_iommu(iommu, drhd) {
502 		if (iommu != skip) {
503 			if (domain && domain_use_first_level(domain)) {
504 				if (!cap_fl1gp_support(iommu->cap))
505 					mask = 0x1;
506 			} else {
507 				mask &= cap_super_page_val(iommu->cap);
508 			}
509 
510 			if (!mask)
511 				break;
512 		}
513 	}
514 	rcu_read_unlock();
515 
516 	return fls(mask);
517 }
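/*
 * The return value above feeds domain->iommu_superpage (via
 * domain_update_iommu_cap()):
 *
 *   mask == 0x3 -> fls() == 2 -> 2MiB and 1GiB super pages usable
 *   mask == 0x1 -> fls() == 1 -> 2MiB only
 *   mask == 0x0 -> fls() == 0 -> 4KiB mappings only
 *
 * which is what domain_super_pgsize_bitmap() later turns into the
 * domain's pgsize bitmap.
 */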
518 
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 	struct device_domain_info *info;
522 	int nid = NUMA_NO_NODE;
523 	unsigned long flags;
524 
525 	spin_lock_irqsave(&domain->lock, flags);
526 	list_for_each_entry(info, &domain->devices, link) {
527 		/*
528 		 * There could possibly be multiple device NUMA nodes, as devices
529 		 * within the same domain may sit behind different IOMMUs. There
530 		 * isn't a perfect answer in such a situation, so we select a
531 		 * first-come-first-served policy.
532 		 */
533 		nid = dev_to_node(info->dev);
534 		if (nid != NUMA_NO_NODE)
535 			break;
536 	}
537 	spin_unlock_irqrestore(&domain->lock, flags);
538 
539 	return nid;
540 }
541 
542 static void domain_update_iotlb(struct dmar_domain *domain);
543 
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 	unsigned long bitmap = 0;
548 
549 	/*
550 	 * 1-level super page supports page size of 2MiB, 2-level super page
551 	 * supports page size of both 2MiB and 1GiB.
552 	 */
553 	if (domain->iommu_superpage == 1)
554 		bitmap |= SZ_2M;
555 	else if (domain->iommu_superpage == 2)
556 		bitmap |= SZ_2M | SZ_1G;
557 
558 	return bitmap;
559 }
560 
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 	domain_update_iommu_coherency(domain);
565 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566 
567 	/*
568 	 * If RHSA is missing, we should default to the device numa domain
569 	 * as fall back.
570 	 */
571 	if (domain->nid == NUMA_NO_NODE)
572 		domain->nid = domain_update_device_node(domain);
573 
574 	/*
575 	 * First-level translation restricts the input-address to a
576 	 * canonical address (i.e., address bits 63:N have the same
577 	 * value as address bit [N-1], where N is 48-bits with 4-level
578 	 * paging and 57-bits with 5-level paging). Hence, skip bit
579 	 * [N-1].
580 	 */
581 	if (domain_use_first_level(domain))
582 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 	else
584 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585 
586 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 	domain_update_iotlb(domain);
588 }
589 
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591 					 u8 devfn, int alloc)
592 {
593 	struct root_entry *root = &iommu->root_entry[bus];
594 	struct context_entry *context;
595 	u64 *entry;
596 
597 	/*
598 	 * Unless the caller requested to allocate a new entry,
599 	 * returning a copied context entry makes no sense.
600 	 */
601 	if (!alloc && context_copied(iommu, bus, devfn))
602 		return NULL;
603 
604 	entry = &root->lo;
605 	if (sm_supported(iommu)) {
606 		if (devfn >= 0x80) {
607 			devfn -= 0x80;
608 			entry = &root->hi;
609 		}
610 		devfn *= 2;
611 	}
612 	if (*entry & 1)
613 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 	else {
615 		unsigned long phy_addr;
616 		if (!alloc)
617 			return NULL;
618 
619 		context = alloc_pgtable_page(iommu->node);
620 		if (!context)
621 			return NULL;
622 
623 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 		phy_addr = virt_to_phys((void *)context);
625 		*entry = phy_addr | 1;
626 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
627 	}
628 	return &context[devfn];
629 }
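/*
 * Scalable-mode addressing above, by example: a 4KiB context table holds
 * 256 legacy 128-bit entries, but scalable-mode entries are twice as
 * wide, so each table only covers 128 devfns and the root entry is split
 * into lower/upper halves. For bus 0x00, devfn 0x90 on a scalable-mode
 * IOMMU the lookup uses root->hi (devfn >= 0x80), rebases devfn to 0x10,
 * doubles it, and returns &context[0x20].
 */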
630 
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *				 sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 	struct pci_dev *pdev, *pbridge;
643 
644 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 		return false;
646 
647 	pdev = to_pci_dev(dev);
648 	pbridge = to_pci_dev(bridge);
649 
650 	if (pbridge->subordinate &&
651 	    pbridge->subordinate->number <= pdev->bus->number &&
652 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 		return true;
654 
655 	return false;
656 }
657 
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 	struct dmar_drhd_unit *drhd;
661 	u32 vtbar;
662 	int rc;
663 
664 	/* We know that this device on this chipset has its own IOMMU.
665 	 * If we find it under a different IOMMU, then the BIOS is lying
666 	 * to us. Hope that the IOMMU for this device is actually
667 	 * disabled, and it needs no translation...
668 	 */
669 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 	if (rc) {
671 		/* "can't" happen */
672 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 		return false;
674 	}
675 	vtbar &= 0xffff0000;
676 
677 	/* we know that this iommu should be at offset 0xa000 from vtbar */
678 	drhd = dmar_find_matched_drhd_unit(pdev);
679 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 		return true;
683 	}
684 
685 	return false;
686 }
687 
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 	if (!iommu || iommu->drhd->ignored)
691 		return true;
692 
693 	if (dev_is_pci(dev)) {
694 		struct pci_dev *pdev = to_pci_dev(dev);
695 
696 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 		    quirk_ioat_snb_local_iommu(pdev))
699 			return true;
700 	}
701 
702 	return false;
703 }
704 
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 	struct dmar_drhd_unit *drhd = NULL;
708 	struct pci_dev *pdev = NULL;
709 	struct intel_iommu *iommu;
710 	struct device *tmp;
711 	u16 segment = 0;
712 	int i;
713 
714 	if (!dev)
715 		return NULL;
716 
717 	if (dev_is_pci(dev)) {
718 		struct pci_dev *pf_pdev;
719 
720 		pdev = pci_real_dma_dev(to_pci_dev(dev));
721 
722 		/* VFs aren't listed in scope tables; we need to look up
723 		 * the PF instead to find the IOMMU. */
724 		pf_pdev = pci_physfn(pdev);
725 		dev = &pf_pdev->dev;
726 		segment = pci_domain_nr(pdev->bus);
727 	} else if (has_acpi_companion(dev))
728 		dev = &ACPI_COMPANION(dev)->dev;
729 
730 	rcu_read_lock();
731 	for_each_iommu(iommu, drhd) {
732 		if (pdev && segment != drhd->segment)
733 			continue;
734 
735 		for_each_active_dev_scope(drhd->devices,
736 					  drhd->devices_cnt, i, tmp) {
737 			if (tmp == dev) {
738 				/* For a VF use its original BDF# not that of the PF
739 				 * which we used for the IOMMU lookup. Strictly speaking
740 				 * we could do this for all PCI devices; we only need to
741 				 * get the BDF# from the scope table for ACPI matches. */
742 				if (pdev && pdev->is_virtfn)
743 					goto got_pdev;
744 
745 				if (bus && devfn) {
746 					*bus = drhd->devices[i].bus;
747 					*devfn = drhd->devices[i].devfn;
748 				}
749 				goto out;
750 			}
751 
752 			if (is_downstream_to_pci_bridge(dev, tmp))
753 				goto got_pdev;
754 		}
755 
756 		if (pdev && drhd->include_all) {
757 got_pdev:
758 			if (bus && devfn) {
759 				*bus = pdev->bus->number;
760 				*devfn = pdev->devfn;
761 			}
762 			goto out;
763 		}
764 	}
765 	iommu = NULL;
766 out:
767 	if (iommu_is_dummy(iommu, dev))
768 		iommu = NULL;
769 
770 	rcu_read_unlock();
771 
772 	return iommu;
773 }
774 
775 static void domain_flush_cache(struct dmar_domain *domain,
776 			       void *addr, int size)
777 {
778 	if (!domain->iommu_coherency)
779 		clflush_cache_range(addr, size);
780 }
781 
782 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784 	struct context_entry *context;
785 	int ret = 0;
786 
787 	spin_lock(&iommu->lock);
788 	context = iommu_context_addr(iommu, bus, devfn, 0);
789 	if (context)
790 		ret = context_present(context);
791 	spin_unlock(&iommu->lock);
792 	return ret;
793 }
794 
795 static void free_context_table(struct intel_iommu *iommu)
796 {
797 	struct context_entry *context;
798 	int i;
799 
800 	if (!iommu->root_entry)
801 		return;
802 
803 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
804 		context = iommu_context_addr(iommu, i, 0, 0);
805 		if (context)
806 			free_pgtable_page(context);
807 
808 		if (!sm_supported(iommu))
809 			continue;
810 
811 		context = iommu_context_addr(iommu, i, 0x80, 0);
812 		if (context)
813 			free_pgtable_page(context);
814 	}
815 
816 	free_pgtable_page(iommu->root_entry);
817 	iommu->root_entry = NULL;
818 }
819 
820 #ifdef CONFIG_DMAR_DEBUG
821 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
822 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824 	struct dma_pte *pte;
825 	int offset;
826 
827 	while (1) {
828 		offset = pfn_level_offset(pfn, level);
829 		pte = &parent[offset];
830 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831 			pr_info("PTE not present at level %d\n", level);
832 			break;
833 		}
834 
835 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836 
837 		if (level == 1)
838 			break;
839 
840 		parent = phys_to_virt(dma_pte_addr(pte));
841 		level--;
842 	}
843 }
844 
845 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
846 			  unsigned long long addr, u32 pasid)
847 {
848 	struct pasid_dir_entry *dir, *pde;
849 	struct pasid_entry *entries, *pte;
850 	struct context_entry *ctx_entry;
851 	struct root_entry *rt_entry;
852 	int i, dir_index, index, level;
853 	u8 devfn = source_id & 0xff;
854 	u8 bus = source_id >> 8;
855 	struct dma_pte *pgtable;
856 
857 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858 
859 	/* root entry dump */
860 	rt_entry = &iommu->root_entry[bus];
861 	if (!rt_entry) {
862 		pr_info("root table entry is not present\n");
863 		return;
864 	}
865 
866 	if (sm_supported(iommu))
867 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868 			rt_entry->hi, rt_entry->lo);
869 	else
870 		pr_info("root entry: 0x%016llx", rt_entry->lo);
871 
872 	/* context entry dump */
873 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874 	if (!ctx_entry) {
875 		pr_info("context table entry is not present\n");
876 		return;
877 	}
878 
879 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880 		ctx_entry->hi, ctx_entry->lo);
881 
882 	/* legacy mode does not require PASID entries */
883 	if (!sm_supported(iommu)) {
884 		level = agaw_to_level(ctx_entry->hi & 7);
885 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886 		goto pgtable_walk;
887 	}
888 
889 	/* get the pointer to pasid directory entry */
890 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891 	if (!dir) {
892 		pr_info("pasid directory entry is not present\n");
893 		return;
894 	}
895 	/* For request-without-pasid, get the pasid from context entry */
896 	if (intel_iommu_sm && pasid == INVALID_IOASID)
897 		pasid = PASID_RID2PASID;
898 
899 	dir_index = pasid >> PASID_PDE_SHIFT;
900 	pde = &dir[dir_index];
901 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902 
903 	/* get the pointer to the pasid table entry */
904 	entries = get_pasid_table_from_pde(pde);
905 	if (!entries) {
906 		pr_info("pasid table entry is not present\n");
907 		return;
908 	}
909 	index = pasid & PASID_PTE_MASK;
910 	pte = &entries[index];
911 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 				      unsigned long pfn, int *target_level)
929 {
930 	struct dma_pte *parent, *pte;
931 	int level = agaw_to_level(domain->agaw);
932 	int offset;
933 
934 	BUG_ON(!domain->pgd);
935 
936 	if (!domain_pfn_supported(domain, pfn))
937 		/* Address beyond IOMMU's addressing capabilities. */
938 		return NULL;
939 
940 	parent = domain->pgd;
941 
942 	while (1) {
943 		void *tmp_page;
944 
945 		offset = pfn_level_offset(pfn, level);
946 		pte = &parent[offset];
947 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948 			break;
949 		if (level == *target_level)
950 			break;
951 
952 		if (!dma_pte_present(pte)) {
953 			uint64_t pteval;
954 
955 			tmp_page = alloc_pgtable_page(domain->nid);
956 
957 			if (!tmp_page)
958 				return NULL;
959 
960 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962 			if (domain_use_first_level(domain)) {
963 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
964 				if (iommu_is_dma_domain(&domain->domain))
965 					pteval |= DMA_FL_PTE_ACCESS;
966 			}
967 			if (cmpxchg64(&pte->val, 0ULL, pteval))
968 				/* Someone else set it while we were thinking; use theirs. */
969 				free_pgtable_page(tmp_page);
970 			else
971 				domain_flush_cache(domain, pte, sizeof(*pte));
972 		}
973 		if (level == 1)
974 			break;
975 
976 		parent = phys_to_virt(dma_pte_addr(pte));
977 		level--;
978 	}
979 
980 	if (!*target_level)
981 		*target_level = level;
982 
983 	return pte;
984 }
985 
986 /* return address's pte at specific level */
987 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
988 					 unsigned long pfn,
989 					 int level, int *large_page)
990 {
991 	struct dma_pte *parent, *pte;
992 	int total = agaw_to_level(domain->agaw);
993 	int offset;
994 
995 	parent = domain->pgd;
996 	while (level <= total) {
997 		offset = pfn_level_offset(pfn, total);
998 		pte = &parent[offset];
999 		if (level == total)
1000 			return pte;
1001 
1002 		if (!dma_pte_present(pte)) {
1003 			*large_page = total;
1004 			break;
1005 		}
1006 
1007 		if (dma_pte_superpage(pte)) {
1008 			*large_page = total;
1009 			return pte;
1010 		}
1011 
1012 		parent = phys_to_virt(dma_pte_addr(pte));
1013 		total--;
1014 	}
1015 	return NULL;
1016 }
1017 
1018 /* clear last level pte, a tlb flush should be followed */
1019 static void dma_pte_clear_range(struct dmar_domain *domain,
1020 				unsigned long start_pfn,
1021 				unsigned long last_pfn)
1022 {
1023 	unsigned int large_page;
1024 	struct dma_pte *first_pte, *pte;
1025 
1026 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1027 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1028 	BUG_ON(start_pfn > last_pfn);
1029 
1030 	/* we don't need lock here; nobody else touches the iova range */
1031 	do {
1032 		large_page = 1;
1033 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1034 		if (!pte) {
1035 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1036 			continue;
1037 		}
1038 		do {
1039 			dma_clear_pte(pte);
1040 			start_pfn += lvl_to_nr_pages(large_page);
1041 			pte++;
1042 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1043 
1044 		domain_flush_cache(domain, first_pte,
1045 				   (void *)pte - (void *)first_pte);
1046 
1047 	} while (start_pfn && start_pfn <= last_pfn);
1048 }
1049 
1050 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1051 			       int retain_level, struct dma_pte *pte,
1052 			       unsigned long pfn, unsigned long start_pfn,
1053 			       unsigned long last_pfn)
1054 {
1055 	pfn = max(start_pfn, pfn);
1056 	pte = &pte[pfn_level_offset(pfn, level)];
1057 
1058 	do {
1059 		unsigned long level_pfn;
1060 		struct dma_pte *level_pte;
1061 
1062 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1063 			goto next;
1064 
1065 		level_pfn = pfn & level_mask(level);
1066 		level_pte = phys_to_virt(dma_pte_addr(pte));
1067 
1068 		if (level > 2) {
1069 			dma_pte_free_level(domain, level - 1, retain_level,
1070 					   level_pte, level_pfn, start_pfn,
1071 					   last_pfn);
1072 		}
1073 
1074 		/*
1075 		 * Free the page table if we're below the level we want to
1076 		 * retain and the range covers the entire table.
1077 		 */
1078 		if (level < retain_level && !(start_pfn > level_pfn ||
1079 		      last_pfn < level_pfn + level_size(level) - 1)) {
1080 			dma_clear_pte(pte);
1081 			domain_flush_cache(domain, pte, sizeof(*pte));
1082 			free_pgtable_page(level_pte);
1083 		}
1084 next:
1085 		pfn += level_size(level);
1086 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1087 }
1088 
1089 /*
1090  * clear last level (leaf) ptes and free page table pages below the
1091  * level we wish to keep intact.
1092  */
1093 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1094 				   unsigned long start_pfn,
1095 				   unsigned long last_pfn,
1096 				   int retain_level)
1097 {
1098 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1099 
1100 	/* We don't need lock here; nobody else touches the iova range */
1101 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1102 			   domain->pgd, 0, start_pfn, last_pfn);
1103 
1104 	/* free pgd */
1105 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1106 		free_pgtable_page(domain->pgd);
1107 		domain->pgd = NULL;
1108 	}
1109 }
1110 
1111 /* When a page at a given level is being unlinked from its parent, we don't
1112    need to *modify* it at all. All we need to do is make a list of all the
1113    pages which can be freed just as soon as we've flushed the IOTLB and we
1114    know the hardware page-walk will no longer touch them.
1115    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1116    be freed. */
1117 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1118 				    int level, struct dma_pte *pte,
1119 				    struct list_head *freelist)
1120 {
1121 	struct page *pg;
1122 
1123 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1124 	list_add_tail(&pg->lru, freelist);
1125 
1126 	if (level == 1)
1127 		return;
1128 
1129 	pte = page_address(pg);
1130 	do {
1131 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1132 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1133 		pte++;
1134 	} while (!first_pte_in_page(pte));
1135 }
1136 
1137 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1138 				struct dma_pte *pte, unsigned long pfn,
1139 				unsigned long start_pfn, unsigned long last_pfn,
1140 				struct list_head *freelist)
1141 {
1142 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1143 
1144 	pfn = max(start_pfn, pfn);
1145 	pte = &pte[pfn_level_offset(pfn, level)];
1146 
1147 	do {
1148 		unsigned long level_pfn = pfn & level_mask(level);
1149 
1150 		if (!dma_pte_present(pte))
1151 			goto next;
1152 
1153 		/* If range covers entire pagetable, free it */
1154 		if (start_pfn <= level_pfn &&
1155 		    last_pfn >= level_pfn + level_size(level) - 1) {
1156 			/* These subordinate page tables are going away entirely. Don't
1157 			   bother to clear them; we're just going to *free* them. */
1158 			if (level > 1 && !dma_pte_superpage(pte))
1159 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1160 
1161 			dma_clear_pte(pte);
1162 			if (!first_pte)
1163 				first_pte = pte;
1164 			last_pte = pte;
1165 		} else if (level > 1) {
1166 			/* Recurse down into a level that isn't *entirely* obsolete */
1167 			dma_pte_clear_level(domain, level - 1,
1168 					    phys_to_virt(dma_pte_addr(pte)),
1169 					    level_pfn, start_pfn, last_pfn,
1170 					    freelist);
1171 		}
1172 next:
1173 		pfn = level_pfn + level_size(level);
1174 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1175 
1176 	if (first_pte)
1177 		domain_flush_cache(domain, first_pte,
1178 				   (void *)++last_pte - (void *)first_pte);
1179 }
1180 
1181 /* We can't just free the pages because the IOMMU may still be walking
1182    the page tables, and may have cached the intermediate levels. The
1183    pages can only be freed after the IOTLB flush has been done. */
1184 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1185 			 unsigned long last_pfn, struct list_head *freelist)
1186 {
1187 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1188 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1189 	BUG_ON(start_pfn > last_pfn);
1190 
1191 	/* we don't need lock here; nobody else touches the iova range */
1192 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1193 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1194 
1195 	/* free pgd */
1196 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1197 		struct page *pgd_page = virt_to_page(domain->pgd);
1198 		list_add_tail(&pgd_page->lru, freelist);
1199 		domain->pgd = NULL;
1200 	}
1201 }
1202 
1203 /* iommu handling */
1204 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1205 {
1206 	struct root_entry *root;
1207 
1208 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1209 	if (!root) {
1210 		pr_err("Allocating root entry for %s failed\n",
1211 			iommu->name);
1212 		return -ENOMEM;
1213 	}
1214 
1215 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1216 	iommu->root_entry = root;
1217 
1218 	return 0;
1219 }
1220 
1221 static void iommu_set_root_entry(struct intel_iommu *iommu)
1222 {
1223 	u64 addr;
1224 	u32 sts;
1225 	unsigned long flag;
1226 
1227 	addr = virt_to_phys(iommu->root_entry);
1228 	if (sm_supported(iommu))
1229 		addr |= DMA_RTADDR_SMT;
1230 
1231 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1233 
1234 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1235 
1236 	/* Make sure hardware complete it */
1237 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238 		      readl, (sts & DMA_GSTS_RTPS), sts);
1239 
1240 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241 
1242 	/*
1243 	 * Hardware invalidates all DMA remapping hardware translation
1244 	 * caches as part of SRTP flow.
1245 	 */
1246 	if (cap_esrtps(iommu->cap))
1247 		return;
1248 
1249 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1250 	if (sm_supported(iommu))
1251 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1252 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1253 }
1254 
1255 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1256 {
1257 	u32 val;
1258 	unsigned long flag;
1259 
1260 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1261 		return;
1262 
1263 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1264 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1265 
1266 	/* Make sure hardware complete it */
1267 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1268 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1269 
1270 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1271 }
1272 
1273 /* The return value determines if we need a write buffer flush */
1274 static void __iommu_flush_context(struct intel_iommu *iommu,
1275 				  u16 did, u16 source_id, u8 function_mask,
1276 				  u64 type)
1277 {
1278 	u64 val = 0;
1279 	unsigned long flag;
1280 
1281 	switch (type) {
1282 	case DMA_CCMD_GLOBAL_INVL:
1283 		val = DMA_CCMD_GLOBAL_INVL;
1284 		break;
1285 	case DMA_CCMD_DOMAIN_INVL:
1286 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1287 		break;
1288 	case DMA_CCMD_DEVICE_INVL:
1289 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1290 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1291 		break;
1292 	default:
1293 		BUG();
1294 	}
1295 	val |= DMA_CCMD_ICC;
1296 
1297 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1298 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1299 
1300 	/* Make sure hardware complete it */
1301 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1302 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1303 
1304 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1305 }
1306 
1307 /* The return value determines if we need a write buffer flush */
1308 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1309 				u64 addr, unsigned int size_order, u64 type)
1310 {
1311 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1312 	u64 val = 0, val_iva = 0;
1313 	unsigned long flag;
1314 
1315 	switch (type) {
1316 	case DMA_TLB_GLOBAL_FLUSH:
1317 		/* global flush doesn't need to set IVA_REG */
1318 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1319 		break;
1320 	case DMA_TLB_DSI_FLUSH:
1321 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1322 		break;
1323 	case DMA_TLB_PSI_FLUSH:
1324 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1325 		/* IH bit is passed in as part of address */
1326 		val_iva = size_order | addr;
1327 		break;
1328 	default:
1329 		BUG();
1330 	}
1331 	/* Note: set drain read/write */
1332 #if 0
1333 	/*
1334 	 * This is probably just to be extra safe. Looks like we can
1335 	 * ignore it without any impact.
1336 	 */
1337 	if (cap_read_drain(iommu->cap))
1338 		val |= DMA_TLB_READ_DRAIN;
1339 #endif
1340 	if (cap_write_drain(iommu->cap))
1341 		val |= DMA_TLB_WRITE_DRAIN;
1342 
1343 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1344 	/* Note: Only uses first TLB reg currently */
1345 	if (val_iva)
1346 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1347 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1348 
1349 	/* Make sure hardware complete it */
1350 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1351 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1352 
1353 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1354 
1355 	/* check IOTLB invalidation granularity */
1356 	if (DMA_TLB_IAIG(val) == 0)
1357 		pr_err("Flush IOTLB failed\n");
1358 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1359 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1360 			(unsigned long long)DMA_TLB_IIRG(type),
1361 			(unsigned long long)DMA_TLB_IAIG(val));
1362 }
1363 
1364 static struct device_domain_info *
1365 domain_lookup_dev_info(struct dmar_domain *domain,
1366 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1367 {
1368 	struct device_domain_info *info;
1369 	unsigned long flags;
1370 
1371 	spin_lock_irqsave(&domain->lock, flags);
1372 	list_for_each_entry(info, &domain->devices, link) {
1373 		if (info->iommu == iommu && info->bus == bus &&
1374 		    info->devfn == devfn) {
1375 			spin_unlock_irqrestore(&domain->lock, flags);
1376 			return info;
1377 		}
1378 	}
1379 	spin_unlock_irqrestore(&domain->lock, flags);
1380 
1381 	return NULL;
1382 }
1383 
1384 static void domain_update_iotlb(struct dmar_domain *domain)
1385 {
1386 	struct device_domain_info *info;
1387 	bool has_iotlb_device = false;
1388 	unsigned long flags;
1389 
1390 	spin_lock_irqsave(&domain->lock, flags);
1391 	list_for_each_entry(info, &domain->devices, link) {
1392 		if (info->ats_enabled) {
1393 			has_iotlb_device = true;
1394 			break;
1395 		}
1396 	}
1397 	domain->has_iotlb_device = has_iotlb_device;
1398 	spin_unlock_irqrestore(&domain->lock, flags);
1399 }
1400 
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1402 {
1403 	struct pci_dev *pdev;
1404 
1405 	if (!info || !dev_is_pci(info->dev))
1406 		return;
1407 
1408 	pdev = to_pci_dev(info->dev);
1409 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1410 	 * the PFSID to the invalidation desc of a VF so that the IOMMU HW can
1411 	 * gauge queue depth at the PF level. If DIT is not set, PFSID is
1412 	 * treated as reserved and should be set to 0.
1413 	 */
1414 	if (!ecap_dit(info->iommu->ecap))
1415 		info->pfsid = 0;
1416 	else {
1417 		struct pci_dev *pf_pdev;
1418 
1419 		/* pdev will be returned if device is not a vf */
1420 		pf_pdev = pci_physfn(pdev);
1421 		info->pfsid = pci_dev_id(pf_pdev);
1422 	}
1423 
1424 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1425 	   the device if you enable PASID support after ATS support is
1426 	   undefined. So always enable PASID support on devices which
1427 	   have it, even if we can't yet know if we're ever going to
1428 	   use it. */
1429 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430 		info->pasid_enabled = 1;
1431 
1432 	if (info->pri_supported &&
1433 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1434 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435 		info->pri_enabled = 1;
1436 
1437 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439 		info->ats_enabled = 1;
1440 		domain_update_iotlb(info->domain);
1441 		info->ats_qdep = pci_ats_queue_depth(pdev);
1442 	}
1443 }
1444 
1445 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1446 {
1447 	struct pci_dev *pdev;
1448 
1449 	if (!dev_is_pci(info->dev))
1450 		return;
1451 
1452 	pdev = to_pci_dev(info->dev);
1453 
1454 	if (info->ats_enabled) {
1455 		pci_disable_ats(pdev);
1456 		info->ats_enabled = 0;
1457 		domain_update_iotlb(info->domain);
1458 	}
1459 
1460 	if (info->pri_enabled) {
1461 		pci_disable_pri(pdev);
1462 		info->pri_enabled = 0;
1463 	}
1464 
1465 	if (info->pasid_enabled) {
1466 		pci_disable_pasid(pdev);
1467 		info->pasid_enabled = 0;
1468 	}
1469 }
1470 
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472 				    u64 addr, unsigned int mask)
1473 {
1474 	u16 sid, qdep;
1475 
1476 	if (!info || !info->ats_enabled)
1477 		return;
1478 
1479 	sid = info->bus << 8 | info->devfn;
1480 	qdep = info->ats_qdep;
1481 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1482 			   qdep, addr, mask);
1483 }
1484 
1485 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 				  u64 addr, unsigned mask)
1487 {
1488 	struct device_domain_info *info;
1489 	unsigned long flags;
1490 
1491 	if (!domain->has_iotlb_device)
1492 		return;
1493 
1494 	spin_lock_irqsave(&domain->lock, flags);
1495 	list_for_each_entry(info, &domain->devices, link)
1496 		__iommu_flush_dev_iotlb(info, addr, mask);
1497 	spin_unlock_irqrestore(&domain->lock, flags);
1498 }
1499 
1500 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 				  struct dmar_domain *domain,
1502 				  unsigned long pfn, unsigned int pages,
1503 				  int ih, int map)
1504 {
1505 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 	unsigned int mask = ilog2(aligned_pages);
1507 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 	u16 did = domain_id_iommu(domain, iommu);
1509 
1510 	BUG_ON(pages == 0);
1511 
1512 	if (ih)
1513 		ih = 1 << 6;
1514 
1515 	if (domain_use_first_level(domain)) {
1516 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 	} else {
1518 		unsigned long bitmask = aligned_pages - 1;
1519 
1520 		/*
1521 		 * PSI masks the low order bits of the base address. If the
1522 		 * address isn't aligned to the mask, then compute a mask value
1523 		 * needed to ensure the target range is flushed.
1524 		 */
1525 		if (unlikely(bitmask & pfn)) {
1526 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527 
1528 			/*
1529 			 * Since end_pfn <= pfn + bitmask, the only way bits
1530 			 * higher than bitmask can differ in pfn and end_pfn is
1531 			 * by carrying. This means after masking out bitmask,
1532 			 * high bits starting with the first set bit in
1533 			 * shared_bits are all equal in both pfn and end_pfn.
1534 			 */
1535 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 		}
1538 
1539 		/*
1540 		 * Fallback to domain selective flush if no PSI support or
1541 		 * the size is too big.
1542 		 */
1543 		if (!cap_pgsel_inv(iommu->cap) ||
1544 		    mask > cap_max_amask_val(iommu->cap))
1545 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 							DMA_TLB_DSI_FLUSH);
1547 		else
1548 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 							DMA_TLB_PSI_FLUSH);
1550 	}
1551 
1552 	/*
1553 	 * In caching mode, changing pages from non-present to present requires
1554 	 * a flush. However, device IOTLB doesn't need to be flushed in this case.
1555 	 */
1556 	if (!cap_caching_mode(iommu->cap) || !map)
1557 		iommu_flush_dev_iotlb(domain, addr, mask);
1558 }
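/*
 * Worked example of the unaligned-PSI fixup above (illustrative numbers):
 * flushing pages == 4 starting at pfn == 0x103 gives aligned_pages == 4,
 * bitmask == 0x3 and an unaligned base. end_pfn == 0x106, so
 * pfn ^ end_pfn == 0x5 and shared_bits == ~0x5 & ~0x3, whose lowest set
 * bit is 3. The flush is therefore issued with mask == 3, covering the
 * aligned block 0x100-0x107, which contains the whole target range.
 */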
1559 
1560 /* Notification for newly created mappings */
1561 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 					struct dmar_domain *domain,
1563 					unsigned long pfn, unsigned int pages)
1564 {
1565 	/*
1566 	 * It's a non-present to present mapping. Only flush if caching mode
1567 	 * and second level.
1568 	 */
1569 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1570 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 	else
1572 		iommu_flush_write_buffer(iommu);
1573 }
1574 
1575 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576 {
1577 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 	struct iommu_domain_info *info;
1579 	unsigned long idx;
1580 
1581 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 		struct intel_iommu *iommu = info->iommu;
1583 		u16 did = domain_id_iommu(dmar_domain, iommu);
1584 
1585 		if (domain_use_first_level(dmar_domain))
1586 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 		else
1588 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 						 DMA_TLB_DSI_FLUSH);
1590 
1591 		if (!cap_caching_mode(iommu->cap))
1592 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 	}
1594 }
1595 
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598 	u32 pmen;
1599 	unsigned long flags;
1600 
1601 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 		return;
1603 
1604 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 	pmen &= ~DMA_PMEN_EPM;
1607 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608 
1609 	/* wait for the protected region status bit to clear */
1610 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1612 
1613 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615 
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618 	u32 sts;
1619 	unsigned long flags;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 	iommu->gcmd |= DMA_GCMD_TE;
1623 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624 
1625 	/* Make sure hardware complete it */
1626 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 		      readl, (sts & DMA_GSTS_TES), sts);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631 
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634 	u32 sts;
1635 	unsigned long flag;
1636 
1637 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 		return;
1640 
1641 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 	iommu->gcmd &= ~DMA_GCMD_TE;
1643 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644 
1645 	/* Make sure hardware complete it */
1646 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1648 
1649 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 }
1651 
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654 	u32 ndomains;
1655 
1656 	ndomains = cap_ndoms(iommu->cap);
1657 	pr_debug("%s: Number of Domains supported <%d>\n",
1658 		 iommu->name, ndomains);
1659 
1660 	spin_lock_init(&iommu->lock);
1661 
1662 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 	if (!iommu->domain_ids)
1664 		return -ENOMEM;
1665 
1666 	/*
1667 	 * If Caching mode is set, then invalid translations are tagged
1668 	 * with domain-id 0, hence we need to pre-allocate it. We also
1669 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 	 * make sure it is not used for a real domain.
1671 	 */
1672 	set_bit(0, iommu->domain_ids);
1673 
1674 	/*
1675 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1676 	 * entry for first-level or pass-through translation modes should
1677 	 * be programmed with a domain id different from those used for
1678 	 * second-level or nested translation. We reserve a domain id for
1679 	 * this purpose.
1680 	 */
1681 	if (sm_supported(iommu))
1682 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683 
1684 	return 0;
1685 }
1686 
1687 static void disable_dmar_iommu(struct intel_iommu *iommu)
1688 {
1689 	if (!iommu->domain_ids)
1690 		return;
1691 
1692 	/*
1693 	 * All iommu domains must have been detached from the devices,
1694 	 * hence there should be no domain IDs in use.
1695 	 */
1696 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 		    > NUM_RESERVED_DID))
1698 		return;
1699 
1700 	if (iommu->gcmd & DMA_GCMD_TE)
1701 		iommu_disable_translation(iommu);
1702 }
1703 
1704 static void free_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706 	if (iommu->domain_ids) {
1707 		bitmap_free(iommu->domain_ids);
1708 		iommu->domain_ids = NULL;
1709 	}
1710 
1711 	if (iommu->copied_tables) {
1712 		bitmap_free(iommu->copied_tables);
1713 		iommu->copied_tables = NULL;
1714 	}
1715 
1716 	/* free context mapping */
1717 	free_context_table(iommu);
1718 
1719 #ifdef CONFIG_INTEL_IOMMU_SVM
1720 	if (pasid_supported(iommu)) {
1721 		if (ecap_prs(iommu->ecap))
1722 			intel_svm_finish_prq(iommu);
1723 	}
1724 	if (vccap_pasid(iommu->vccap))
1725 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1726 
1727 #endif
1728 }
1729 
1730 /*
1731  * Check and return whether first level is used by default for
1732  * DMA translation.
1733  */
1734 static bool first_level_by_default(unsigned int type)
1735 {
1736 	/* Only SL is available in legacy mode */
1737 	if (!scalable_mode_support())
1738 		return false;
1739 
1740 	/* Only one level (either FL or SL) is available, just use it */
1741 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 		return intel_cap_flts_sanity();
1743 
1744 	/* Both levels are available, decide it based on domain type */
1745 	return type != IOMMU_DOMAIN_UNMANAGED;
1746 }
1747 
1748 static struct dmar_domain *alloc_domain(unsigned int type)
1749 {
1750 	struct dmar_domain *domain;
1751 
1752 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 	if (!domain)
1754 		return NULL;
1755 
1756 	domain->nid = NUMA_NO_NODE;
1757 	if (first_level_by_default(type))
1758 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1759 	domain->has_iotlb_device = false;
1760 	INIT_LIST_HEAD(&domain->devices);
1761 	spin_lock_init(&domain->lock);
1762 	xa_init(&domain->iommu_array);
1763 
1764 	return domain;
1765 }
1766 
1767 static int domain_attach_iommu(struct dmar_domain *domain,
1768 			       struct intel_iommu *iommu)
1769 {
1770 	struct iommu_domain_info *info, *curr;
1771 	unsigned long ndomains;
1772 	int num, ret = -ENOSPC;
1773 
1774 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 	if (!info)
1776 		return -ENOMEM;
1777 
1778 	spin_lock(&iommu->lock);
1779 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 	if (curr) {
1781 		curr->refcnt++;
1782 		spin_unlock(&iommu->lock);
1783 		kfree(info);
1784 		return 0;
1785 	}
1786 
1787 	ndomains = cap_ndoms(iommu->cap);
1788 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 	if (num >= ndomains) {
1790 		pr_err("%s: No free domain ids\n", iommu->name);
1791 		goto err_unlock;
1792 	}
1793 
1794 	set_bit(num, iommu->domain_ids);
1795 	info->refcnt	= 1;
1796 	info->did	= num;
1797 	info->iommu	= iommu;
1798 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 			  NULL, info, GFP_ATOMIC);
1800 	if (curr) {
1801 		ret = xa_err(curr) ? : -EBUSY;
1802 		goto err_clear;
1803 	}
1804 	domain_update_iommu_cap(domain);
1805 
1806 	spin_unlock(&iommu->lock);
1807 	return 0;
1808 
1809 err_clear:
1810 	clear_bit(info->did, iommu->domain_ids);
1811 err_unlock:
1812 	spin_unlock(&iommu->lock);
1813 	kfree(info);
1814 	return ret;
1815 }
1816 
1817 static void domain_detach_iommu(struct dmar_domain *domain,
1818 				struct intel_iommu *iommu)
1819 {
1820 	struct iommu_domain_info *info;
1821 
1822 	spin_lock(&iommu->lock);
1823 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 	if (--info->refcnt == 0) {
1825 		clear_bit(info->did, iommu->domain_ids);
1826 		xa_erase(&domain->iommu_array, iommu->seq_id);
1827 		domain->nid = NUMA_NO_NODE;
1828 		domain_update_iommu_cap(domain);
1829 		kfree(info);
1830 	}
1831 	spin_unlock(&iommu->lock);
1832 }
1833 
1834 static inline int guestwidth_to_adjustwidth(int gaw)
1835 {
1836 	int agaw;
1837 	int r = (gaw - 12) % 9;
1838 
1839 	if (r == 0)
1840 		agaw = gaw;
1841 	else
1842 		agaw = gaw + 9 - r;
1843 	if (agaw > 64)
1844 		agaw = 64;
1845 	return agaw;
1846 }
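/*
 * guestwidth_to_adjustwidth() rounds a guest address width up to the next
 * width the 9-bit-per-level page table can express (and caps it at 64).
 * For example:
 *
 *   gaw 39 or 48 -> unchanged (exact 3- and 4-level widths)
 *   gaw 40       -> 48 (rounded up to the next level boundary)
 *   gaw 50       -> 57
 */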
1847 
1848 static void domain_exit(struct dmar_domain *domain)
1849 {
1850 	if (domain->pgd) {
1851 		LIST_HEAD(freelist);
1852 
1853 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 		put_pages_list(&freelist);
1855 	}
1856 
1857 	if (WARN_ON(!list_empty(&domain->devices)))
1858 		return;
1859 
1860 	kfree(domain);
1861 }
1862 
1863 /*
1864  * Get the PASID directory size for scalable mode context entry.
1865  * A value of X in the PDTS field of a scalable mode context entry
1866  * indicates a PASID directory with 2^(X + 7) entries.
1867  */
1868 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869 {
1870 	unsigned long pds, max_pde;
1871 
1872 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 	if (pds < 7)
1875 		return 0;
1876 
1877 	return pds - 7;
1878 }
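
/*
 * Editorial sketch (hypothetical helper, not part of this driver),
 * assuming a power-of-two max_pasid: for a 20-bit PASID space, max_pde
 * has only bit (20 - PASID_PDE_SHIFT) set, so the returned PDTS value is
 * that bit position minus 7. With the usual PASID_PDE_SHIFT of 6 this
 * gives PDTS = 7, i.e. a PASID directory of 2^(7 + 7) = 16384 entries.
 */
static void __maybe_unused example_sm_pds(void)
{
	struct pasid_table t = { .max_pasid = 1U << 20 };

	WARN_ON(context_get_sm_pds(&t) != 20 - PASID_PDE_SHIFT - 7);
}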
1879 
1880 /*
1881  * Set the RID_PASID field of a scalable mode context entry. The
1882  * IOMMU hardware will use the PASID value set in this field for
1883  * DMA translations of DMA requests without PASID.
1884  */
1885 static inline void
1886 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887 {
1888 	context->hi |= pasid & ((1 << 20) - 1);
1889 }
1890 
1891 /*
1892  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893  * entry.
1894  */
1895 static inline void context_set_sm_dte(struct context_entry *context)
1896 {
1897 	context->lo |= (1 << 2);
1898 }
1899 
1900 /*
1901  * Set the PRE(Page Request Enable) field of a scalable mode context
1902  * entry.
1903  */
1904 static inline void context_set_sm_pre(struct context_entry *context)
1905 {
1906 	context->lo |= (1 << 4);
1907 }
1908 
1909 /* Convert value to context PASID directory size field coding. */
1910 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1911 
1912 static int domain_context_mapping_one(struct dmar_domain *domain,
1913 				      struct intel_iommu *iommu,
1914 				      struct pasid_table *table,
1915 				      u8 bus, u8 devfn)
1916 {
1917 	struct device_domain_info *info =
1918 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 	u16 did = domain_id_iommu(domain, iommu);
1920 	int translation = CONTEXT_TT_MULTI_LEVEL;
1921 	struct context_entry *context;
1922 	int ret;
1923 
1924 	WARN_ON(did == 0);
1925 
1926 	if (hw_pass_through && domain_type_is_si(domain))
1927 		translation = CONTEXT_TT_PASS_THROUGH;
1928 
1929 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931 
1932 	BUG_ON(!domain->pgd);
1933 
1934 	spin_lock(&iommu->lock);
1935 	ret = -ENOMEM;
1936 	context = iommu_context_addr(iommu, bus, devfn, 1);
1937 	if (!context)
1938 		goto out_unlock;
1939 
1940 	ret = 0;
1941 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 		goto out_unlock;
1943 
1944 	/*
1945 	 * In kdump cases, old valid entries may be cached due to the
1946 	 * in-flight DMA and the copied page tables, and there is no
1947 	 * unmap path for them, so we need an explicit cache flush for
1948 	 * the newly mapped device. At this point the device is supposed
1949 	 * to have completed its reset during driver probe, so no
1950 	 * in-flight DMA will exist, and we don't need to worry about
1951 	 * it hereafter.
1952 	 */
1953 	if (context_copied(iommu, bus, devfn)) {
1954 		u16 did_old = context_domain_id(context);
1955 
1956 		if (did_old < cap_ndoms(iommu->cap)) {
1957 			iommu->flush.flush_context(iommu, did_old,
1958 						   (((u16)bus) << 8) | devfn,
1959 						   DMA_CCMD_MASK_NOBIT,
1960 						   DMA_CCMD_DEVICE_INVL);
1961 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 						 DMA_TLB_DSI_FLUSH);
1963 		}
1964 
1965 		clear_context_copied(iommu, bus, devfn);
1966 	}
1967 
1968 	context_clear_entry(context);
1969 
1970 	if (sm_supported(iommu)) {
1971 		unsigned long pds;
1972 
1973 		WARN_ON(!table);
1974 
1975 		/* Setup the PASID DIR pointer: */
1976 		pds = context_get_sm_pds(table);
1977 		context->lo = (u64)virt_to_phys(table->table) |
1978 				context_pdts(pds);
1979 
1980 		/* Setup the RID_PASID field: */
1981 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982 
1983 		/*
1984 		 * Setup the Device-TLB enable bit and Page request
1985 		 * Enable bit:
1986 		 */
1987 		if (info && info->ats_supported)
1988 			context_set_sm_dte(context);
1989 		if (info && info->pri_supported)
1990 			context_set_sm_pre(context);
1991 		if (info && info->pasid_supported)
1992 			context_set_pasid(context);
1993 	} else {
1994 		struct dma_pte *pgd = domain->pgd;
1995 		int agaw;
1996 
1997 		context_set_domain_id(context, did);
1998 
1999 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 			/*
2001 			 * Skip the top levels of the page tables for an IOMMU whose
2002 			 * agaw is smaller than the domain's default. Unnecessary for PT mode.
2003 			 */
2004 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 				ret = -ENOMEM;
2006 				pgd = phys_to_virt(dma_pte_addr(pgd));
2007 				if (!dma_pte_present(pgd))
2008 					goto out_unlock;
2009 			}
2010 
2011 			if (info && info->ats_supported)
2012 				translation = CONTEXT_TT_DEV_IOTLB;
2013 			else
2014 				translation = CONTEXT_TT_MULTI_LEVEL;
2015 
2016 			context_set_address_root(context, virt_to_phys(pgd));
2017 			context_set_address_width(context, agaw);
2018 		} else {
2019 			/*
2020 			 * In pass through mode, AW must be programmed to
2021 			 * indicate the largest AGAW value supported by
2022 			 * hardware. And ASR is ignored by hardware.
2023 			 */
2024 			context_set_address_width(context, iommu->msagaw);
2025 		}
2026 
2027 		context_set_translation_type(context, translation);
2028 	}
2029 
2030 	context_set_fault_enable(context);
2031 	context_set_present(context);
2032 	if (!ecap_coherent(iommu->ecap))
2033 		clflush_cache_range(context, sizeof(*context));
2034 
2035 	/*
2036 	 * It's a non-present to present mapping. If hardware doesn't cache
2037 	 * non-present entries we only need to flush the write-buffer. If it
2038 	 * _does_ cache non-present entries, then it does so in the special
2039 	 * domain #0, which we have to flush:
2040 	 */
2041 	if (cap_caching_mode(iommu->cap)) {
2042 		iommu->flush.flush_context(iommu, 0,
2043 					   (((u16)bus) << 8) | devfn,
2044 					   DMA_CCMD_MASK_NOBIT,
2045 					   DMA_CCMD_DEVICE_INVL);
2046 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 	} else {
2048 		iommu_flush_write_buffer(iommu);
2049 	}
2050 	iommu_enable_pci_caps(info);
2051 
2052 	ret = 0;
2053 
2054 out_unlock:
2055 	spin_unlock(&iommu->lock);
2056 
2057 	return ret;
2058 }
2059 
2060 struct domain_context_mapping_data {
2061 	struct dmar_domain *domain;
2062 	struct intel_iommu *iommu;
2063 	struct pasid_table *table;
2064 };
2065 
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067 				     u16 alias, void *opaque)
2068 {
2069 	struct domain_context_mapping_data *data = opaque;
2070 
2071 	return domain_context_mapping_one(data->domain, data->iommu,
2072 					  data->table, PCI_BUS_NUM(alias),
2073 					  alias & 0xff);
2074 }
2075 
2076 static int
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2078 {
2079 	struct domain_context_mapping_data data;
2080 	struct pasid_table *table;
2081 	struct intel_iommu *iommu;
2082 	u8 bus, devfn;
2083 
2084 	iommu = device_to_iommu(dev, &bus, &devfn);
2085 	if (!iommu)
2086 		return -ENODEV;
2087 
2088 	table = intel_pasid_get_table(dev);
2089 
2090 	if (!dev_is_pci(dev))
2091 		return domain_context_mapping_one(domain, iommu, table,
2092 						  bus, devfn);
2093 
2094 	data.domain = domain;
2095 	data.iommu = iommu;
2096 	data.table = table;
2097 
2098 	return pci_for_each_dma_alias(to_pci_dev(dev),
2099 				      &domain_context_mapping_cb, &data);
2100 }
2101 
2102 static int domain_context_mapped_cb(struct pci_dev *pdev,
2103 				    u16 alias, void *opaque)
2104 {
2105 	struct intel_iommu *iommu = opaque;
2106 
2107 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2108 }
2109 
2110 static int domain_context_mapped(struct device *dev)
2111 {
2112 	struct intel_iommu *iommu;
2113 	u8 bus, devfn;
2114 
2115 	iommu = device_to_iommu(dev, &bus, &devfn);
2116 	if (!iommu)
2117 		return -ENODEV;
2118 
2119 	if (!dev_is_pci(dev))
2120 		return device_context_mapped(iommu, bus, devfn);
2121 
2122 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2123 				       domain_context_mapped_cb, iommu);
2124 }
2125 
2126 /* Return the number of VT-d pages needed, but aligned to the MM page size */
2127 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2128 					    size_t size)
2129 {
2130 	host_addr &= ~PAGE_MASK;
2131 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2132 }
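
/*
 * Editorial sketch (hypothetical helper, not part of this driver),
 * assuming 4KiB MM and VT-d pages: a 0x2000-byte buffer starting at
 * offset 0x234 into a page straddles three page frames, so three VT-d
 * pages are needed even though the length alone would fit in two.
 */
static void __maybe_unused example_aligned_nrpages(void)
{
	WARN_ON(aligned_nrpages(0x1234, 0x2000) != 3);
}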
2133 
2134 /* Return largest possible superpage level for a given mapping */
2135 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2136 					  unsigned long iov_pfn,
2137 					  unsigned long phy_pfn,
2138 					  unsigned long pages)
2139 {
2140 	int support, level = 1;
2141 	unsigned long pfnmerge;
2142 
2143 	support = domain->iommu_superpage;
2144 
2145 	/* To use a large page, the virtual *and* physical addresses
2146 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2147 	   of them will mean we have to use smaller pages. So just
2148 	   merge them and check both at once. */
2149 	pfnmerge = iov_pfn | phy_pfn;
2150 
2151 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2152 		pages >>= VTD_STRIDE_SHIFT;
2153 		if (!pages)
2154 			break;
2155 		pfnmerge >>= VTD_STRIDE_SHIFT;
2156 		level++;
2157 		support--;
2158 	}
2159 	return level;
2160 }
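
/*
 * Editorial sketch (hypothetical helper, not part of this driver): a
 * domain whose hardware supports 2MiB and 1GiB superpages
 * (iommu_superpage == 2) gets level 2 (a 2MiB superpage) only when both
 * the IOVA and the physical PFN have their low 9 bits clear and at
 * least 512 pages are being mapped; any misalignment in either PFN
 * forces level 1.
 */
static void __maybe_unused example_largepage_caps(void)
{
	struct dmar_domain dom = { .iommu_superpage = 2 };

	WARN_ON(hardware_largepage_caps(&dom, 0x200, 0x400, 512) != 2);
	WARN_ON(hardware_largepage_caps(&dom, 0x201, 0x400, 512) != 1);
}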
2161 
2162 /*
2163  * Ensure that old small page tables are removed to make room for superpage(s).
2164  * We're going to add new large pages, so make sure we don't remove their parent
2165  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2166  */
2167 static void switch_to_super_page(struct dmar_domain *domain,
2168 				 unsigned long start_pfn,
2169 				 unsigned long end_pfn, int level)
2170 {
2171 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2172 	struct iommu_domain_info *info;
2173 	struct dma_pte *pte = NULL;
2174 	unsigned long i;
2175 
2176 	while (start_pfn <= end_pfn) {
2177 		if (!pte)
2178 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2179 
2180 		if (dma_pte_present(pte)) {
2181 			dma_pte_free_pagetable(domain, start_pfn,
2182 					       start_pfn + lvl_pages - 1,
2183 					       level + 1);
2184 
2185 			xa_for_each(&domain->iommu_array, i, info)
2186 				iommu_flush_iotlb_psi(info->iommu, domain,
2187 						      start_pfn, lvl_pages,
2188 						      0, 0);
2189 		}
2190 
2191 		pte++;
2192 		start_pfn += lvl_pages;
2193 		if (first_pte_in_page(pte))
2194 			pte = NULL;
2195 	}
2196 }
2197 
2198 static int
2199 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2200 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2201 {
2202 	struct dma_pte *first_pte = NULL, *pte = NULL;
2203 	unsigned int largepage_lvl = 0;
2204 	unsigned long lvl_pages = 0;
2205 	phys_addr_t pteval;
2206 	u64 attr;
2207 
2208 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2209 
2210 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2211 		return -EINVAL;
2212 
2213 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2214 	attr |= DMA_FL_PTE_PRESENT;
2215 	if (domain_use_first_level(domain)) {
2216 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2217 		if (prot & DMA_PTE_WRITE)
2218 			attr |= DMA_FL_PTE_DIRTY;
2219 	}
2220 
2221 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2222 
2223 	while (nr_pages > 0) {
2224 		uint64_t tmp;
2225 
2226 		if (!pte) {
2227 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2228 					phys_pfn, nr_pages);
2229 
2230 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2231 			if (!pte)
2232 				return -ENOMEM;
2233 			first_pte = pte;
2234 
2235 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2236 
2237 			/* It is a large page */
2238 			if (largepage_lvl > 1) {
2239 				unsigned long end_pfn;
2240 				unsigned long pages_to_remove;
2241 
2242 				pteval |= DMA_PTE_LARGE_PAGE;
2243 				pages_to_remove = min_t(unsigned long, nr_pages,
2244 							nr_pte_to_next_page(pte) * lvl_pages);
2245 				end_pfn = iov_pfn + pages_to_remove - 1;
2246 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2247 			} else {
2248 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2249 			}
2250 
2251 		}
2252 		/* We don't need a lock here; nobody else
2253 		 * touches this IOVA range.
2254 		 */
2255 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2256 		if (tmp) {
2257 			static int dumps = 5;
2258 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2259 				iov_pfn, tmp, (unsigned long long)pteval);
2260 			if (dumps) {
2261 				dumps--;
2262 				debug_dma_dump_mappings(NULL);
2263 			}
2264 			WARN_ON(1);
2265 		}
2266 
2267 		nr_pages -= lvl_pages;
2268 		iov_pfn += lvl_pages;
2269 		phys_pfn += lvl_pages;
2270 		pteval += lvl_pages * VTD_PAGE_SIZE;
2271 
2272 		/* If the next PTE would be the first in a new page, then we
2273 		 * need to flush the cache on the entries we've just written.
2274 		 * And then we'll need to recalculate 'pte', so clear it and
2275 		 * let it get set again in the if (!pte) block above.
2276 		 *
2277 		 * If we're done (!nr_pages) we need to flush the cache too.
2278 		 *
2279 		 * Also if we've been setting superpages, we may need to
2280 		 * recalculate 'pte' and switch back to smaller pages for the
2281 		 * end of the mapping, if the trailing size is not enough to
2282 		 * use another superpage (i.e. nr_pages < lvl_pages).
2283 		 */
2284 		pte++;
2285 		if (!nr_pages || first_pte_in_page(pte) ||
2286 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2287 			domain_flush_cache(domain, first_pte,
2288 					   (void *)pte - (void *)first_pte);
2289 			pte = NULL;
2290 		}
2291 	}
2292 
2293 	return 0;
2294 }
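
/*
 * Editorial sketch (hypothetical helper and addresses, not part of this
 * driver): identity-map 2MiB of read/write memory at 0x40000000. With
 * 4KiB VT-d pages that is 512 aligned page frames, so on
 * superpage-capable hardware __domain_mapping() installs a single 2MiB
 * superpage instead of 512 individual PTEs.
 */
static int __maybe_unused example_map_2mib(struct dmar_domain *domain)
{
	unsigned long pfn = 0x40000000UL >> VTD_PAGE_SHIFT;

	return __domain_mapping(domain, pfn, pfn, 512,
				DMA_PTE_READ | DMA_PTE_WRITE);
}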
2295 
2296 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2297 {
2298 	struct intel_iommu *iommu = info->iommu;
2299 	struct context_entry *context;
2300 	u16 did_old;
2301 
2302 	if (!iommu)
2303 		return;
2304 
2305 	spin_lock(&iommu->lock);
2306 	context = iommu_context_addr(iommu, bus, devfn, 0);
2307 	if (!context) {
2308 		spin_unlock(&iommu->lock);
2309 		return;
2310 	}
2311 
2312 	if (sm_supported(iommu)) {
2313 		if (hw_pass_through && domain_type_is_si(info->domain))
2314 			did_old = FLPT_DEFAULT_DID;
2315 		else
2316 			did_old = domain_id_iommu(info->domain, iommu);
2317 	} else {
2318 		did_old = context_domain_id(context);
2319 	}
2320 
2321 	context_clear_entry(context);
2322 	__iommu_flush_cache(iommu, context, sizeof(*context));
2323 	spin_unlock(&iommu->lock);
2324 	iommu->flush.flush_context(iommu,
2325 				   did_old,
2326 				   (((u16)bus) << 8) | devfn,
2327 				   DMA_CCMD_MASK_NOBIT,
2328 				   DMA_CCMD_DEVICE_INVL);
2329 
2330 	if (sm_supported(iommu))
2331 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2332 
2333 	iommu->flush.flush_iotlb(iommu,
2334 				 did_old,
2335 				 0,
2336 				 0,
2337 				 DMA_TLB_DSI_FLUSH);
2338 
2339 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2340 }
2341 
2342 static int domain_setup_first_level(struct intel_iommu *iommu,
2343 				    struct dmar_domain *domain,
2344 				    struct device *dev,
2345 				    u32 pasid)
2346 {
2347 	struct dma_pte *pgd = domain->pgd;
2348 	int agaw, level;
2349 	int flags = 0;
2350 
2351 	/*
2352 	 * Skip the top levels of the page tables for an IOMMU whose
2353 	 * agaw is smaller than the domain's default. Unnecessary for PT mode.
2354 	 */
2355 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2356 		pgd = phys_to_virt(dma_pte_addr(pgd));
2357 		if (!dma_pte_present(pgd))
2358 			return -ENOMEM;
2359 	}
2360 
2361 	level = agaw_to_level(agaw);
2362 	if (level != 4 && level != 5)
2363 		return -EINVAL;
2364 
2365 	if (pasid != PASID_RID2PASID)
2366 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2367 	if (level == 5)
2368 		flags |= PASID_FLAG_FL5LP;
2369 
2370 	if (domain->force_snooping)
2371 		flags |= PASID_FLAG_PAGE_SNOOP;
2372 
2373 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2374 					     domain_id_iommu(domain, iommu),
2375 					     flags);
2376 }
2377 
2378 static bool dev_is_real_dma_subdevice(struct device *dev)
2379 {
2380 	return dev && dev_is_pci(dev) &&
2381 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2382 }
2383 
2384 static int iommu_domain_identity_map(struct dmar_domain *domain,
2385 				     unsigned long first_vpfn,
2386 				     unsigned long last_vpfn)
2387 {
2388 	/*
2389 	 * The RMRR range might overlap with the physical memory range,
2390 	 * so clear it first.
2391 	 */
2392 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2393 
2394 	return __domain_mapping(domain, first_vpfn,
2395 				first_vpfn, last_vpfn - first_vpfn + 1,
2396 				DMA_PTE_READ|DMA_PTE_WRITE);
2397 }
2398 
2399 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2400 
2401 static int __init si_domain_init(int hw)
2402 {
2403 	struct dmar_rmrr_unit *rmrr;
2404 	struct device *dev;
2405 	int i, nid, ret;
2406 
2407 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2408 	if (!si_domain)
2409 		return -EFAULT;
2410 
2411 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2412 		domain_exit(si_domain);
2413 		si_domain = NULL;
2414 		return -EFAULT;
2415 	}
2416 
2417 	if (hw)
2418 		return 0;
2419 
2420 	for_each_online_node(nid) {
2421 		unsigned long start_pfn, end_pfn;
2422 		int i;
2423 
2424 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2425 			ret = iommu_domain_identity_map(si_domain,
2426 					mm_to_dma_pfn(start_pfn),
2427 					mm_to_dma_pfn(end_pfn));
2428 			if (ret)
2429 				return ret;
2430 		}
2431 	}
2432 
2433 	/*
2434 	 * Identity map the RMRRs so that devices with RMRRs could also use
2435 	 * the si_domain.
2436 	 */
2437 	for_each_rmrr_units(rmrr) {
2438 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2439 					  i, dev) {
2440 			unsigned long long start = rmrr->base_address;
2441 			unsigned long long end = rmrr->end_address;
2442 
2443 			if (WARN_ON(end < start ||
2444 				    end >> agaw_to_width(si_domain->agaw)))
2445 				continue;
2446 
2447 			ret = iommu_domain_identity_map(si_domain,
2448 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2449 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2450 			if (ret)
2451 				return ret;
2452 		}
2453 	}
2454 
2455 	return 0;
2456 }
2457 
2458 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2459 {
2460 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2461 	struct intel_iommu *iommu;
2462 	unsigned long flags;
2463 	u8 bus, devfn;
2464 	int ret;
2465 
2466 	iommu = device_to_iommu(dev, &bus, &devfn);
2467 	if (!iommu)
2468 		return -ENODEV;
2469 
2470 	ret = domain_attach_iommu(domain, iommu);
2471 	if (ret)
2472 		return ret;
2473 	info->domain = domain;
2474 	spin_lock_irqsave(&domain->lock, flags);
2475 	list_add(&info->link, &domain->devices);
2476 	spin_unlock_irqrestore(&domain->lock, flags);
2477 
2478 	/* PASID table is mandatory for a PCI device in scalable mode. */
2479 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2480 		ret = intel_pasid_alloc_table(dev);
2481 		if (ret) {
2482 			dev_err(dev, "PASID table allocation failed\n");
2483 			dmar_remove_one_dev_info(dev);
2484 			return ret;
2485 		}
2486 
2487 		/* Setup the PASID entry for requests without PASID: */
2488 		if (hw_pass_through && domain_type_is_si(domain))
2489 			ret = intel_pasid_setup_pass_through(iommu, domain,
2490 					dev, PASID_RID2PASID);
2491 		else if (domain_use_first_level(domain))
2492 			ret = domain_setup_first_level(iommu, domain, dev,
2493 					PASID_RID2PASID);
2494 		else
2495 			ret = intel_pasid_setup_second_level(iommu, domain,
2496 					dev, PASID_RID2PASID);
2497 		if (ret) {
2498 			dev_err(dev, "Setup RID2PASID failed\n");
2499 			dmar_remove_one_dev_info(dev);
2500 			return ret;
2501 		}
2502 	}
2503 
2504 	ret = domain_context_mapping(domain, dev);
2505 	if (ret) {
2506 		dev_err(dev, "Domain context map failed\n");
2507 		dmar_remove_one_dev_info(dev);
2508 		return ret;
2509 	}
2510 
2511 	return 0;
2512 }
2513 
2514 static bool device_has_rmrr(struct device *dev)
2515 {
2516 	struct dmar_rmrr_unit *rmrr;
2517 	struct device *tmp;
2518 	int i;
2519 
2520 	rcu_read_lock();
2521 	for_each_rmrr_units(rmrr) {
2522 		/*
2523 		 * Return TRUE if this RMRR contains the device that
2524 		 * is passed in.
2525 		 */
2526 		for_each_active_dev_scope(rmrr->devices,
2527 					  rmrr->devices_cnt, i, tmp)
2528 			if (tmp == dev ||
2529 			    is_downstream_to_pci_bridge(dev, tmp)) {
2530 				rcu_read_unlock();
2531 				return true;
2532 			}
2533 	}
2534 	rcu_read_unlock();
2535 	return false;
2536 }
2537 
2538 /**
2539  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2540  * is relaxable (i.e. is allowed to not be enforced under some conditions)
2541  * @dev: device handle
2542  *
2543  * We assume that PCI USB devices with RMRRs have them largely
2544  * for historical reasons and that the RMRR space is not actively used post
2545  * boot.  This exclusion may change if vendors begin to abuse it.
2546  *
2547  * The same exception is made for graphics devices, with the requirement that
2548  * any use of the RMRR regions will be torn down before assigning the device
2549  * to a guest.
2550  *
2551  * Return: true if the RMRR is relaxable, false otherwise
2552  */
2553 static bool device_rmrr_is_relaxable(struct device *dev)
2554 {
2555 	struct pci_dev *pdev;
2556 
2557 	if (!dev_is_pci(dev))
2558 		return false;
2559 
2560 	pdev = to_pci_dev(dev);
2561 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2562 		return true;
2563 	else
2564 		return false;
2565 }
2566 
2567 /*
2568  * There are a couple cases where we need to restrict the functionality of
2569  * devices associated with RMRRs.  The first is when evaluating a device for
2570  * identity mapping because problems exist when devices are moved in and out
2571  * of domains and their respective RMRR information is lost.  This means that
2572  * a device with associated RMRRs will never be in a "passthrough" domain.
2573  * The second is use of the device through the IOMMU API.  This interface
2574  * expects to have full control of the IOVA space for the device.  We cannot
2575  * satisfy both the requirement that RMRR access is maintained and have an
2576  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2577  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2578  * We therefore prevent devices associated with an RMRR from participating in
2579  * the IOMMU API, which eliminates them from device assignment.
2580  *
2581  * In both cases, devices which have relaxable RMRRs are not concerned by this
2582  * restriction. See device_rmrr_is_relaxable comment.
2583  */
2584 static bool device_is_rmrr_locked(struct device *dev)
2585 {
2586 	if (!device_has_rmrr(dev))
2587 		return false;
2588 
2589 	if (device_rmrr_is_relaxable(dev))
2590 		return false;
2591 
2592 	return true;
2593 }
2594 
2595 /*
2596  * Return the required default domain type for a specific device.
2597  *
2598  * @dev: the device to query
2600  *
2601  * Returns:
2602  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2603  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2604  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2605  */
2606 static int device_def_domain_type(struct device *dev)
2607 {
2608 	if (dev_is_pci(dev)) {
2609 		struct pci_dev *pdev = to_pci_dev(dev);
2610 
2611 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2612 			return IOMMU_DOMAIN_IDENTITY;
2613 
2614 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2615 			return IOMMU_DOMAIN_IDENTITY;
2616 	}
2617 
2618 	return 0;
2619 }
2620 
2621 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2622 {
2623 	/*
2624 	 * Start from a sane IOMMU hardware state.
2625 	 * If queued invalidation was already initialized by us
2626 	 * (for example, while enabling interrupt remapping), then
2627 	 * things are already rolling from a sane state.
2628 	 */
2629 	if (!iommu->qi) {
2630 		/*
2631 		 * Clear any previous faults.
2632 		 */
2633 		dmar_fault(-1, iommu);
2634 		/*
2635 		 * Disable queued invalidation if supported and already enabled
2636 		 * before OS handover.
2637 		 */
2638 		dmar_disable_qi(iommu);
2639 	}
2640 
2641 	if (dmar_enable_qi(iommu)) {
2642 		/*
2643 		 * Queued Invalidate not enabled, use Register Based Invalidate
2644 		 */
2645 		iommu->flush.flush_context = __iommu_flush_context;
2646 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2647 		pr_info("%s: Using Register based invalidation\n",
2648 			iommu->name);
2649 	} else {
2650 		iommu->flush.flush_context = qi_flush_context;
2651 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2652 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2653 	}
2654 }
2655 
2656 static int copy_context_table(struct intel_iommu *iommu,
2657 			      struct root_entry *old_re,
2658 			      struct context_entry **tbl,
2659 			      int bus, bool ext)
2660 {
2661 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2662 	struct context_entry *new_ce = NULL, ce;
2663 	struct context_entry *old_ce = NULL;
2664 	struct root_entry re;
2665 	phys_addr_t old_ce_phys;
2666 
2667 	tbl_idx = ext ? bus * 2 : bus;
2668 	memcpy(&re, old_re, sizeof(re));
2669 
2670 	for (devfn = 0; devfn < 256; devfn++) {
2671 		/* First calculate the correct index */
2672 		idx = (ext ? devfn * 2 : devfn) % 256;
2673 
2674 		if (idx == 0) {
2675 			/* First save what we may have and clean up */
2676 			if (new_ce) {
2677 				tbl[tbl_idx] = new_ce;
2678 				__iommu_flush_cache(iommu, new_ce,
2679 						    VTD_PAGE_SIZE);
2680 				pos = 1;
2681 			}
2682 
2683 			if (old_ce)
2684 				memunmap(old_ce);
2685 
2686 			ret = 0;
2687 			if (devfn < 0x80)
2688 				old_ce_phys = root_entry_lctp(&re);
2689 			else
2690 				old_ce_phys = root_entry_uctp(&re);
2691 
2692 			if (!old_ce_phys) {
2693 				if (ext && devfn == 0) {
2694 					/* No LCTP, try UCTP */
2695 					devfn = 0x7f;
2696 					continue;
2697 				} else {
2698 					goto out;
2699 				}
2700 			}
2701 
2702 			ret = -ENOMEM;
2703 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2704 					MEMREMAP_WB);
2705 			if (!old_ce)
2706 				goto out;
2707 
2708 			new_ce = alloc_pgtable_page(iommu->node);
2709 			if (!new_ce)
2710 				goto out_unmap;
2711 
2712 			ret = 0;
2713 		}
2714 
2715 		/* Now copy the context entry */
2716 		memcpy(&ce, old_ce + idx, sizeof(ce));
2717 
2718 		if (!context_present(&ce))
2719 			continue;
2720 
2721 		did = context_domain_id(&ce);
2722 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2723 			set_bit(did, iommu->domain_ids);
2724 
2725 		set_context_copied(iommu, bus, devfn);
2726 		new_ce[idx] = ce;
2727 	}
2728 
2729 	tbl[tbl_idx + pos] = new_ce;
2730 
2731 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2732 
2733 out_unmap:
2734 	memunmap(old_ce);
2735 
2736 out:
2737 	return ret;
2738 }
2739 
2740 static int copy_translation_tables(struct intel_iommu *iommu)
2741 {
2742 	struct context_entry **ctxt_tbls;
2743 	struct root_entry *old_rt;
2744 	phys_addr_t old_rt_phys;
2745 	int ctxt_table_entries;
2746 	u64 rtaddr_reg;
2747 	int bus, ret;
2748 	bool new_ext, ext;
2749 
2750 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2751 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2752 	new_ext    = !!sm_supported(iommu);
2753 
2754 	/*
2755 	 * The RTT bit can only be changed when translation is disabled,
2756 	 * but disabling translation means opening a window for data
2757 	 * corruption. So bail out and don't copy anything if we would
2758 	 * have to change the bit.
2759 	 */
2760 	if (new_ext != ext)
2761 		return -EINVAL;
2762 
2763 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2764 	if (!iommu->copied_tables)
2765 		return -ENOMEM;
2766 
2767 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2768 	if (!old_rt_phys)
2769 		return -EINVAL;
2770 
2771 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2772 	if (!old_rt)
2773 		return -ENOMEM;
2774 
2775 	/* This is too big for the stack - allocate it from slab */
2776 	ctxt_table_entries = ext ? 512 : 256;
2777 	ret = -ENOMEM;
2778 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2779 	if (!ctxt_tbls)
2780 		goto out_unmap;
2781 
2782 	for (bus = 0; bus < 256; bus++) {
2783 		ret = copy_context_table(iommu, &old_rt[bus],
2784 					 ctxt_tbls, bus, ext);
2785 		if (ret) {
2786 			pr_err("%s: Failed to copy context table for bus %d\n",
2787 				iommu->name, bus);
2788 			continue;
2789 		}
2790 	}
2791 
2792 	spin_lock(&iommu->lock);
2793 
2794 	/* Context tables are copied, now write them to the root_entry table */
2795 	for (bus = 0; bus < 256; bus++) {
2796 		int idx = ext ? bus * 2 : bus;
2797 		u64 val;
2798 
2799 		if (ctxt_tbls[idx]) {
2800 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2801 			iommu->root_entry[bus].lo = val;
2802 		}
2803 
2804 		if (!ext || !ctxt_tbls[idx + 1])
2805 			continue;
2806 
2807 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2808 		iommu->root_entry[bus].hi = val;
2809 	}
2810 
2811 	spin_unlock(&iommu->lock);
2812 
2813 	kfree(ctxt_tbls);
2814 
2815 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2816 
2817 	ret = 0;
2818 
2819 out_unmap:
2820 	memunmap(old_rt);
2821 
2822 	return ret;
2823 }
2824 
2825 #ifdef CONFIG_INTEL_IOMMU_SVM
2826 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2827 {
2828 	struct intel_iommu *iommu = data;
2829 	ioasid_t ioasid;
2830 
2831 	if (!iommu)
2832 		return INVALID_IOASID;
2833 	/*
2834 	 * The VT-d virtual command interface always uses the full 20-bit
2835 	 * PASID range. The host can partition the guest PASID range based
2836 	 * on policy, but this is out of the guest's control.
2837 	 */
2838 	if (min < PASID_MIN || max > intel_pasid_max_id)
2839 		return INVALID_IOASID;
2840 
2841 	if (vcmd_alloc_pasid(iommu, &ioasid))
2842 		return INVALID_IOASID;
2843 
2844 	return ioasid;
2845 }
2846 
2847 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2848 {
2849 	struct intel_iommu *iommu = data;
2850 
2851 	if (!iommu)
2852 		return;
2853 	/*
2854 	 * The sanity check of the ioasid owner is done at an upper layer,
2855 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2856 	 */
2857 	if (ioasid_find(NULL, ioasid, NULL)) {
2858 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2859 		return;
2860 	}
2861 	vcmd_free_pasid(iommu, ioasid);
2862 }
2863 
2864 static void register_pasid_allocator(struct intel_iommu *iommu)
2865 {
2866 	/*
2867 	 * If we are running in the host, there is no need for a custom
2868 	 * allocator since PASIDs are allocated system-wide by the host.
2869 	 */
2870 	if (!cap_caching_mode(iommu->cap))
2871 		return;
2872 
2873 	if (!sm_supported(iommu)) {
2874 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2875 		return;
2876 	}
2877 
2878 	/*
2879 	 * Register a custom PASID allocator if we are running in a guest;
2880 	 * guest PASIDs must be obtained via the virtual command interface.
2881 	 * There can be multiple vIOMMUs in each guest but only one
2882 	 * allocator is active. All vIOMMU allocators eventually call the
2883 	 * same host allocator.
2884 	 */
2885 	if (!vccap_pasid(iommu->vccap))
2886 		return;
2887 
2888 	pr_info("Register custom PASID allocator\n");
2889 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2890 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2891 	iommu->pasid_allocator.pdata = (void *)iommu;
2892 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2893 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2894 		/*
2895 		 * Disable scalable mode on this IOMMU if there
2896 		 * is no custom allocator. Mixing SM-capable and
2897 		 * non-SM vIOMMUs is not supported.
2898 		 */
2899 		intel_iommu_sm = 0;
2900 	}
2901 }
2902 #endif
2903 
2904 static int __init init_dmars(void)
2905 {
2906 	struct dmar_drhd_unit *drhd;
2907 	struct intel_iommu *iommu;
2908 	int ret;
2909 
2910 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2911 	if (ret)
2912 		goto free_iommu;
2913 
2914 	for_each_iommu(iommu, drhd) {
2915 		if (drhd->ignored) {
2916 			iommu_disable_translation(iommu);
2917 			continue;
2918 		}
2919 
2920 		/*
2921 		 * Find the max PASID size of all IOMMUs in the system.
2922 		 * We need to ensure the system PASID table is no bigger
2923 		 * than the smallest supported size.
2924 		 */
2925 		if (pasid_supported(iommu)) {
2926 			u32 temp = 2 << ecap_pss(iommu->ecap);
2927 
2928 			intel_pasid_max_id = min_t(u32, temp,
2929 						   intel_pasid_max_id);
2930 		}
2931 
2932 		intel_iommu_init_qi(iommu);
2933 
2934 		ret = iommu_init_domains(iommu);
2935 		if (ret)
2936 			goto free_iommu;
2937 
2938 		init_translation_status(iommu);
2939 
2940 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2941 			iommu_disable_translation(iommu);
2942 			clear_translation_pre_enabled(iommu);
2943 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2944 				iommu->name);
2945 		}
2946 
2947 		/*
2948 		 * TBD:
2949 		 * we could share the same root & context tables
2950 		 * among all IOMMUs. Need to split this later.
2951 		 */
2952 		ret = iommu_alloc_root_entry(iommu);
2953 		if (ret)
2954 			goto free_iommu;
2955 
2956 		if (translation_pre_enabled(iommu)) {
2957 			pr_info("Translation already enabled - trying to copy translation structures\n");
2958 
2959 			ret = copy_translation_tables(iommu);
2960 			if (ret) {
2961 				/*
2962 				 * We found the IOMMU with translation
2963 				 * enabled - but failed to copy over the
2964 				 * old root-entry table. Try to proceed
2965 				 * by disabling translation now and
2966 				 * allocating a clean root-entry table.
2967 				 * This might cause DMAR faults, but
2968 				 * probably the dump will still succeed.
2969 				 */
2970 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2971 				       iommu->name);
2972 				iommu_disable_translation(iommu);
2973 				clear_translation_pre_enabled(iommu);
2974 			} else {
2975 				pr_info("Copied translation tables from previous kernel for %s\n",
2976 					iommu->name);
2977 			}
2978 		}
2979 
2980 		if (!ecap_pass_through(iommu->ecap))
2981 			hw_pass_through = 0;
2982 		intel_svm_check(iommu);
2983 	}
2984 
2985 	/*
2986 	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
2987 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2988 	 * flush_context function will loop forever and the boot hangs.
2989 	 */
2990 	for_each_active_iommu(iommu, drhd) {
2991 		iommu_flush_write_buffer(iommu);
2992 #ifdef CONFIG_INTEL_IOMMU_SVM
2993 		register_pasid_allocator(iommu);
2994 #endif
2995 		iommu_set_root_entry(iommu);
2996 	}
2997 
2998 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2999 	dmar_map_gfx = 0;
3000 #endif
3001 
3002 	if (!dmar_map_gfx)
3003 		iommu_identity_mapping |= IDENTMAP_GFX;
3004 
3005 	check_tylersburg_isoch();
3006 
3007 	ret = si_domain_init(hw_pass_through);
3008 	if (ret)
3009 		goto free_iommu;
3010 
3011 	/*
3012 	 * for each drhd
3013 	 *   enable fault log
3014 	 *   global invalidate context cache
3015 	 *   global invalidate iotlb
3016 	 *   enable translation
3017 	 */
3018 	for_each_iommu(iommu, drhd) {
3019 		if (drhd->ignored) {
3020 			/*
3021 			 * we always have to disable PMRs or DMA may fail on
3022 			 * this device
3023 			 */
3024 			if (force_on)
3025 				iommu_disable_protect_mem_regions(iommu);
3026 			continue;
3027 		}
3028 
3029 		iommu_flush_write_buffer(iommu);
3030 
3031 #ifdef CONFIG_INTEL_IOMMU_SVM
3032 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3033 			/*
3034 			 * Calling dmar_alloc_hwirq() with dmar_global_lock
3035 			 * held could cause a lock race condition.
3036 			 */
3037 			up_write(&dmar_global_lock);
3038 			ret = intel_svm_enable_prq(iommu);
3039 			down_write(&dmar_global_lock);
3040 			if (ret)
3041 				goto free_iommu;
3042 		}
3043 #endif
3044 		ret = dmar_set_interrupt(iommu);
3045 		if (ret)
3046 			goto free_iommu;
3047 	}
3048 
3049 	return 0;
3050 
3051 free_iommu:
3052 	for_each_active_iommu(iommu, drhd) {
3053 		disable_dmar_iommu(iommu);
3054 		free_dmar_iommu(iommu);
3055 	}
3056 	if (si_domain) {
3057 		domain_exit(si_domain);
3058 		si_domain = NULL;
3059 	}
3060 
3061 	return ret;
3062 }
3063 
3064 static void __init init_no_remapping_devices(void)
3065 {
3066 	struct dmar_drhd_unit *drhd;
3067 	struct device *dev;
3068 	int i;
3069 
3070 	for_each_drhd_unit(drhd) {
3071 		if (!drhd->include_all) {
3072 			for_each_active_dev_scope(drhd->devices,
3073 						  drhd->devices_cnt, i, dev)
3074 				break;
3075 			/* ignore DMAR unit if no devices exist */
3076 			if (i == drhd->devices_cnt)
3077 				drhd->ignored = 1;
3078 		}
3079 	}
3080 
3081 	for_each_active_drhd_unit(drhd) {
3082 		if (drhd->include_all)
3083 			continue;
3084 
3085 		for_each_active_dev_scope(drhd->devices,
3086 					  drhd->devices_cnt, i, dev)
3087 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3088 				break;
3089 		if (i < drhd->devices_cnt)
3090 			continue;
3091 
3092 		/* This IOMMU has *only* gfx devices. Either bypass it or
3093 		   set the gfx_dedicated flag, as appropriate */
3094 		drhd->gfx_dedicated = 1;
3095 		if (!dmar_map_gfx)
3096 			drhd->ignored = 1;
3097 	}
3098 }
3099 
3100 #ifdef CONFIG_SUSPEND
3101 static int init_iommu_hw(void)
3102 {
3103 	struct dmar_drhd_unit *drhd;
3104 	struct intel_iommu *iommu = NULL;
3105 
3106 	for_each_active_iommu(iommu, drhd)
3107 		if (iommu->qi)
3108 			dmar_reenable_qi(iommu);
3109 
3110 	for_each_iommu(iommu, drhd) {
3111 		if (drhd->ignored) {
3112 			/*
3113 			 * we always have to disable PMRs or DMA may fail on
3114 			 * this device
3115 			 */
3116 			if (force_on)
3117 				iommu_disable_protect_mem_regions(iommu);
3118 			continue;
3119 		}
3120 
3121 		iommu_flush_write_buffer(iommu);
3122 		iommu_set_root_entry(iommu);
3123 		iommu_enable_translation(iommu);
3124 		iommu_disable_protect_mem_regions(iommu);
3125 	}
3126 
3127 	return 0;
3128 }
3129 
3130 static void iommu_flush_all(void)
3131 {
3132 	struct dmar_drhd_unit *drhd;
3133 	struct intel_iommu *iommu;
3134 
3135 	for_each_active_iommu(iommu, drhd) {
3136 		iommu->flush.flush_context(iommu, 0, 0, 0,
3137 					   DMA_CCMD_GLOBAL_INVL);
3138 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3139 					 DMA_TLB_GLOBAL_FLUSH);
3140 	}
3141 }
3142 
3143 static int iommu_suspend(void)
3144 {
3145 	struct dmar_drhd_unit *drhd;
3146 	struct intel_iommu *iommu = NULL;
3147 	unsigned long flag;
3148 
3149 	for_each_active_iommu(iommu, drhd) {
3150 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3151 					     GFP_KERNEL);
3152 		if (!iommu->iommu_state)
3153 			goto nomem;
3154 	}
3155 
3156 	iommu_flush_all();
3157 
3158 	for_each_active_iommu(iommu, drhd) {
3159 		iommu_disable_translation(iommu);
3160 
3161 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3162 
3163 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3164 			readl(iommu->reg + DMAR_FECTL_REG);
3165 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3166 			readl(iommu->reg + DMAR_FEDATA_REG);
3167 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3168 			readl(iommu->reg + DMAR_FEADDR_REG);
3169 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3170 			readl(iommu->reg + DMAR_FEUADDR_REG);
3171 
3172 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3173 	}
3174 	return 0;
3175 
3176 nomem:
3177 	for_each_active_iommu(iommu, drhd)
3178 		kfree(iommu->iommu_state);
3179 
3180 	return -ENOMEM;
3181 }
3182 
3183 static void iommu_resume(void)
3184 {
3185 	struct dmar_drhd_unit *drhd;
3186 	struct intel_iommu *iommu = NULL;
3187 	unsigned long flag;
3188 
3189 	if (init_iommu_hw()) {
3190 		if (force_on)
3191 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3192 		else
3193 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3194 		return;
3195 	}
3196 
3197 	for_each_active_iommu(iommu, drhd) {
3198 
3199 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3200 
3201 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3202 			iommu->reg + DMAR_FECTL_REG);
3203 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3204 			iommu->reg + DMAR_FEDATA_REG);
3205 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3206 			iommu->reg + DMAR_FEADDR_REG);
3207 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3208 			iommu->reg + DMAR_FEUADDR_REG);
3209 
3210 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3211 	}
3212 
3213 	for_each_active_iommu(iommu, drhd)
3214 		kfree(iommu->iommu_state);
3215 }
3216 
3217 static struct syscore_ops iommu_syscore_ops = {
3218 	.resume		= iommu_resume,
3219 	.suspend	= iommu_suspend,
3220 };
3221 
3222 static void __init init_iommu_pm_ops(void)
3223 {
3224 	register_syscore_ops(&iommu_syscore_ops);
3225 }
3226 
3227 #else
3228 static inline void init_iommu_pm_ops(void) {}
3229 #endif	/* CONFIG_SUSPEND */
3230 
3231 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3232 {
3233 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3234 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3235 	    rmrr->end_address <= rmrr->base_address ||
3236 	    arch_rmrr_sanity_check(rmrr))
3237 		return -EINVAL;
3238 
3239 	return 0;
3240 }
3241 
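
/*
 * Editorial sketch (hypothetical addresses, not part of this driver):
 * the generic checks above require a page-aligned base, a range that
 * covers whole pages (end + 1 page aligned) and an end strictly above
 * the base; arch_rmrr_sanity_check() adds architecture-specific checks
 * on top of these. For example, on a 4KiB-page system:
 */
static void __maybe_unused example_rmrr_alignment(void)
{
	u64 base = 0xdf800000, end = 0xdfffffff;

	WARN_ON(!IS_ALIGNED(base, PAGE_SIZE));
	WARN_ON(!IS_ALIGNED(end + 1, PAGE_SIZE));
	WARN_ON(end <= base);
}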
3242 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3243 {
3244 	struct acpi_dmar_reserved_memory *rmrr;
3245 	struct dmar_rmrr_unit *rmrru;
3246 
3247 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3248 	if (rmrr_sanity_check(rmrr)) {
3249 		pr_warn(FW_BUG
3250 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3251 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3252 			   rmrr->base_address, rmrr->end_address,
3253 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3254 			   dmi_get_system_info(DMI_BIOS_VERSION),
3255 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3256 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3257 	}
3258 
3259 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3260 	if (!rmrru)
3261 		goto out;
3262 
3263 	rmrru->hdr = header;
3264 
3265 	rmrru->base_address = rmrr->base_address;
3266 	rmrru->end_address = rmrr->end_address;
3267 
3268 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3269 				((void *)rmrr) + rmrr->header.length,
3270 				&rmrru->devices_cnt);
3271 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3272 		goto free_rmrru;
3273 
3274 	list_add(&rmrru->list, &dmar_rmrr_units);
3275 
3276 	return 0;
3277 free_rmrru:
3278 	kfree(rmrru);
3279 out:
3280 	return -ENOMEM;
3281 }
3282 
3283 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3284 {
3285 	struct dmar_atsr_unit *atsru;
3286 	struct acpi_dmar_atsr *tmp;
3287 
3288 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3289 				dmar_rcu_check()) {
3290 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3291 		if (atsr->segment != tmp->segment)
3292 			continue;
3293 		if (atsr->header.length != tmp->header.length)
3294 			continue;
3295 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3296 			return atsru;
3297 	}
3298 
3299 	return NULL;
3300 }
3301 
3302 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3303 {
3304 	struct acpi_dmar_atsr *atsr;
3305 	struct dmar_atsr_unit *atsru;
3306 
3307 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3308 		return 0;
3309 
3310 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3311 	atsru = dmar_find_atsr(atsr);
3312 	if (atsru)
3313 		return 0;
3314 
3315 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3316 	if (!atsru)
3317 		return -ENOMEM;
3318 
3319 	/*
3320 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3321 	 * copy the memory content because the memory buffer will be freed
3322 	 * on return.
3323 	 */
3324 	atsru->hdr = (void *)(atsru + 1);
3325 	memcpy(atsru->hdr, hdr, hdr->length);
3326 	atsru->include_all = atsr->flags & 0x1;
3327 	if (!atsru->include_all) {
3328 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3329 				(void *)atsr + atsr->header.length,
3330 				&atsru->devices_cnt);
3331 		if (atsru->devices_cnt && atsru->devices == NULL) {
3332 			kfree(atsru);
3333 			return -ENOMEM;
3334 		}
3335 	}
3336 
3337 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3338 
3339 	return 0;
3340 }
3341 
3342 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3343 {
3344 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3345 	kfree(atsru);
3346 }
3347 
3348 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3349 {
3350 	struct acpi_dmar_atsr *atsr;
3351 	struct dmar_atsr_unit *atsru;
3352 
3353 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3354 	atsru = dmar_find_atsr(atsr);
3355 	if (atsru) {
3356 		list_del_rcu(&atsru->list);
3357 		synchronize_rcu();
3358 		intel_iommu_free_atsr(atsru);
3359 	}
3360 
3361 	return 0;
3362 }
3363 
3364 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3365 {
3366 	int i;
3367 	struct device *dev;
3368 	struct acpi_dmar_atsr *atsr;
3369 	struct dmar_atsr_unit *atsru;
3370 
3371 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3372 	atsru = dmar_find_atsr(atsr);
3373 	if (!atsru)
3374 		return 0;
3375 
3376 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3377 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3378 					  i, dev)
3379 			return -EBUSY;
3380 	}
3381 
3382 	return 0;
3383 }
3384 
3385 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3386 {
3387 	struct dmar_satc_unit *satcu;
3388 	struct acpi_dmar_satc *tmp;
3389 
3390 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3391 				dmar_rcu_check()) {
3392 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3393 		if (satc->segment != tmp->segment)
3394 			continue;
3395 		if (satc->header.length != tmp->header.length)
3396 			continue;
3397 		if (memcmp(satc, tmp, satc->header.length) == 0)
3398 			return satcu;
3399 	}
3400 
3401 	return NULL;
3402 }
3403 
3404 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3405 {
3406 	struct acpi_dmar_satc *satc;
3407 	struct dmar_satc_unit *satcu;
3408 
3409 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3410 		return 0;
3411 
3412 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3413 	satcu = dmar_find_satc(satc);
3414 	if (satcu)
3415 		return 0;
3416 
3417 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3418 	if (!satcu)
3419 		return -ENOMEM;
3420 
3421 	satcu->hdr = (void *)(satcu + 1);
3422 	memcpy(satcu->hdr, hdr, hdr->length);
3423 	satcu->atc_required = satc->flags & 0x1;
3424 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3425 					      (void *)satc + satc->header.length,
3426 					      &satcu->devices_cnt);
3427 	if (satcu->devices_cnt && !satcu->devices) {
3428 		kfree(satcu);
3429 		return -ENOMEM;
3430 	}
3431 	list_add_rcu(&satcu->list, &dmar_satc_units);
3432 
3433 	return 0;
3434 }
3435 
3436 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3437 {
3438 	int sp, ret;
3439 	struct intel_iommu *iommu = dmaru->iommu;
3440 
3441 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3442 	if (ret)
3443 		goto out;
3444 
3445 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3446 		pr_warn("%s: Doesn't support hardware pass through.\n",
3447 			iommu->name);
3448 		return -ENXIO;
3449 	}
3450 
3451 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3452 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3453 		pr_warn("%s: Doesn't support large page.\n",
3454 			iommu->name);
3455 		return -ENXIO;
3456 	}
3457 
3458 	/*
3459 	 * Disable translation if already enabled prior to OS handover.
3460 	 */
3461 	if (iommu->gcmd & DMA_GCMD_TE)
3462 		iommu_disable_translation(iommu);
3463 
3464 	ret = iommu_init_domains(iommu);
3465 	if (ret == 0)
3466 		ret = iommu_alloc_root_entry(iommu);
3467 	if (ret)
3468 		goto out;
3469 
3470 	intel_svm_check(iommu);
3471 
3472 	if (dmaru->ignored) {
3473 		/*
3474 		 * we always have to disable PMRs or DMA may fail on this device
3475 		 */
3476 		if (force_on)
3477 			iommu_disable_protect_mem_regions(iommu);
3478 		return 0;
3479 	}
3480 
3481 	intel_iommu_init_qi(iommu);
3482 	iommu_flush_write_buffer(iommu);
3483 
3484 #ifdef CONFIG_INTEL_IOMMU_SVM
3485 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3486 		ret = intel_svm_enable_prq(iommu);
3487 		if (ret)
3488 			goto disable_iommu;
3489 	}
3490 #endif
3491 	ret = dmar_set_interrupt(iommu);
3492 	if (ret)
3493 		goto disable_iommu;
3494 
3495 	iommu_set_root_entry(iommu);
3496 	iommu_enable_translation(iommu);
3497 
3498 	iommu_disable_protect_mem_regions(iommu);
3499 	return 0;
3500 
3501 disable_iommu:
3502 	disable_dmar_iommu(iommu);
3503 out:
3504 	free_dmar_iommu(iommu);
3505 	return ret;
3506 }
3507 
3508 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3509 {
3510 	int ret = 0;
3511 	struct intel_iommu *iommu = dmaru->iommu;
3512 
3513 	if (!intel_iommu_enabled)
3514 		return 0;
3515 	if (iommu == NULL)
3516 		return -EINVAL;
3517 
3518 	if (insert) {
3519 		ret = intel_iommu_add(dmaru);
3520 	} else {
3521 		disable_dmar_iommu(iommu);
3522 		free_dmar_iommu(iommu);
3523 	}
3524 
3525 	return ret;
3526 }
3527 
3528 static void intel_iommu_free_dmars(void)
3529 {
3530 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3531 	struct dmar_atsr_unit *atsru, *atsr_n;
3532 	struct dmar_satc_unit *satcu, *satc_n;
3533 
3534 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3535 		list_del(&rmrru->list);
3536 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3537 		kfree(rmrru);
3538 	}
3539 
3540 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3541 		list_del(&atsru->list);
3542 		intel_iommu_free_atsr(atsru);
3543 	}
3544 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3545 		list_del(&satcu->list);
3546 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3547 		kfree(satcu);
3548 	}
3549 }
3550 
3551 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3552 {
3553 	struct dmar_satc_unit *satcu;
3554 	struct acpi_dmar_satc *satc;
3555 	struct device *tmp;
3556 	int i;
3557 
3558 	dev = pci_physfn(dev);
3559 	rcu_read_lock();
3560 
3561 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3562 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3563 		if (satc->segment != pci_domain_nr(dev->bus))
3564 			continue;
3565 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3566 			if (to_pci_dev(tmp) == dev)
3567 				goto out;
3568 	}
3569 	satcu = NULL;
3570 out:
3571 	rcu_read_unlock();
3572 	return satcu;
3573 }
3574 
3575 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3576 {
3577 	int i, ret = 1;
3578 	struct pci_bus *bus;
3579 	struct pci_dev *bridge = NULL;
3580 	struct device *tmp;
3581 	struct acpi_dmar_atsr *atsr;
3582 	struct dmar_atsr_unit *atsru;
3583 	struct dmar_satc_unit *satcu;
3584 
3585 	dev = pci_physfn(dev);
3586 	satcu = dmar_find_matched_satc_unit(dev);
3587 	if (satcu)
3588 		/*
3589 		 * This device supports ATS as it is in the SATC table.
3590 		 * When the IOMMU is in legacy mode, enabling ATS is done
3591 		 * automatically by the hardware for devices that require
3592 		 * ATS, so the OS should not enable ATS on this device,
3593 		 * to avoid duplicated TLB invalidations.
3594 		 */
3595 		return !(satcu->atc_required && !sm_supported(iommu));
3596 
3597 	for (bus = dev->bus; bus; bus = bus->parent) {
3598 		bridge = bus->self;
3599 		/* If it's an integrated device, allow ATS */
3600 		if (!bridge)
3601 			return 1;
3602 		/* Connected via non-PCIe: no ATS */
3603 		if (!pci_is_pcie(bridge) ||
3604 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3605 			return 0;
3606 		/* If we found the root port, look it up in the ATSR */
3607 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3608 			break;
3609 	}
3610 
3611 	rcu_read_lock();
3612 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3613 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3614 		if (atsr->segment != pci_domain_nr(dev->bus))
3615 			continue;
3616 
3617 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3618 			if (tmp == &bridge->dev)
3619 				goto out;
3620 
3621 		if (atsru->include_all)
3622 			goto out;
3623 	}
3624 	ret = 0;
3625 out:
3626 	rcu_read_unlock();
3627 
3628 	return ret;
3629 }
3630 
3631 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3632 {
3633 	int ret;
3634 	struct dmar_rmrr_unit *rmrru;
3635 	struct dmar_atsr_unit *atsru;
3636 	struct dmar_satc_unit *satcu;
3637 	struct acpi_dmar_atsr *atsr;
3638 	struct acpi_dmar_reserved_memory *rmrr;
3639 	struct acpi_dmar_satc *satc;
3640 
3641 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3642 		return 0;
3643 
3644 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3645 		rmrr = container_of(rmrru->hdr,
3646 				    struct acpi_dmar_reserved_memory, header);
3647 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3648 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3649 				((void *)rmrr) + rmrr->header.length,
3650 				rmrr->segment, rmrru->devices,
3651 				rmrru->devices_cnt);
3652 			if (ret < 0)
3653 				return ret;
3654 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3655 			dmar_remove_dev_scope(info, rmrr->segment,
3656 				rmrru->devices, rmrru->devices_cnt);
3657 		}
3658 	}
3659 
3660 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3661 		if (atsru->include_all)
3662 			continue;
3663 
3664 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3665 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3666 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3667 					(void *)atsr + atsr->header.length,
3668 					atsr->segment, atsru->devices,
3669 					atsru->devices_cnt);
3670 			if (ret > 0)
3671 				break;
3672 			else if (ret < 0)
3673 				return ret;
3674 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3675 			if (dmar_remove_dev_scope(info, atsr->segment,
3676 					atsru->devices, atsru->devices_cnt))
3677 				break;
3678 		}
3679 	}
3680 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3681 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3682 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3683 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3684 					(void *)satc + satc->header.length,
3685 					satc->segment, satcu->devices,
3686 					satcu->devices_cnt);
3687 			if (ret > 0)
3688 				break;
3689 			else if (ret < 0)
3690 				return ret;
3691 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3692 			if (dmar_remove_dev_scope(info, satc->segment,
3693 					satcu->devices, satcu->devices_cnt))
3694 				break;
3695 		}
3696 	}
3697 
3698 	return 0;
3699 }
3700 
3701 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3702 				       unsigned long val, void *v)
3703 {
3704 	struct memory_notify *mhp = v;
3705 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3706 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3707 			mhp->nr_pages - 1);
3708 
3709 	switch (val) {
3710 	case MEM_GOING_ONLINE:
3711 		if (iommu_domain_identity_map(si_domain,
3712 					      start_vpfn, last_vpfn)) {
3713 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3714 				start_vpfn, last_vpfn);
3715 			return NOTIFY_BAD;
3716 		}
3717 		break;
3718 
3719 	case MEM_OFFLINE:
3720 	case MEM_CANCEL_ONLINE:
3721 		{
3722 			struct dmar_drhd_unit *drhd;
3723 			struct intel_iommu *iommu;
3724 			LIST_HEAD(freelist);
3725 
3726 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3727 
3728 			rcu_read_lock();
3729 			for_each_active_iommu(iommu, drhd)
3730 				iommu_flush_iotlb_psi(iommu, si_domain,
3731 					start_vpfn, mhp->nr_pages,
3732 					list_empty(&freelist), 0);
3733 			rcu_read_unlock();
3734 			put_pages_list(&freelist);
3735 		}
3736 		break;
3737 	}
3738 
3739 	return NOTIFY_OK;
3740 }
3741 
3742 static struct notifier_block intel_iommu_memory_nb = {
3743 	.notifier_call = intel_iommu_memory_notifier,
3744 	.priority = 0
3745 };
3746 
3747 static void intel_disable_iommus(void)
3748 {
3749 	struct intel_iommu *iommu = NULL;
3750 	struct dmar_drhd_unit *drhd;
3751 
3752 	for_each_iommu(iommu, drhd)
3753 		iommu_disable_translation(iommu);
3754 }
3755 
3756 void intel_iommu_shutdown(void)
3757 {
3758 	struct dmar_drhd_unit *drhd;
3759 	struct intel_iommu *iommu = NULL;
3760 
3761 	if (no_iommu || dmar_disabled)
3762 		return;
3763 
3764 	down_write(&dmar_global_lock);
3765 
3766 	/* Disable PMRs explicitly here. */
3767 	for_each_iommu(iommu, drhd)
3768 		iommu_disable_protect_mem_regions(iommu);
3769 
3770 	/* Make sure the IOMMUs are switched off */
3771 	intel_disable_iommus();
3772 
3773 	up_write(&dmar_global_lock);
3774 }
3775 
3776 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3777 {
3778 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3779 
3780 	return container_of(iommu_dev, struct intel_iommu, iommu);
3781 }
3782 
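/*
 * Per-IOMMU sysfs attributes.  The read-only attributes below (version,
 * address, cap, ecap, domains_supported, domains_used) are grouped under
 * the "intel-iommu" directory and registered for each unit via
 * iommu_device_sysfs_add() in intel_iommu_init(), so they typically show
 * up as /sys/class/iommu/dmar<N>/intel-iommu/<attribute>.
 */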
3783 static ssize_t version_show(struct device *dev,
3784 			    struct device_attribute *attr, char *buf)
3785 {
3786 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3787 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3788 	return sprintf(buf, "%d:%d\n",
3789 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3790 }
3791 static DEVICE_ATTR_RO(version);
3792 
3793 static ssize_t address_show(struct device *dev,
3794 			    struct device_attribute *attr, char *buf)
3795 {
3796 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3797 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3798 }
3799 static DEVICE_ATTR_RO(address);
3800 
3801 static ssize_t cap_show(struct device *dev,
3802 			struct device_attribute *attr, char *buf)
3803 {
3804 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3805 	return sprintf(buf, "%llx\n", iommu->cap);
3806 }
3807 static DEVICE_ATTR_RO(cap);
3808 
3809 static ssize_t ecap_show(struct device *dev,
3810 			 struct device_attribute *attr, char *buf)
3811 {
3812 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3813 	return sprintf(buf, "%llx\n", iommu->ecap);
3814 }
3815 static DEVICE_ATTR_RO(ecap);
3816 
3817 static ssize_t domains_supported_show(struct device *dev,
3818 				      struct device_attribute *attr, char *buf)
3819 {
3820 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3821 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3822 }
3823 static DEVICE_ATTR_RO(domains_supported);
3824 
3825 static ssize_t domains_used_show(struct device *dev,
3826 				 struct device_attribute *attr, char *buf)
3827 {
3828 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3829 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3830 						  cap_ndoms(iommu->cap)));
3831 }
3832 static DEVICE_ATTR_RO(domains_used);
3833 
3834 static struct attribute *intel_iommu_attrs[] = {
3835 	&dev_attr_version.attr,
3836 	&dev_attr_address.attr,
3837 	&dev_attr_cap.attr,
3838 	&dev_attr_ecap.attr,
3839 	&dev_attr_domains_supported.attr,
3840 	&dev_attr_domains_used.attr,
3841 	NULL,
3842 };
3843 
3844 static struct attribute_group intel_iommu_group = {
3845 	.name = "intel-iommu",
3846 	.attrs = intel_iommu_attrs,
3847 };
3848 
3849 const struct attribute_group *intel_iommu_groups[] = {
3850 	&intel_iommu_group,
3851 	NULL,
3852 };
3853 
3854 static inline bool has_external_pci(void)
3855 {
3856 	struct pci_dev *pdev = NULL;
3857 
3858 	for_each_pci_dev(pdev)
3859 		if (pdev->external_facing)
3860 			return true;
3861 
3862 	return false;
3863 }
3864 
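/*
 * Honour the DMAR platform opt-in flag for kernel DMA protection: if the
 * firmware opted in, the user did not override it and at least one
 * external-facing PCI port exists, force the IOMMU on even when it was
 * disabled on the command line.  If it had been disabled, fall back to a
 * passthrough default domain for everything except untrusted devices.
 */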
3865 static int __init platform_optin_force_iommu(void)
3866 {
3867 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3868 		return 0;
3869 
3870 	if (no_iommu || dmar_disabled)
3871 		pr_info("Intel-IOMMU force enabled due to platform opt-in\n");
3872 
3873 	/*
3874 	 * If Intel-IOMMU is disabled by default, we will apply identity
3875 	 * map for all devices except those marked as being untrusted.
3876 	 */
3877 	if (dmar_disabled)
3878 		iommu_set_default_passthrough(false);
3879 
3880 	dmar_disabled = 0;
3881 	no_iommu = 0;
3882 
3883 	return 1;
3884 }
3885 
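/*
 * Walk the ACPI namespace devices listed in each DRHD device scope and
 * probe their physical companion devices that are not yet attached to an
 * IOMMU group, so ACPI-enumerated devices behind a DMAR unit get IOMMU
 * ops as well.
 */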
3886 static int __init probe_acpi_namespace_devices(void)
3887 {
3888 	struct dmar_drhd_unit *drhd;
3889 	/* To avoid a -Wunused-but-set-variable warning. */
3890 	struct intel_iommu *iommu __maybe_unused;
3891 	struct device *dev;
3892 	int i, ret = 0;
3893 
3894 	for_each_active_iommu(iommu, drhd) {
3895 		for_each_active_dev_scope(drhd->devices,
3896 					  drhd->devices_cnt, i, dev) {
3897 			struct acpi_device_physical_node *pn;
3898 			struct iommu_group *group;
3899 			struct acpi_device *adev;
3900 
3901 			if (dev->bus != &acpi_bus_type)
3902 				continue;
3903 
3904 			adev = to_acpi_device(dev);
3905 			mutex_lock(&adev->physical_node_lock);
3906 			list_for_each_entry(pn,
3907 					    &adev->physical_node_list, node) {
3908 				group = iommu_group_get(pn->dev);
3909 				if (group) {
3910 					iommu_group_put(group);
3911 					continue;
3912 				}
3913 
3914 				ret = iommu_probe_device(pn->dev);
3915 				if (ret)
3916 					break;
3917 			}
3918 			mutex_unlock(&adev->physical_node_lock);
3919 
3920 			if (ret)
3921 				return ret;
3922 		}
3923 	}
3924 
3925 	return 0;
3926 }
3927 
3928 static __init int tboot_force_iommu(void)
3929 {
3930 	if (!tboot_enabled())
3931 		return 0;
3932 
3933 	if (no_iommu || dmar_disabled)
3934 		pr_warn("Forcing Intel-IOMMU to be enabled\n");
3935 
3936 	dmar_disabled = 0;
3937 	no_iommu = 0;
3938 
3939 	return 1;
3940 }
3941 
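/*
 * Main VT-d initialisation at boot.  Rough order of events: parse the
 * DMAR table and device scopes, bail out early (switching the hardware
 * off) when the IOMMU is disabled, set up the DMAR units via
 * init_dmars(), register sysfs entries and the iommu devices, hook the
 * memory hotplug notifier for the identity domain, probe ACPI namespace
 * devices, and finally enable translation on every unit that was not
 * already enabled by the previous kernel or firmware.
 */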
3942 int __init intel_iommu_init(void)
3943 {
3944 	int ret = -ENODEV;
3945 	struct dmar_drhd_unit *drhd;
3946 	struct intel_iommu *iommu;
3947 
3948 	/*
3949 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3950 	 * opt-in, so enforce that.
3951 	 */
3952 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3953 		    platform_optin_force_iommu();
3954 
3955 	down_write(&dmar_global_lock);
3956 	if (dmar_table_init()) {
3957 		if (force_on)
3958 			panic("tboot: Failed to initialize DMAR table\n");
3959 		goto out_free_dmar;
3960 	}
3961 
3962 	if (dmar_dev_scope_init() < 0) {
3963 		if (force_on)
3964 			panic("tboot: Failed to initialize DMAR device scope\n");
3965 		goto out_free_dmar;
3966 	}
3967 
3968 	up_write(&dmar_global_lock);
3969 
3970 	/*
3971 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3972 	 * complain later when we register it under the lock.
3973 	 */
3974 	dmar_register_bus_notifier();
3975 
3976 	down_write(&dmar_global_lock);
3977 
3978 	if (!no_iommu)
3979 		intel_iommu_debugfs_init();
3980 
3981 	if (no_iommu || dmar_disabled) {
3982 		/*
3983 		 * We exit the function here to ensure IOMMU's remapping and
3984 		 * mempool aren't set up, which means that the IOMMU's PMRs
3985 		 * won't be disabled via the call to init_dmars(). So disable
3986 		 * it explicitly here. The PMRs were set up by tboot prior to
3987 		 * calling SENTER, but the kernel is expected to reset/tear
3988 		 * down the PMRs.
3989 		 */
3990 		if (intel_iommu_tboot_noforce) {
3991 			for_each_iommu(iommu, drhd)
3992 				iommu_disable_protect_mem_regions(iommu);
3993 		}
3994 
3995 		/*
3996 		 * Make sure the IOMMUs are switched off, even when we
3997 		 * boot into a kexec kernel and the previous kernel left
3998 		 * them enabled.
3999 		 */
4000 		intel_disable_iommus();
4001 		goto out_free_dmar;
4002 	}
4003 
4004 	if (list_empty(&dmar_rmrr_units))
4005 		pr_info("No RMRR found\n");
4006 
4007 	if (list_empty(&dmar_atsr_units))
4008 		pr_info("No ATSR found\n");
4009 
4010 	if (list_empty(&dmar_satc_units))
4011 		pr_info("No SATC found\n");
4012 
4013 	init_no_remapping_devices();
4014 
4015 	ret = init_dmars();
4016 	if (ret) {
4017 		if (force_on)
4018 			panic("tboot: Failed to initialize DMARs\n");
4019 		pr_err("Initialization failed\n");
4020 		goto out_free_dmar;
4021 	}
4022 	up_write(&dmar_global_lock);
4023 
4024 	init_iommu_pm_ops();
4025 
4026 	down_read(&dmar_global_lock);
4027 	for_each_active_iommu(iommu, drhd) {
4028 		/*
4029 		 * The flush queue implementation does not perform
4030 		 * page-selective invalidations that are required for efficient
4031 		 * TLB flushes in virtual environments.  The benefit of batching
4032 		 * is likely to be much lower than the overhead of synchronizing
4033 		 * the virtual and physical IOMMU page-tables.
4034 		 */
4035 		if (cap_caching_mode(iommu->cap)) {
4036 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4037 			iommu_set_dma_strict();
4038 		}
4039 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4040 				       intel_iommu_groups,
4041 				       "%s", iommu->name);
4042 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4043 	}
4044 	up_read(&dmar_global_lock);
4045 
4046 	if (si_domain && !hw_pass_through)
4047 		register_memory_notifier(&intel_iommu_memory_nb);
4048 
4049 	down_read(&dmar_global_lock);
4050 	if (probe_acpi_namespace_devices())
4051 		pr_warn("ACPI namespace devices didn't probe correctly\n");
4052 
4053 	/* Finally, we enable the DMA remapping hardware. */
4054 	for_each_iommu(iommu, drhd) {
4055 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4056 			iommu_enable_translation(iommu);
4057 
4058 		iommu_disable_protect_mem_regions(iommu);
4059 	}
4060 	up_read(&dmar_global_lock);
4061 
4062 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4063 
4064 	intel_iommu_enabled = 1;
4065 
4066 	return 0;
4067 
4068 out_free_dmar:
4069 	intel_iommu_free_dmars();
4070 	up_write(&dmar_global_lock);
4071 	return ret;
4072 }
4073 
4074 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4075 {
4076 	struct device_domain_info *info = opaque;
4077 
4078 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4079 	return 0;
4080 }
4081 
4082 /*
4083  * NB - intel-iommu lacks any sort of reference counting for the users of
4084  * dependent devices.  If multiple endpoints have intersecting dependent
4085  * devices, unbinding the driver from any one of them will possibly leave
4086  * the others unable to operate.
4087  */
4088 static void domain_context_clear(struct device_domain_info *info)
4089 {
4090 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4091 		return;
4092 
4093 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4094 			       &domain_context_clear_one_cb, info);
4095 }
4096 
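/*
 * Tear down all per-device DMA remapping state: the RID2PASID entry (in
 * scalable mode), the device IOTLB, the context table entries and the
 * PASID table, then unlink the device from its domain and drop the
 * domain's reference on this IOMMU.
 */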
4097 static void dmar_remove_one_dev_info(struct device *dev)
4098 {
4099 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4100 	struct dmar_domain *domain = info->domain;
4101 	struct intel_iommu *iommu = info->iommu;
4102 	unsigned long flags;
4103 
4104 	if (!dev_is_real_dma_subdevice(info->dev)) {
4105 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4106 			intel_pasid_tear_down_entry(iommu, info->dev,
4107 					PASID_RID2PASID, false);
4108 
4109 		iommu_disable_dev_iotlb(info);
4110 		domain_context_clear(info);
4111 		intel_pasid_free_table(info->dev);
4112 	}
4113 
4114 	spin_lock_irqsave(&domain->lock, flags);
4115 	list_del(&info->link);
4116 	spin_unlock_irqrestore(&domain->lock, flags);
4117 
4118 	domain_detach_iommu(domain, iommu);
4119 	info->domain = NULL;
4120 }
4121 
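/*
 * Finish initialising a domain allocated through the IOMMU API: derive
 * the adjusted address width and AGAW from the requested guest width and
 * allocate the top-level page directory.
 */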
4122 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4123 {
4124 	int adjust_width;
4125 
4126 	/* calculate AGAW */
4127 	domain->gaw = guest_width;
4128 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4129 	domain->agaw = width_to_agaw(adjust_width);
4130 
4131 	domain->iommu_coherency = false;
4132 	domain->iommu_superpage = 0;
4133 	domain->max_addr = 0;
4134 
4135 	/* always allocate the top pgd */
4136 	domain->pgd = alloc_pgtable_page(domain->nid);
4137 	if (!domain->pgd)
4138 		return -ENOMEM;
4139 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4140 	return 0;
4141 }
4142 
4143 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4144 {
4145 	struct dmar_domain *dmar_domain;
4146 	struct iommu_domain *domain;
4147 
4148 	switch (type) {
4149 	case IOMMU_DOMAIN_DMA:
4150 	case IOMMU_DOMAIN_DMA_FQ:
4151 	case IOMMU_DOMAIN_UNMANAGED:
4152 		dmar_domain = alloc_domain(type);
4153 		if (!dmar_domain) {
4154 			pr_err("Can't allocate dmar_domain\n");
4155 			return NULL;
4156 		}
4157 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4158 			pr_err("Domain initialization failed\n");
4159 			domain_exit(dmar_domain);
4160 			return NULL;
4161 		}
4162 
4163 		domain = &dmar_domain->domain;
4164 		domain->geometry.aperture_start = 0;
4165 		domain->geometry.aperture_end   =
4166 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4167 		domain->geometry.force_aperture = true;
4168 
4169 		return domain;
4170 	case IOMMU_DOMAIN_IDENTITY:
4171 		return &si_domain->domain;
4172 	default:
4173 		return NULL;
4174 	}
4175 
4176 	return NULL;
4177 }
4178 
4179 static void intel_iommu_domain_free(struct iommu_domain *domain)
4180 {
4181 	if (domain != &si_domain->domain)
4182 		domain_exit(to_dmar_domain(domain));
4183 }
4184 
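/*
 * Make sure the domain is usable on the device's IOMMU before attaching:
 * the IOMMU must support snoop control if force-snooping is already
 * active, its address width must cover everything mapped so far, and any
 * page-table levels beyond what the IOMMU's AGAW supports are stripped
 * off the top so both sides agree on the page-table depth.
 */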
4185 static int prepare_domain_attach_device(struct iommu_domain *domain,
4186 					struct device *dev)
4187 {
4188 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4189 	struct intel_iommu *iommu;
4190 	int addr_width;
4191 
4192 	iommu = device_to_iommu(dev, NULL, NULL);
4193 	if (!iommu)
4194 		return -ENODEV;
4195 
4196 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4197 		return -EOPNOTSUPP;
4198 
4199 	/* check if this iommu agaw is sufficient for max mapped address */
4200 	addr_width = agaw_to_width(iommu->agaw);
4201 	if (addr_width > cap_mgaw(iommu->cap))
4202 		addr_width = cap_mgaw(iommu->cap);
4203 
4204 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4205 		dev_err(dev,
4206 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4207 			__func__, addr_width, dmar_domain->max_addr);
4208 		return -EFAULT;
4209 	}
4210 	dmar_domain->gaw = addr_width;
4211 
4212 	/*
4213 	 * Knock out extra levels of page tables if necessary
4214 	 */
4215 	while (iommu->agaw < dmar_domain->agaw) {
4216 		struct dma_pte *pte;
4217 
4218 		pte = dmar_domain->pgd;
4219 		if (dma_pte_present(pte)) {
4220 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4221 			free_pgtable_page(pte);
4222 		}
4223 		dmar_domain->agaw--;
4224 	}
4225 
4226 	return 0;
4227 }
4228 
4229 static int intel_iommu_attach_device(struct iommu_domain *domain,
4230 				     struct device *dev)
4231 {
4232 	int ret;
4233 
4234 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4235 	    device_is_rmrr_locked(dev)) {
4236 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4237 		return -EPERM;
4238 	}
4239 
4240 	/* normally dev is not mapped */
4241 	if (unlikely(domain_context_mapped(dev))) {
4242 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4243 
4244 		if (info->domain)
4245 			dmar_remove_one_dev_info(dev);
4246 	}
4247 
4248 	ret = prepare_domain_attach_device(domain, dev);
4249 	if (ret)
4250 		return ret;
4251 
4252 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4253 }
4254 
4255 static void intel_iommu_detach_device(struct iommu_domain *domain,
4256 				      struct device *dev)
4257 {
4258 	dmar_remove_one_dev_info(dev);
4259 }
4260 
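/*
 * Core of the map path: translate IOMMU_READ/IOMMU_WRITE into DMA_PTE
 * bits (plus DMA_PTE_SNP when snooping is enforced via second-level
 * PTEs), refuse mappings that fall outside the domain's address width,
 * then hand the VT-d-page-aligned range to __domain_mapping().
 */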
4261 static int intel_iommu_map(struct iommu_domain *domain,
4262 			   unsigned long iova, phys_addr_t hpa,
4263 			   size_t size, int iommu_prot, gfp_t gfp)
4264 {
4265 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4266 	u64 max_addr;
4267 	int prot = 0;
4268 
4269 	if (iommu_prot & IOMMU_READ)
4270 		prot |= DMA_PTE_READ;
4271 	if (iommu_prot & IOMMU_WRITE)
4272 		prot |= DMA_PTE_WRITE;
4273 	if (dmar_domain->set_pte_snp)
4274 		prot |= DMA_PTE_SNP;
4275 
4276 	max_addr = iova + size;
4277 	if (dmar_domain->max_addr < max_addr) {
4278 		u64 end;
4279 
4280 		/* check if minimum agaw is sufficient for mapped address */
4281 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4282 		if (end < max_addr) {
4283 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4284 			       __func__, dmar_domain->gaw,
4285 			       max_addr);
4286 			return -EFAULT;
4287 		}
4288 		dmar_domain->max_addr = max_addr;
4289 	}
4290 	/* Round up size to next multiple of PAGE_SIZE, if it and
4291 	   the low bits of hpa would take us onto the next page */
4292 	size = aligned_nrpages(hpa, size);
4293 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4294 				hpa >> VTD_PAGE_SHIFT, size, prot);
4295 }
4296 
4297 static int intel_iommu_map_pages(struct iommu_domain *domain,
4298 				 unsigned long iova, phys_addr_t paddr,
4299 				 size_t pgsize, size_t pgcount,
4300 				 int prot, gfp_t gfp, size_t *mapped)
4301 {
4302 	unsigned long pgshift = __ffs(pgsize);
4303 	size_t size = pgcount << pgshift;
4304 	int ret;
4305 
4306 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4307 		return -EINVAL;
4308 
4309 	if (!IS_ALIGNED(iova | paddr, pgsize))
4310 		return -EINVAL;
4311 
4312 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4313 	if (!ret && mapped)
4314 		*mapped = size;
4315 
4316 	return ret;
4317 }
4318 
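/*
 * Unmap may cover more than the requested size when the IOVA sits inside
 * a large-page mapping.  The freed page-table pages are queued on
 * gather->freelist and only released in intel_iommu_tlb_sync() after the
 * IOTLB has been invalidated.
 */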
4319 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4320 				unsigned long iova, size_t size,
4321 				struct iommu_iotlb_gather *gather)
4322 {
4323 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4324 	unsigned long start_pfn, last_pfn;
4325 	int level = 0;
4326 
4327 	/* Cope with horrid API which requires us to unmap more than the
4328 	   size argument if it happens to be a large-page mapping. */
4329 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4330 
4331 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4332 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4333 
4334 	start_pfn = iova >> VTD_PAGE_SHIFT;
4335 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4336 
4337 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4338 
4339 	if (dmar_domain->max_addr == iova + size)
4340 		dmar_domain->max_addr = iova;
4341 
4342 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4343 
4344 	return size;
4345 }
4346 
4347 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4348 				      unsigned long iova,
4349 				      size_t pgsize, size_t pgcount,
4350 				      struct iommu_iotlb_gather *gather)
4351 {
4352 	unsigned long pgshift = __ffs(pgsize);
4353 	size_t size = pgcount << pgshift;
4354 
4355 	return intel_iommu_unmap(domain, iova, size, gather);
4356 }
4357 
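/*
 * Deferred IOTLB flush for the range collected in the gather: issue a
 * page-selective invalidation on every IOMMU this domain is attached to,
 * then free the page-table pages queued by intel_iommu_unmap().
 */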
4358 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4359 				 struct iommu_iotlb_gather *gather)
4360 {
4361 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4362 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4363 	size_t size = gather->end - gather->start;
4364 	struct iommu_domain_info *info;
4365 	unsigned long start_pfn;
4366 	unsigned long nrpages;
4367 	unsigned long i;
4368 
4369 	nrpages = aligned_nrpages(gather->start, size);
4370 	start_pfn = mm_to_dma_pfn(iova_pfn);
4371 
4372 	xa_for_each(&dmar_domain->iommu_array, i, info)
4373 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4374 				      start_pfn, nrpages,
4375 				      list_empty(&gather->freelist), 0);
4376 
4377 	put_pages_list(&gather->freelist);
4378 }
4379 
4380 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4381 					    dma_addr_t iova)
4382 {
4383 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4384 	struct dma_pte *pte;
4385 	int level = 0;
4386 	u64 phys = 0;
4387 
4388 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4389 	if (pte && dma_pte_present(pte))
4390 		phys = dma_pte_addr(pte) +
4391 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4392 						VTD_PAGE_SHIFT) - 1));
4393 
4394 	return phys;
4395 }
4396 
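/*
 * Force-snooping support.  A domain can only enforce cache coherency if
 * every IOMMU with devices in the domain supports snoop control; once
 * that is established, second-level domains simply set the SNP bit in
 * new PTEs, while first-level domains program snoop control into the
 * RID2PASID entry of each attached device.
 */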
4397 static bool domain_support_force_snooping(struct dmar_domain *domain)
4398 {
4399 	struct device_domain_info *info;
4400 	bool support = true;
4401 
4402 	assert_spin_locked(&domain->lock);
4403 	list_for_each_entry(info, &domain->devices, link) {
4404 		if (!ecap_sc_support(info->iommu->ecap)) {
4405 			support = false;
4406 			break;
4407 		}
4408 	}
4409 
4410 	return support;
4411 }
4412 
4413 static void domain_set_force_snooping(struct dmar_domain *domain)
4414 {
4415 	struct device_domain_info *info;
4416 
4417 	assert_spin_locked(&domain->lock);
4418 	/*
4419 	 * Second level page table supports per-PTE snoop control. The
4420 	 * iommu_map() interface will handle this by setting the SNP bit.
4421 	 */
4422 	if (!domain_use_first_level(domain)) {
4423 		domain->set_pte_snp = true;
4424 		return;
4425 	}
4426 
4427 	list_for_each_entry(info, &domain->devices, link)
4428 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4429 						     PASID_RID2PASID);
4430 }
4431 
4432 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4433 {
4434 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4435 	unsigned long flags;
4436 
4437 	if (dmar_domain->force_snooping)
4438 		return true;
4439 
4440 	spin_lock_irqsave(&dmar_domain->lock, flags);
4441 	if (!domain_support_force_snooping(dmar_domain)) {
4442 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4443 		return false;
4444 	}
4445 
4446 	domain_set_force_snooping(dmar_domain);
4447 	dmar_domain->force_snooping = true;
4448 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4449 
4450 	return true;
4451 }
4452 
4453 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4454 {
4455 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4456 		return true;
4457 	if (cap == IOMMU_CAP_INTR_REMAP)
4458 		return irq_remapping_enabled == 1;
4459 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4460 		return dmar_platform_optin();
4461 
4462 	return false;
4463 }
4464 
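/*
 * Per-device setup at probe time: allocate the device_domain_info,
 * record the bus/devfn/segment this device will be programmed with
 * (real DMA sub-devices keep their own identifiers rather than those
 * reported by device_to_iommu()), and cache whether ATS, PASID and PRI
 * can be used later.
 */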
4465 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4466 {
4467 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4468 	struct device_domain_info *info;
4469 	struct intel_iommu *iommu;
4470 	u8 bus, devfn;
4471 
4472 	iommu = device_to_iommu(dev, &bus, &devfn);
4473 	if (!iommu || !iommu->iommu.ops)
4474 		return ERR_PTR(-ENODEV);
4475 
4476 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4477 	if (!info)
4478 		return ERR_PTR(-ENOMEM);
4479 
4480 	if (dev_is_real_dma_subdevice(dev)) {
4481 		info->bus = pdev->bus->number;
4482 		info->devfn = pdev->devfn;
4483 		info->segment = pci_domain_nr(pdev->bus);
4484 	} else {
4485 		info->bus = bus;
4486 		info->devfn = devfn;
4487 		info->segment = iommu->segment;
4488 	}
4489 
4490 	info->dev = dev;
4491 	info->iommu = iommu;
4492 	if (dev_is_pci(dev)) {
4493 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4494 		    pci_ats_supported(pdev) &&
4495 		    dmar_ats_supported(pdev, iommu))
4496 			info->ats_supported = 1;
4497 
4498 		if (sm_supported(iommu)) {
4499 			if (pasid_supported(iommu)) {
4500 				int features = pci_pasid_features(pdev);
4501 
4502 				if (features >= 0)
4503 					info->pasid_supported = features | 1;
4504 			}
4505 
4506 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4507 			    pci_pri_supported(pdev))
4508 				info->pri_supported = 1;
4509 		}
4510 	}
4511 
4512 	dev_iommu_priv_set(dev, info);
4513 
4514 	return &iommu->iommu;
4515 }
4516 
4517 static void intel_iommu_release_device(struct device *dev)
4518 {
4519 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4520 
4521 	dmar_remove_one_dev_info(dev);
4522 	dev_iommu_priv_set(dev, NULL);
4523 	kfree(info);
4524 	set_dma_ops(dev, NULL);
4525 }
4526 
4527 static void intel_iommu_probe_finalize(struct device *dev)
4528 {
4529 	set_dma_ops(dev, NULL);
4530 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4531 }
4532 
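/*
 * Report this device's reserved regions: any RMRR that targets it
 * directly or through a PCI bridge above it (marked direct or relaxable
 * as appropriate), the legacy 0-16MiB ISA window when
 * CONFIG_INTEL_IOMMU_FLOPPY_WA is set, and the IOAPIC MSI range.
 */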
4533 static void intel_iommu_get_resv_regions(struct device *device,
4534 					 struct list_head *head)
4535 {
4536 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4537 	struct iommu_resv_region *reg;
4538 	struct dmar_rmrr_unit *rmrr;
4539 	struct device *i_dev;
4540 	int i;
4541 
4542 	rcu_read_lock();
4543 	for_each_rmrr_units(rmrr) {
4544 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4545 					  i, i_dev) {
4546 			struct iommu_resv_region *resv;
4547 			enum iommu_resv_type type;
4548 			size_t length;
4549 
4550 			if (i_dev != device &&
4551 			    !is_downstream_to_pci_bridge(device, i_dev))
4552 				continue;
4553 
4554 			length = rmrr->end_address - rmrr->base_address + 1;
4555 
4556 			type = device_rmrr_is_relaxable(device) ?
4557 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4558 
4559 			resv = iommu_alloc_resv_region(rmrr->base_address,
4560 						       length, prot, type,
4561 						       GFP_ATOMIC);
4562 			if (!resv)
4563 				break;
4564 
4565 			list_add_tail(&resv->list, head);
4566 		}
4567 	}
4568 	rcu_read_unlock();
4569 
4570 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4571 	if (dev_is_pci(device)) {
4572 		struct pci_dev *pdev = to_pci_dev(device);
4573 
4574 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4575 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4576 					IOMMU_RESV_DIRECT_RELAXABLE,
4577 					GFP_KERNEL);
4578 			if (reg)
4579 				list_add_tail(&reg->list, head);
4580 		}
4581 	}
4582 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4583 
4584 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4585 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4586 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4587 	if (!reg)
4588 		return;
4589 	list_add_tail(&reg->list, head);
4590 }
4591 
4592 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4593 {
4594 	if (dev_is_pci(dev))
4595 		return pci_device_group(dev);
4596 	return generic_device_group(dev);
4597 }
4598 
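/*
 * IOMMU_DEV_FEAT_SVA handling: enabling shared virtual addressing
 * requires an SVM-capable IOMMU and a device that already has PASID,
 * PRI and ATS enabled; the device is then added to the IOMMU's I/O page
 * fault queue and its fault handler is registered.  Disabling undoes
 * both steps.
 */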
4599 static int intel_iommu_enable_sva(struct device *dev)
4600 {
4601 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4602 	struct intel_iommu *iommu;
4603 	int ret;
4604 
4605 	if (!info || dmar_disabled)
4606 		return -EINVAL;
4607 
4608 	iommu = info->iommu;
4609 	if (!iommu)
4610 		return -EINVAL;
4611 
4612 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4613 		return -ENODEV;
4614 
4615 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4616 		return -EINVAL;
4617 
4618 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4619 	if (!ret)
4620 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4621 
4622 	return ret;
4623 }
4624 
4625 static int intel_iommu_disable_sva(struct device *dev)
4626 {
4627 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 	struct intel_iommu *iommu = info->iommu;
4629 	int ret;
4630 
4631 	ret = iommu_unregister_device_fault_handler(dev);
4632 	if (!ret)
4633 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4634 
4635 	return ret;
4636 }
4637 
4638 static int intel_iommu_enable_iopf(struct device *dev)
4639 {
4640 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4641 
4642 	if (info && info->pri_supported)
4643 		return 0;
4644 
4645 	return -ENODEV;
4646 }
4647 
4648 static int
4649 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4650 {
4651 	switch (feat) {
4652 	case IOMMU_DEV_FEAT_IOPF:
4653 		return intel_iommu_enable_iopf(dev);
4654 
4655 	case IOMMU_DEV_FEAT_SVA:
4656 		return intel_iommu_enable_sva(dev);
4657 
4658 	default:
4659 		return -ENODEV;
4660 	}
4661 }
4662 
4663 static int
4664 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4665 {
4666 	switch (feat) {
4667 	case IOMMU_DEV_FEAT_IOPF:
4668 		return 0;
4669 
4670 	case IOMMU_DEV_FEAT_SVA:
4671 		return intel_iommu_disable_sva(dev);
4672 
4673 	default:
4674 		return -ENODEV;
4675 	}
4676 }
4677 
4678 static bool intel_iommu_is_attach_deferred(struct device *dev)
4679 {
4680 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4681 
4682 	return translation_pre_enabled(info->iommu) && !info->domain;
4683 }
4684 
4685 /*
4686  * Check that the device does not live on an external-facing PCI port that is
4687  * marked as untrusted. Such devices should not be able to apply quirks and
4688  * thus not be able to bypass the IOMMU restrictions.
4689  */
4690 static bool risky_device(struct pci_dev *pdev)
4691 {
4692 	if (pdev->untrusted) {
4693 		pci_info(pdev,
4694 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4695 			 pdev->vendor, pdev->device);
4696 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4697 		return true;
4698 	}
4699 	return false;
4700 }
4701 
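/*
 * Called after new mappings have been installed so that each IOMMU the
 * domain is attached to can be notified about the freshly mapped range
 * (needed e.g. for caching-mode IOMMUs exposed by a hypervisor).
 */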
4702 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4703 				       unsigned long iova, size_t size)
4704 {
4705 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4706 	unsigned long pages = aligned_nrpages(iova, size);
4707 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4708 	struct iommu_domain_info *info;
4709 	unsigned long i;
4710 
4711 	xa_for_each(&dmar_domain->iommu_array, i, info)
4712 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4713 }
4714 
4715 const struct iommu_ops intel_iommu_ops = {
4716 	.capable		= intel_iommu_capable,
4717 	.domain_alloc		= intel_iommu_domain_alloc,
4718 	.probe_device		= intel_iommu_probe_device,
4719 	.probe_finalize		= intel_iommu_probe_finalize,
4720 	.release_device		= intel_iommu_release_device,
4721 	.get_resv_regions	= intel_iommu_get_resv_regions,
4722 	.device_group		= intel_iommu_device_group,
4723 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4724 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4725 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4726 	.def_domain_type	= device_def_domain_type,
4727 	.pgsize_bitmap		= SZ_4K,
4728 #ifdef CONFIG_INTEL_IOMMU_SVM
4729 	.sva_bind		= intel_svm_bind,
4730 	.sva_unbind		= intel_svm_unbind,
4731 	.sva_get_pasid		= intel_svm_get_pasid,
4732 	.page_response		= intel_svm_page_response,
4733 #endif
4734 	.default_domain_ops = &(const struct iommu_domain_ops) {
4735 		.attach_dev		= intel_iommu_attach_device,
4736 		.detach_dev		= intel_iommu_detach_device,
4737 		.map_pages		= intel_iommu_map_pages,
4738 		.unmap_pages		= intel_iommu_unmap_pages,
4739 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4740 		.flush_iotlb_all        = intel_flush_iotlb_all,
4741 		.iotlb_sync		= intel_iommu_tlb_sync,
4742 		.iova_to_phys		= intel_iommu_iova_to_phys,
4743 		.free			= intel_iommu_domain_free,
4744 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4745 	}
4746 };
4747 
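/*
 * PCI quirks for chipsets with broken VT-d behaviour.  quirk_iommu_igfx
 * below clears dmar_map_gfx so the integrated graphics device is left
 * out of DMA remapping on platforms where translating it is known to be
 * broken; the quirk is skipped for untrusted (external-facing) devices.
 */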
4748 static void quirk_iommu_igfx(struct pci_dev *dev)
4749 {
4750 	if (risky_device(dev))
4751 		return;
4752 
4753 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4754 	dmar_map_gfx = 0;
4755 }
4756 
4757 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4758 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4759 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4760 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4763 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4764 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4765 
4766 /* Broadwell igfx malfunctions with dmar */
4767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4774 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4791 
4792 static void quirk_iommu_rwbf(struct pci_dev *dev)
4793 {
4794 	if (risky_device(dev))
4795 		return;
4796 
4797 	/*
4798 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4799 	 * but needs it. Same seems to hold for the desktop versions.
4800 	 */
4801 	pci_info(dev, "Forcing write-buffer flush capability\n");
4802 	rwbf_quirk = 1;
4803 }
4804 
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4812 
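/*
 * Ironlake/Calpella graphics quirk: the GGC register read below reports
 * whether the BIOS set aside memory for a VT-d shadow GTT.  If it did
 * not, the integrated graphics device cannot be remapped and is left out
 * of the IOMMU; if it did, strict (non-batched) IOTLB flushing is forced
 * because the graphics device has to be idle before a flush.
 */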
4813 #define GGC 0x52
4814 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4815 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4816 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4817 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4818 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4819 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4820 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4821 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4822 
4823 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4824 {
4825 	unsigned short ggc;
4826 
4827 	if (risky_device(dev))
4828 		return;
4829 
4830 	if (pci_read_config_word(dev, GGC, &ggc))
4831 		return;
4832 
4833 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4834 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4835 		dmar_map_gfx = 0;
4836 	} else if (dmar_map_gfx) {
4837 		/* we have to ensure the gfx device is idle before we flush */
4838 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4839 		iommu_set_dma_strict();
4840 	}
4841 }
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4846 
4847 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4848 {
4849 	unsigned short ver;
4850 
4851 	if (!IS_GFX_DEVICE(dev))
4852 		return;
4853 
4854 	ver = (dev->device >> 8) & 0xff;
4855 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4856 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4857 	    ver != 0x9a && ver != 0xa7)
4858 		return;
4859 
4860 	if (risky_device(dev))
4861 		return;
4862 
4863 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4864 	iommu_skip_te_disable = 1;
4865 }
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4867 
4868 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4869    ISOCH DMAR unit for the Azalia sound device, but not give it any
4870    TLB entries, which causes it to deadlock. Check for that.  We do
4871    this in a function called from init_dmars(), instead of in a PCI
4872    quirk, because we don't want to print the obnoxious "BIOS broken"
4873    message if VT-d is actually disabled.
4874 */
4875 static void __init check_tylersburg_isoch(void)
4876 {
4877 	struct pci_dev *pdev;
4878 	uint32_t vtisochctrl;
4879 
4880 	/* If there's no Azalia in the system anyway, forget it. */
4881 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4882 	if (!pdev)
4883 		return;
4884 
4885 	if (risky_device(pdev)) {
4886 		pci_dev_put(pdev);
4887 		return;
4888 	}
4889 
4890 	pci_dev_put(pdev);
4891 
4892 	/* System Management Registers. Might be hidden, in which case
4893 	   we can't do the sanity check. But that's OK, because the
4894 	   known-broken BIOSes _don't_ actually hide it, so far. */
4895 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4896 	if (!pdev)
4897 		return;
4898 
4899 	if (risky_device(pdev)) {
4900 		pci_dev_put(pdev);
4901 		return;
4902 	}
4903 
4904 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4905 		pci_dev_put(pdev);
4906 		return;
4907 	}
4908 
4909 	pci_dev_put(pdev);
4910 
4911 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4912 	if (vtisochctrl & 1)
4913 		return;
4914 
4915 	/* Drop all bits other than the number of TLB entries */
4916 	vtisochctrl &= 0x1c;
4917 
4918 	/* If we have the recommended number of TLB entries (16), fine. */
4919 	if (vtisochctrl == 0x10)
4920 		return;
4921 
4922 	/* Zero TLB entries? You get to ride the short bus to school. */
4923 	if (!vtisochctrl) {
4924 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4925 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4926 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4927 		     dmi_get_system_info(DMI_BIOS_VERSION),
4928 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4929 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4930 		return;
4931 	}
4932 
4933 	pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4934 	       vtisochctrl);
4935 }
4936