xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 9ebdff9aac5ded7bb515e80478afacaaa3abd799)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/* Prefix every pr_*() / dev_*() message emitted from this file with "DMAR: ". */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 
26 #include "iommu.h"
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33 
/* The root table and a context table are each sized as one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers.
 *
 * @pdev is any expression yielding a pointer to a structure with 'class',
 * 'vendor' and 'device' members.  The argument is parenthesized in every
 * expansion so that callers may pass compound expressions (e.g. 'p + 1')
 * without the '->' operator binding to only part of the argument.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
/* Address window named after the IOAPIC, and the lowest IOVA handed out. */
#define IOAPIC_RANGE_START	0xfee00000
#define IOAPIC_RANGE_END	0xfeefffff
#define IOVA_START_ADDR		0x1000

/* Address width requested by default when computing a domain's AGAW. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Upper bound on any adjusted guest address width, in bits and in PFN bits. */
#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

/* Largest PFN / byte address representable with a @gaw-bit address width. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * DOMAIN_MAX_PFN is clamped so that it always fits in an unsigned long,
 * and DOMAIN_MAX_ADDR is derived from the unclamped PFN.  PFNs can then be
 * carried around as plain 'unsigned long' values.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), ~0UL))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* First page frame number of the IO virtual address space. */
#define IOVA_START_PFN		1

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* Page-table geometry: each level decodes LEVEL_STRIDE bits of the PFN. */
#define LEVEL_STRIDE		9
#define LEVEL_MASK		((((u64)1) << LEVEL_STRIDE) - 1)
68 
/* Translate an AGAW encoding into a level number (agaw + 2). */
static inline int agaw_to_level(int agaw)
{
	int level = 2 + agaw;

	return level;
}
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
/* VT-d page frame number of the memory backing @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_set_present(struct context_entry *context)
167 {
168 	context->lo |= 1;
169 }
170 
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173 	context->lo &= (((u64)-1) << 2) | 1;
174 }
175 
176 static inline void context_set_translation_type(struct context_entry *context,
177 						unsigned long value)
178 {
179 	context->lo &= (((u64)-1) << 4) | 3;
180 	context->lo |= (value & 3) << 2;
181 }
182 
183 static inline void context_set_address_root(struct context_entry *context,
184 					    unsigned long value)
185 {
186 	context->lo &= ~VTD_PAGE_MASK;
187 	context->lo |= value & VTD_PAGE_MASK;
188 }
189 
190 static inline void context_set_address_width(struct context_entry *context,
191 					     unsigned long value)
192 {
193 	context->hi |= value & 7;
194 }
195 
196 static inline void context_set_domain_id(struct context_entry *context,
197 					 unsigned long value)
198 {
199 	context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201 
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204 	context->lo |= CONTEXT_PASIDE;
205 }
206 
207 static inline int context_domain_id(struct context_entry *c)
208 {
209 	return((c->hi >> 8) & 0xffff);
210 }
211 
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214 	context->lo = 0;
215 	context->hi = 0;
216 }
217 
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220 	if (!iommu->copied_tables)
221 		return false;
222 
223 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225 
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231 
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237 
/*
 * The static identity (si) domain:
 *	1. It creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
/* Heads of the ATSR, RMRR and SATC unit lists. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Walk every entry of dmar_rmrr_units, with @rmrr as the cursor. */
#define for_each_rmrr_units(rmrr)				\
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
282 
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285 
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288 
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
293 
294 #define IDENTMAP_GFX		2
295 #define IDENTMAP_AZALIA		4
296 
297 const struct iommu_ops intel_iommu_ops;
298 
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
300 {
301 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302 }
303 
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307 }
308 
309 static void init_translation_status(struct intel_iommu *iommu)
310 {
311 	u32 gsts;
312 
313 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 	if (gsts & DMA_GSTS_TES)
315 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316 }
317 
318 static int __init intel_iommu_setup(char *str)
319 {
320 	if (!str)
321 		return -EINVAL;
322 
323 	while (*str) {
324 		if (!strncmp(str, "on", 2)) {
325 			dmar_disabled = 0;
326 			pr_info("IOMMU enabled\n");
327 		} else if (!strncmp(str, "off", 3)) {
328 			dmar_disabled = 1;
329 			no_platform_optin = 1;
330 			pr_info("IOMMU disabled\n");
331 		} else if (!strncmp(str, "igfx_off", 8)) {
332 			dmar_map_gfx = 0;
333 			pr_info("Disable GFX device mapping\n");
334 		} else if (!strncmp(str, "forcedac", 8)) {
335 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 			iommu_dma_forcedac = true;
337 		} else if (!strncmp(str, "strict", 6)) {
338 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 			iommu_set_dma_strict();
340 		} else if (!strncmp(str, "sp_off", 6)) {
341 			pr_info("Disable supported super page\n");
342 			intel_iommu_superpage = 0;
343 		} else if (!strncmp(str, "sm_on", 5)) {
344 			pr_info("Enable scalable mode if hardware supports\n");
345 			intel_iommu_sm = 1;
346 		} else if (!strncmp(str, "sm_off", 6)) {
347 			pr_info("Scalable mode is disallowed\n");
348 			intel_iommu_sm = 0;
349 		} else if (!strncmp(str, "tboot_noforce", 13)) {
350 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 			intel_iommu_tboot_noforce = 1;
352 		} else {
353 			pr_notice("Unknown option - '%s'\n", str);
354 		}
355 
356 		str += strcspn(str, ",");
357 		while (*str == ',')
358 			str++;
359 	}
360 
361 	return 1;
362 }
363 __setup("intel_iommu=", intel_iommu_setup);
364 
365 void *alloc_pgtable_page(int node, gfp_t gfp)
366 {
367 	struct page *page;
368 	void *vaddr = NULL;
369 
370 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
371 	if (page)
372 		vaddr = page_address(page);
373 	return vaddr;
374 }
375 
/* Release a page previously obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
380 
381 static inline int domain_type_is_si(struct dmar_domain *domain)
382 {
383 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384 }
385 
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
387 				       unsigned long pfn)
388 {
389 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390 
391 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392 }
393 
394 /*
395  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397  * the returned SAGAW.
398  */
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400 {
401 	unsigned long fl_sagaw, sl_sagaw;
402 
403 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 	sl_sagaw = cap_sagaw(iommu->cap);
405 
406 	/* Second level only. */
407 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 		return sl_sagaw;
409 
410 	/* First level only. */
411 	if (!ecap_slts(iommu->ecap))
412 		return fl_sagaw;
413 
414 	return fl_sagaw & sl_sagaw;
415 }
416 
/*
 * Return the largest agaw that is set in the IOMMU's SAGAW bitmap and
 * does not exceed width_to_agaw(@max_gaw), or -1 if there is none.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
430 
431 /*
432  * Calculate max SAGAW for each iommu.
433  */
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435 {
436 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437 }
438 
439 /*
440  * calculate agaw for each iommu.
441  * "SAGAW" may be different across iommus, use a default agaw, and
442  * get a supported less agaw for iommus that don't support the default agaw.
443  */
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
445 {
446 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447 }
448 
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450 {
451 	return sm_supported(iommu) ?
452 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453 }
454 
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
456 {
457 	struct iommu_domain_info *info;
458 	struct dmar_drhd_unit *drhd;
459 	struct intel_iommu *iommu;
460 	bool found = false;
461 	unsigned long i;
462 
463 	domain->iommu_coherency = true;
464 	xa_for_each(&domain->iommu_array, i, info) {
465 		found = true;
466 		if (!iommu_paging_structure_coherency(info->iommu)) {
467 			domain->iommu_coherency = false;
468 			break;
469 		}
470 	}
471 	if (found)
472 		return;
473 
474 	/* No hardware attached; use lowest common denominator */
475 	rcu_read_lock();
476 	for_each_active_iommu(iommu, drhd) {
477 		if (!iommu_paging_structure_coherency(iommu)) {
478 			domain->iommu_coherency = false;
479 			break;
480 		}
481 	}
482 	rcu_read_unlock();
483 }
484 
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 					 struct intel_iommu *skip)
487 {
488 	struct dmar_drhd_unit *drhd;
489 	struct intel_iommu *iommu;
490 	int mask = 0x3;
491 
492 	if (!intel_iommu_superpage)
493 		return 0;
494 
495 	/* set iommu_superpage to the smallest common denominator */
496 	rcu_read_lock();
497 	for_each_active_iommu(iommu, drhd) {
498 		if (iommu != skip) {
499 			if (domain && domain->use_first_level) {
500 				if (!cap_fl1gp_support(iommu->cap))
501 					mask = 0x1;
502 			} else {
503 				mask &= cap_super_page_val(iommu->cap);
504 			}
505 
506 			if (!mask)
507 				break;
508 		}
509 	}
510 	rcu_read_unlock();
511 
512 	return fls(mask);
513 }
514 
515 static int domain_update_device_node(struct dmar_domain *domain)
516 {
517 	struct device_domain_info *info;
518 	int nid = NUMA_NO_NODE;
519 	unsigned long flags;
520 
521 	spin_lock_irqsave(&domain->lock, flags);
522 	list_for_each_entry(info, &domain->devices, link) {
523 		/*
524 		 * There could possibly be multiple device numa nodes as devices
525 		 * within the same domain may sit behind different IOMMUs. There
526 		 * isn't perfect answer in such situation, so we select first
527 		 * come first served policy.
528 		 */
529 		nid = dev_to_node(info->dev);
530 		if (nid != NUMA_NO_NODE)
531 			break;
532 	}
533 	spin_unlock_irqrestore(&domain->lock, flags);
534 
535 	return nid;
536 }
537 
538 static void domain_update_iotlb(struct dmar_domain *domain);
539 
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542 {
543 	unsigned long bitmap = 0;
544 
545 	/*
546 	 * 1-level super page supports page size of 2MiB, 2-level super page
547 	 * supports page size of both 2MiB and 1GiB.
548 	 */
549 	if (domain->iommu_superpage == 1)
550 		bitmap |= SZ_2M;
551 	else if (domain->iommu_superpage == 2)
552 		bitmap |= SZ_2M | SZ_1G;
553 
554 	return bitmap;
555 }
556 
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560 	domain_update_iommu_coherency(domain);
561 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562 
563 	/*
564 	 * If RHSA is missing, we should default to the device numa domain
565 	 * as fall back.
566 	 */
567 	if (domain->nid == NUMA_NO_NODE)
568 		domain->nid = domain_update_device_node(domain);
569 
570 	/*
571 	 * First-level translation restricts the input-address to a
572 	 * canonical address (i.e., address bits 63:N have the same
573 	 * value as address bit [N-1], where N is 48-bits with 4-level
574 	 * paging and 57-bits with 5-level paging). Hence, skip bit
575 	 * [N-1].
576 	 */
577 	if (domain->use_first_level)
578 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 	else
580 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581 
582 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 	domain_update_iotlb(domain);
584 }
585 
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 					 u8 devfn, int alloc)
588 {
589 	struct root_entry *root = &iommu->root_entry[bus];
590 	struct context_entry *context;
591 	u64 *entry;
592 
593 	/*
594 	 * Except that the caller requested to allocate a new entry,
595 	 * returning a copied context entry makes no sense.
596 	 */
597 	if (!alloc && context_copied(iommu, bus, devfn))
598 		return NULL;
599 
600 	entry = &root->lo;
601 	if (sm_supported(iommu)) {
602 		if (devfn >= 0x80) {
603 			devfn -= 0x80;
604 			entry = &root->hi;
605 		}
606 		devfn *= 2;
607 	}
608 	if (*entry & 1)
609 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 	else {
611 		unsigned long phy_addr;
612 		if (!alloc)
613 			return NULL;
614 
615 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
616 		if (!context)
617 			return NULL;
618 
619 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 		phy_addr = virt_to_phys((void *)context);
621 		*entry = phy_addr | 1;
622 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
623 	}
624 	return &context[devfn];
625 }
626 
627 /**
628  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629  *				 sub-hierarchy of a candidate PCI-PCI bridge
630  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631  * @bridge: the candidate PCI-PCI bridge
632  *
633  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634  */
635 static bool
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637 {
638 	struct pci_dev *pdev, *pbridge;
639 
640 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 		return false;
642 
643 	pdev = to_pci_dev(dev);
644 	pbridge = to_pci_dev(bridge);
645 
646 	if (pbridge->subordinate &&
647 	    pbridge->subordinate->number <= pdev->bus->number &&
648 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 		return true;
650 
651 	return false;
652 }
653 
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655 {
656 	struct dmar_drhd_unit *drhd;
657 	u32 vtbar;
658 	int rc;
659 
660 	/* We know that this device on this chipset has its own IOMMU.
661 	 * If we find it under a different IOMMU, then the BIOS is lying
662 	 * to us. Hope that the IOMMU for this device is actually
663 	 * disabled, and it needs no translation...
664 	 */
665 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 	if (rc) {
667 		/* "can't" happen */
668 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 		return false;
670 	}
671 	vtbar &= 0xffff0000;
672 
673 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
674 	drhd = dmar_find_matched_drhd_unit(pdev);
675 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 		return true;
679 	}
680 
681 	return false;
682 }
683 
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685 {
686 	if (!iommu || iommu->drhd->ignored)
687 		return true;
688 
689 	if (dev_is_pci(dev)) {
690 		struct pci_dev *pdev = to_pci_dev(dev);
691 
692 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 		    quirk_ioat_snb_local_iommu(pdev))
695 			return true;
696 	}
697 
698 	return false;
699 }
700 
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702 {
703 	struct dmar_drhd_unit *drhd = NULL;
704 	struct pci_dev *pdev = NULL;
705 	struct intel_iommu *iommu;
706 	struct device *tmp;
707 	u16 segment = 0;
708 	int i;
709 
710 	if (!dev)
711 		return NULL;
712 
713 	if (dev_is_pci(dev)) {
714 		struct pci_dev *pf_pdev;
715 
716 		pdev = pci_real_dma_dev(to_pci_dev(dev));
717 
718 		/* VFs aren't listed in scope tables; we need to look up
719 		 * the PF instead to find the IOMMU. */
720 		pf_pdev = pci_physfn(pdev);
721 		dev = &pf_pdev->dev;
722 		segment = pci_domain_nr(pdev->bus);
723 	} else if (has_acpi_companion(dev))
724 		dev = &ACPI_COMPANION(dev)->dev;
725 
726 	rcu_read_lock();
727 	for_each_iommu(iommu, drhd) {
728 		if (pdev && segment != drhd->segment)
729 			continue;
730 
731 		for_each_active_dev_scope(drhd->devices,
732 					  drhd->devices_cnt, i, tmp) {
733 			if (tmp == dev) {
734 				/* For a VF use its original BDF# not that of the PF
735 				 * which we used for the IOMMU lookup. Strictly speaking
736 				 * we could do this for all PCI devices; we only need to
737 				 * get the BDF# from the scope table for ACPI matches. */
738 				if (pdev && pdev->is_virtfn)
739 					goto got_pdev;
740 
741 				if (bus && devfn) {
742 					*bus = drhd->devices[i].bus;
743 					*devfn = drhd->devices[i].devfn;
744 				}
745 				goto out;
746 			}
747 
748 			if (is_downstream_to_pci_bridge(dev, tmp))
749 				goto got_pdev;
750 		}
751 
752 		if (pdev && drhd->include_all) {
753 got_pdev:
754 			if (bus && devfn) {
755 				*bus = pdev->bus->number;
756 				*devfn = pdev->devfn;
757 			}
758 			goto out;
759 		}
760 	}
761 	iommu = NULL;
762 out:
763 	if (iommu_is_dummy(iommu, dev))
764 		iommu = NULL;
765 
766 	rcu_read_unlock();
767 
768 	return iommu;
769 }
770 
771 static void domain_flush_cache(struct dmar_domain *domain,
772 			       void *addr, int size)
773 {
774 	if (!domain->iommu_coherency)
775 		clflush_cache_range(addr, size);
776 }
777 
778 static void free_context_table(struct intel_iommu *iommu)
779 {
780 	struct context_entry *context;
781 	int i;
782 
783 	if (!iommu->root_entry)
784 		return;
785 
786 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 		context = iommu_context_addr(iommu, i, 0, 0);
788 		if (context)
789 			free_pgtable_page(context);
790 
791 		if (!sm_supported(iommu))
792 			continue;
793 
794 		context = iommu_context_addr(iommu, i, 0x80, 0);
795 		if (context)
796 			free_pgtable_page(context);
797 	}
798 
799 	free_pgtable_page(iommu->root_entry);
800 	iommu->root_entry = NULL;
801 }
802 
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806 {
807 	struct dma_pte *pte;
808 	int offset;
809 
810 	while (1) {
811 		offset = pfn_level_offset(pfn, level);
812 		pte = &parent[offset];
813 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 			pr_info("PTE not present at level %d\n", level);
815 			break;
816 		}
817 
818 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819 
820 		if (level == 1)
821 			break;
822 
823 		parent = phys_to_virt(dma_pte_addr(pte));
824 		level--;
825 	}
826 }
827 
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 			  unsigned long long addr, u32 pasid)
830 {
831 	struct pasid_dir_entry *dir, *pde;
832 	struct pasid_entry *entries, *pte;
833 	struct context_entry *ctx_entry;
834 	struct root_entry *rt_entry;
835 	int i, dir_index, index, level;
836 	u8 devfn = source_id & 0xff;
837 	u8 bus = source_id >> 8;
838 	struct dma_pte *pgtable;
839 
840 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841 
842 	/* root entry dump */
843 	rt_entry = &iommu->root_entry[bus];
844 	if (!rt_entry) {
845 		pr_info("root table entry is not present\n");
846 		return;
847 	}
848 
849 	if (sm_supported(iommu))
850 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 			rt_entry->hi, rt_entry->lo);
852 	else
853 		pr_info("root entry: 0x%016llx", rt_entry->lo);
854 
855 	/* context entry dump */
856 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 	if (!ctx_entry) {
858 		pr_info("context table entry is not present\n");
859 		return;
860 	}
861 
862 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 		ctx_entry->hi, ctx_entry->lo);
864 
865 	/* legacy mode does not require PASID entries */
866 	if (!sm_supported(iommu)) {
867 		level = agaw_to_level(ctx_entry->hi & 7);
868 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 		goto pgtable_walk;
870 	}
871 
872 	/* get the pointer to pasid directory entry */
873 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 	if (!dir) {
875 		pr_info("pasid directory entry is not present\n");
876 		return;
877 	}
878 	/* For request-without-pasid, get the pasid from context entry */
879 	if (intel_iommu_sm && pasid == INVALID_IOASID)
880 		pasid = PASID_RID2PASID;
881 
882 	dir_index = pasid >> PASID_PDE_SHIFT;
883 	pde = &dir[dir_index];
884 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885 
886 	/* get the pointer to the pasid table entry */
887 	entries = get_pasid_table_from_pde(pde);
888 	if (!entries) {
889 		pr_info("pasid table entry is not present\n");
890 		return;
891 	}
892 	index = pasid & PASID_PTE_MASK;
893 	pte = &entries[index];
894 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896 
897 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 	} else {
901 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 	}
904 
905 pgtable_walk:
906 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907 }
908 #endif
909 
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 				      unsigned long pfn, int *target_level,
912 				      gfp_t gfp)
913 {
914 	struct dma_pte *parent, *pte;
915 	int level = agaw_to_level(domain->agaw);
916 	int offset;
917 
918 	BUG_ON(!domain->pgd);
919 
920 	if (!domain_pfn_supported(domain, pfn))
921 		/* Address beyond IOMMU's addressing capabilities. */
922 		return NULL;
923 
924 	parent = domain->pgd;
925 
926 	while (1) {
927 		void *tmp_page;
928 
929 		offset = pfn_level_offset(pfn, level);
930 		pte = &parent[offset];
931 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
932 			break;
933 		if (level == *target_level)
934 			break;
935 
936 		if (!dma_pte_present(pte)) {
937 			uint64_t pteval;
938 
939 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
940 
941 			if (!tmp_page)
942 				return NULL;
943 
944 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
945 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
946 			if (domain->use_first_level)
947 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
948 
949 			if (cmpxchg64(&pte->val, 0ULL, pteval))
950 				/* Someone else set it while we were thinking; use theirs. */
951 				free_pgtable_page(tmp_page);
952 			else
953 				domain_flush_cache(domain, pte, sizeof(*pte));
954 		}
955 		if (level == 1)
956 			break;
957 
958 		parent = phys_to_virt(dma_pte_addr(pte));
959 		level--;
960 	}
961 
962 	if (!*target_level)
963 		*target_level = level;
964 
965 	return pte;
966 }
967 
968 /* return address's pte at specific level */
969 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
970 					 unsigned long pfn,
971 					 int level, int *large_page)
972 {
973 	struct dma_pte *parent, *pte;
974 	int total = agaw_to_level(domain->agaw);
975 	int offset;
976 
977 	parent = domain->pgd;
978 	while (level <= total) {
979 		offset = pfn_level_offset(pfn, total);
980 		pte = &parent[offset];
981 		if (level == total)
982 			return pte;
983 
984 		if (!dma_pte_present(pte)) {
985 			*large_page = total;
986 			break;
987 		}
988 
989 		if (dma_pte_superpage(pte)) {
990 			*large_page = total;
991 			return pte;
992 		}
993 
994 		parent = phys_to_virt(dma_pte_addr(pte));
995 		total--;
996 	}
997 	return NULL;
998 }
999 
1000 /* clear last level pte, a tlb flush should be followed */
1001 static void dma_pte_clear_range(struct dmar_domain *domain,
1002 				unsigned long start_pfn,
1003 				unsigned long last_pfn)
1004 {
1005 	unsigned int large_page;
1006 	struct dma_pte *first_pte, *pte;
1007 
1008 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1009 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1010 	BUG_ON(start_pfn > last_pfn);
1011 
1012 	/* we don't need lock here; nobody else touches the iova range */
1013 	do {
1014 		large_page = 1;
1015 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1016 		if (!pte) {
1017 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1018 			continue;
1019 		}
1020 		do {
1021 			dma_clear_pte(pte);
1022 			start_pfn += lvl_to_nr_pages(large_page);
1023 			pte++;
1024 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1025 
1026 		domain_flush_cache(domain, first_pte,
1027 				   (void *)pte - (void *)first_pte);
1028 
1029 	} while (start_pfn && start_pfn <= last_pfn);
1030 }
1031 
1032 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1033 			       int retain_level, struct dma_pte *pte,
1034 			       unsigned long pfn, unsigned long start_pfn,
1035 			       unsigned long last_pfn)
1036 {
1037 	pfn = max(start_pfn, pfn);
1038 	pte = &pte[pfn_level_offset(pfn, level)];
1039 
1040 	do {
1041 		unsigned long level_pfn;
1042 		struct dma_pte *level_pte;
1043 
1044 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1045 			goto next;
1046 
1047 		level_pfn = pfn & level_mask(level);
1048 		level_pte = phys_to_virt(dma_pte_addr(pte));
1049 
1050 		if (level > 2) {
1051 			dma_pte_free_level(domain, level - 1, retain_level,
1052 					   level_pte, level_pfn, start_pfn,
1053 					   last_pfn);
1054 		}
1055 
1056 		/*
1057 		 * Free the page table if we're below the level we want to
1058 		 * retain and the range covers the entire table.
1059 		 */
1060 		if (level < retain_level && !(start_pfn > level_pfn ||
1061 		      last_pfn < level_pfn + level_size(level) - 1)) {
1062 			dma_clear_pte(pte);
1063 			domain_flush_cache(domain, pte, sizeof(*pte));
1064 			free_pgtable_page(level_pte);
1065 		}
1066 next:
1067 		pfn += level_size(level);
1068 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1069 }
1070 
1071 /*
1072  * clear last level (leaf) ptes and free page table pages below the
1073  * level we wish to keep intact.
1074  */
1075 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1076 				   unsigned long start_pfn,
1077 				   unsigned long last_pfn,
1078 				   int retain_level)
1079 {
1080 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1081 
1082 	/* We don't need lock here; nobody else touches the iova range */
1083 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1084 			   domain->pgd, 0, start_pfn, last_pfn);
1085 
1086 	/* free pgd */
1087 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1088 		free_pgtable_page(domain->pgd);
1089 		domain->pgd = NULL;
1090 	}
1091 }
1092 
1093 /* When a page at a given level is being unlinked from its parent, we don't
1094    need to *modify* it at all. All we need to do is make a list of all the
1095    pages which can be freed just as soon as we've flushed the IOTLB and we
1096    know the hardware page-walk will no longer touch them.
1097    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1098    be freed. */
1099 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1100 				    int level, struct dma_pte *pte,
1101 				    struct list_head *freelist)
1102 {
1103 	struct page *pg;
1104 
1105 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1106 	list_add_tail(&pg->lru, freelist);
1107 
1108 	if (level == 1)
1109 		return;
1110 
1111 	pte = page_address(pg);
1112 	do {
1113 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1114 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1115 		pte++;
1116 	} while (!first_pte_in_page(pte));
1117 }
1118 
1119 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1120 				struct dma_pte *pte, unsigned long pfn,
1121 				unsigned long start_pfn, unsigned long last_pfn,
1122 				struct list_head *freelist)
1123 {
1124 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1125 
1126 	pfn = max(start_pfn, pfn);
1127 	pte = &pte[pfn_level_offset(pfn, level)];
1128 
1129 	do {
1130 		unsigned long level_pfn = pfn & level_mask(level);
1131 
1132 		if (!dma_pte_present(pte))
1133 			goto next;
1134 
1135 		/* If range covers entire pagetable, free it */
1136 		if (start_pfn <= level_pfn &&
1137 		    last_pfn >= level_pfn + level_size(level) - 1) {
1138 			/* These suborbinate page tables are going away entirely. Don't
1139 			   bother to clear them; we're just going to *free* them. */
1140 			if (level > 1 && !dma_pte_superpage(pte))
1141 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1142 
1143 			dma_clear_pte(pte);
1144 			if (!first_pte)
1145 				first_pte = pte;
1146 			last_pte = pte;
1147 		} else if (level > 1) {
1148 			/* Recurse down into a level that isn't *entirely* obsolete */
1149 			dma_pte_clear_level(domain, level - 1,
1150 					    phys_to_virt(dma_pte_addr(pte)),
1151 					    level_pfn, start_pfn, last_pfn,
1152 					    freelist);
1153 		}
1154 next:
1155 		pfn = level_pfn + level_size(level);
1156 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1157 
1158 	if (first_pte)
1159 		domain_flush_cache(domain, first_pte,
1160 				   (void *)++last_pte - (void *)first_pte);
1161 }
1162 
1163 /* We can't just free the pages because the IOMMU may still be walking
1164    the page tables, and may have cached the intermediate levels. The
1165    pages can only be freed after the IOTLB flush has been done. */
1166 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1167 			 unsigned long last_pfn, struct list_head *freelist)
1168 {
1169 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1170 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1171 	BUG_ON(start_pfn > last_pfn);
1172 
1173 	/* we don't need lock here; nobody else touches the iova range */
1174 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1175 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1176 
1177 	/* free pgd */
1178 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1179 		struct page *pgd_page = virt_to_page(domain->pgd);
1180 		list_add_tail(&pgd_page->lru, freelist);
1181 		domain->pgd = NULL;
1182 	}
1183 }
1184 
1185 /* iommu handling */
1186 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1187 {
1188 	struct root_entry *root;
1189 
1190 	root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1191 	if (!root) {
1192 		pr_err("Allocating root entry for %s failed\n",
1193 			iommu->name);
1194 		return -ENOMEM;
1195 	}
1196 
1197 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1198 	iommu->root_entry = root;
1199 
1200 	return 0;
1201 }
1202 
1203 static void iommu_set_root_entry(struct intel_iommu *iommu)
1204 {
1205 	u64 addr;
1206 	u32 sts;
1207 	unsigned long flag;
1208 
1209 	addr = virt_to_phys(iommu->root_entry);
1210 	if (sm_supported(iommu))
1211 		addr |= DMA_RTADDR_SMT;
1212 
1213 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1214 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1215 
1216 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1217 
1218 	/* Make sure hardware complete it */
1219 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220 		      readl, (sts & DMA_GSTS_RTPS), sts);
1221 
1222 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1223 
1224 	/*
1225 	 * Hardware invalidates all DMA remapping hardware translation
1226 	 * caches as part of SRTP flow.
1227 	 */
1228 	if (cap_esrtps(iommu->cap))
1229 		return;
1230 
1231 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1232 	if (sm_supported(iommu))
1233 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1234 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1235 }
1236 
1237 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1238 {
1239 	u32 val;
1240 	unsigned long flag;
1241 
1242 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1243 		return;
1244 
1245 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1246 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1247 
1248 	/* Make sure hardware complete it */
1249 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1250 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1251 
1252 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1253 }
1254 
1255 /* return value determine if we need a write buffer flush */
1256 static void __iommu_flush_context(struct intel_iommu *iommu,
1257 				  u16 did, u16 source_id, u8 function_mask,
1258 				  u64 type)
1259 {
1260 	u64 val = 0;
1261 	unsigned long flag;
1262 
1263 	switch (type) {
1264 	case DMA_CCMD_GLOBAL_INVL:
1265 		val = DMA_CCMD_GLOBAL_INVL;
1266 		break;
1267 	case DMA_CCMD_DOMAIN_INVL:
1268 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1269 		break;
1270 	case DMA_CCMD_DEVICE_INVL:
1271 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1272 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1273 		break;
1274 	default:
1275 		BUG();
1276 	}
1277 	val |= DMA_CCMD_ICC;
1278 
1279 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1280 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1281 
1282 	/* Make sure hardware complete it */
1283 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1284 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1285 
1286 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1287 }
1288 
1289 /* return value determine if we need a write buffer flush */
1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1291 				u64 addr, unsigned int size_order, u64 type)
1292 {
1293 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1294 	u64 val = 0, val_iva = 0;
1295 	unsigned long flag;
1296 
1297 	switch (type) {
1298 	case DMA_TLB_GLOBAL_FLUSH:
1299 		/* global flush doesn't need set IVA_REG */
1300 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1301 		break;
1302 	case DMA_TLB_DSI_FLUSH:
1303 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1304 		break;
1305 	case DMA_TLB_PSI_FLUSH:
1306 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307 		/* IH bit is passed in as part of address */
1308 		val_iva = size_order | addr;
1309 		break;
1310 	default:
1311 		BUG();
1312 	}
1313 	/* Note: set drain read/write */
1314 #if 0
1315 	/*
1316 	 * This is probably to be super secure.. Looks like we can
1317 	 * ignore it without any impact.
1318 	 */
1319 	if (cap_read_drain(iommu->cap))
1320 		val |= DMA_TLB_READ_DRAIN;
1321 #endif
1322 	if (cap_write_drain(iommu->cap))
1323 		val |= DMA_TLB_WRITE_DRAIN;
1324 
1325 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326 	/* Note: Only uses first TLB reg currently */
1327 	if (val_iva)
1328 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1329 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1330 
1331 	/* Make sure hardware complete it */
1332 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1333 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1334 
1335 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1336 
1337 	/* check IOTLB invalidation granularity */
1338 	if (DMA_TLB_IAIG(val) == 0)
1339 		pr_err("Flush IOTLB failed\n");
1340 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1341 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1342 			(unsigned long long)DMA_TLB_IIRG(type),
1343 			(unsigned long long)DMA_TLB_IAIG(val));
1344 }
1345 
1346 static struct device_domain_info *
1347 domain_lookup_dev_info(struct dmar_domain *domain,
1348 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1349 {
1350 	struct device_domain_info *info;
1351 	unsigned long flags;
1352 
1353 	spin_lock_irqsave(&domain->lock, flags);
1354 	list_for_each_entry(info, &domain->devices, link) {
1355 		if (info->iommu == iommu && info->bus == bus &&
1356 		    info->devfn == devfn) {
1357 			spin_unlock_irqrestore(&domain->lock, flags);
1358 			return info;
1359 		}
1360 	}
1361 	spin_unlock_irqrestore(&domain->lock, flags);
1362 
1363 	return NULL;
1364 }
1365 
1366 static void domain_update_iotlb(struct dmar_domain *domain)
1367 {
1368 	struct device_domain_info *info;
1369 	bool has_iotlb_device = false;
1370 	unsigned long flags;
1371 
1372 	spin_lock_irqsave(&domain->lock, flags);
1373 	list_for_each_entry(info, &domain->devices, link) {
1374 		if (info->ats_enabled) {
1375 			has_iotlb_device = true;
1376 			break;
1377 		}
1378 	}
1379 	domain->has_iotlb_device = has_iotlb_device;
1380 	spin_unlock_irqrestore(&domain->lock, flags);
1381 }
1382 
1383 /*
1384  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1385  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1386  * check because it applies only to the built-in QAT devices and it doesn't
1387  * grant additional privileges.
1388  */
1389 #define BUGGY_QAT_DEVID_MASK 0x4940
1390 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1391 {
1392 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1393 		return false;
1394 
1395 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1396 		return false;
1397 
1398 	return true;
1399 }
1400 
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1402 {
1403 	struct pci_dev *pdev;
1404 
1405 	if (!dev_is_pci(info->dev))
1406 		return;
1407 
1408 	pdev = to_pci_dev(info->dev);
1409 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1410 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1411 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1412 	 * reserved, which should be set to 0.
1413 	 */
1414 	if (!ecap_dit(info->iommu->ecap))
1415 		info->pfsid = 0;
1416 	else {
1417 		struct pci_dev *pf_pdev;
1418 
1419 		/* pdev will be returned if device is not a vf */
1420 		pf_pdev = pci_physfn(pdev);
1421 		info->pfsid = pci_dev_id(pf_pdev);
1422 	}
1423 
1424 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1425 	   the device if you enable PASID support after ATS support is
1426 	   undefined. So always enable PASID support on devices which
1427 	   have it, even if we can't yet know if we're ever going to
1428 	   use it. */
1429 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430 		info->pasid_enabled = 1;
1431 
1432 	if (info->pri_supported &&
1433 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1434 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435 		info->pri_enabled = 1;
1436 
1437 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439 		info->ats_enabled = 1;
1440 		domain_update_iotlb(info->domain);
1441 		info->ats_qdep = pci_ats_queue_depth(pdev);
1442 	}
1443 }
1444 
1445 static void iommu_disable_pci_caps(struct device_domain_info *info)
1446 {
1447 	struct pci_dev *pdev;
1448 
1449 	if (!dev_is_pci(info->dev))
1450 		return;
1451 
1452 	pdev = to_pci_dev(info->dev);
1453 
1454 	if (info->ats_enabled) {
1455 		pci_disable_ats(pdev);
1456 		info->ats_enabled = 0;
1457 		domain_update_iotlb(info->domain);
1458 	}
1459 
1460 	if (info->pri_enabled) {
1461 		pci_disable_pri(pdev);
1462 		info->pri_enabled = 0;
1463 	}
1464 
1465 	if (info->pasid_enabled) {
1466 		pci_disable_pasid(pdev);
1467 		info->pasid_enabled = 0;
1468 	}
1469 }
1470 
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472 				    u64 addr, unsigned int mask)
1473 {
1474 	u16 sid, qdep;
1475 
1476 	if (!info || !info->ats_enabled)
1477 		return;
1478 
1479 	sid = info->bus << 8 | info->devfn;
1480 	qdep = info->ats_qdep;
1481 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1482 			   qdep, addr, mask);
1483 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1484 }
1485 
1486 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1487 				  u64 addr, unsigned mask)
1488 {
1489 	struct device_domain_info *info;
1490 	unsigned long flags;
1491 
1492 	if (!domain->has_iotlb_device)
1493 		return;
1494 
1495 	spin_lock_irqsave(&domain->lock, flags);
1496 	list_for_each_entry(info, &domain->devices, link)
1497 		__iommu_flush_dev_iotlb(info, addr, mask);
1498 	spin_unlock_irqrestore(&domain->lock, flags);
1499 }
1500 
1501 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1502 				  struct dmar_domain *domain,
1503 				  unsigned long pfn, unsigned int pages,
1504 				  int ih, int map)
1505 {
1506 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1507 	unsigned int mask = ilog2(aligned_pages);
1508 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1509 	u16 did = domain_id_iommu(domain, iommu);
1510 
1511 	BUG_ON(pages == 0);
1512 
1513 	if (ih)
1514 		ih = 1 << 6;
1515 
1516 	if (domain->use_first_level) {
1517 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1518 	} else {
1519 		unsigned long bitmask = aligned_pages - 1;
1520 
1521 		/*
1522 		 * PSI masks the low order bits of the base address. If the
1523 		 * address isn't aligned to the mask, then compute a mask value
1524 		 * needed to ensure the target range is flushed.
1525 		 */
1526 		if (unlikely(bitmask & pfn)) {
1527 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1528 
1529 			/*
1530 			 * Since end_pfn <= pfn + bitmask, the only way bits
1531 			 * higher than bitmask can differ in pfn and end_pfn is
1532 			 * by carrying. This means after masking out bitmask,
1533 			 * high bits starting with the first set bit in
1534 			 * shared_bits are all equal in both pfn and end_pfn.
1535 			 */
1536 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1537 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1538 		}
1539 
1540 		/*
1541 		 * Fallback to domain selective flush if no PSI support or
1542 		 * the size is too big.
1543 		 */
1544 		if (!cap_pgsel_inv(iommu->cap) ||
1545 		    mask > cap_max_amask_val(iommu->cap))
1546 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1547 							DMA_TLB_DSI_FLUSH);
1548 		else
1549 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1550 							DMA_TLB_PSI_FLUSH);
1551 	}
1552 
1553 	/*
1554 	 * In caching mode, changes of pages from non-present to present require
1555 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1556 	 */
1557 	if (!cap_caching_mode(iommu->cap) || !map)
1558 		iommu_flush_dev_iotlb(domain, addr, mask);
1559 }
1560 
1561 /* Notification for newly created mappings */
1562 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1563 					struct dmar_domain *domain,
1564 					unsigned long pfn, unsigned int pages)
1565 {
1566 	/*
1567 	 * It's a non-present to present mapping. Only flush if caching mode
1568 	 * and second level.
1569 	 */
1570 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1571 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1572 	else
1573 		iommu_flush_write_buffer(iommu);
1574 }
1575 
1576 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1577 {
1578 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1579 	struct iommu_domain_info *info;
1580 	unsigned long idx;
1581 
1582 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1583 		struct intel_iommu *iommu = info->iommu;
1584 		u16 did = domain_id_iommu(dmar_domain, iommu);
1585 
1586 		if (dmar_domain->use_first_level)
1587 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1588 		else
1589 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1590 						 DMA_TLB_DSI_FLUSH);
1591 
1592 		if (!cap_caching_mode(iommu->cap))
1593 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1594 	}
1595 }
1596 
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1598 {
1599 	u32 pmen;
1600 	unsigned long flags;
1601 
1602 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1603 		return;
1604 
1605 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1606 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1607 	pmen &= ~DMA_PMEN_EPM;
1608 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1609 
1610 	/* wait for the protected region status bit to clear */
1611 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1612 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1613 
1614 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1615 }
1616 
1617 static void iommu_enable_translation(struct intel_iommu *iommu)
1618 {
1619 	u32 sts;
1620 	unsigned long flags;
1621 
1622 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 	iommu->gcmd |= DMA_GCMD_TE;
1624 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1625 
1626 	/* Make sure hardware complete it */
1627 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1628 		      readl, (sts & DMA_GSTS_TES), sts);
1629 
1630 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1631 }
1632 
1633 static void iommu_disable_translation(struct intel_iommu *iommu)
1634 {
1635 	u32 sts;
1636 	unsigned long flag;
1637 
1638 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1639 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1640 		return;
1641 
1642 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1643 	iommu->gcmd &= ~DMA_GCMD_TE;
1644 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1645 
1646 	/* Make sure hardware complete it */
1647 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1648 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1649 
1650 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1651 }
1652 
1653 static int iommu_init_domains(struct intel_iommu *iommu)
1654 {
1655 	u32 ndomains;
1656 
1657 	ndomains = cap_ndoms(iommu->cap);
1658 	pr_debug("%s: Number of Domains supported <%d>\n",
1659 		 iommu->name, ndomains);
1660 
1661 	spin_lock_init(&iommu->lock);
1662 
1663 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1664 	if (!iommu->domain_ids)
1665 		return -ENOMEM;
1666 
1667 	/*
1668 	 * If Caching mode is set, then invalid translations are tagged
1669 	 * with domain-id 0, hence we need to pre-allocate it. We also
1670 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1671 	 * make sure it is not used for a real domain.
1672 	 */
1673 	set_bit(0, iommu->domain_ids);
1674 
1675 	/*
1676 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1677 	 * entry for first-level or pass-through translation modes should
1678 	 * be programmed with a domain id different from those used for
1679 	 * second-level or nested translation. We reserve a domain id for
1680 	 * this purpose.
1681 	 */
1682 	if (sm_supported(iommu))
1683 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1684 
1685 	return 0;
1686 }
1687 
1688 static void disable_dmar_iommu(struct intel_iommu *iommu)
1689 {
1690 	if (!iommu->domain_ids)
1691 		return;
1692 
1693 	/*
1694 	 * All iommu domains must have been detached from the devices,
1695 	 * hence there should be no domain IDs in use.
1696 	 */
1697 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1698 		    > NUM_RESERVED_DID))
1699 		return;
1700 
1701 	if (iommu->gcmd & DMA_GCMD_TE)
1702 		iommu_disable_translation(iommu);
1703 }
1704 
1705 static void free_dmar_iommu(struct intel_iommu *iommu)
1706 {
1707 	if (iommu->domain_ids) {
1708 		bitmap_free(iommu->domain_ids);
1709 		iommu->domain_ids = NULL;
1710 	}
1711 
1712 	if (iommu->copied_tables) {
1713 		bitmap_free(iommu->copied_tables);
1714 		iommu->copied_tables = NULL;
1715 	}
1716 
1717 	/* free context mapping */
1718 	free_context_table(iommu);
1719 
1720 #ifdef CONFIG_INTEL_IOMMU_SVM
1721 	if (pasid_supported(iommu)) {
1722 		if (ecap_prs(iommu->ecap))
1723 			intel_svm_finish_prq(iommu);
1724 	}
1725 	if (vccap_pasid(iommu->vccap))
1726 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1727 
1728 #endif
1729 }
1730 
1731 /*
1732  * Check and return whether first level is used by default for
1733  * DMA translation.
1734  */
1735 static bool first_level_by_default(unsigned int type)
1736 {
1737 	/* Only SL is available in legacy mode */
1738 	if (!scalable_mode_support())
1739 		return false;
1740 
1741 	/* Only level (either FL or SL) is available, just use it */
1742 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1743 		return intel_cap_flts_sanity();
1744 
1745 	/* Both levels are available, decide it based on domain type */
1746 	return type != IOMMU_DOMAIN_UNMANAGED;
1747 }
1748 
1749 static struct dmar_domain *alloc_domain(unsigned int type)
1750 {
1751 	struct dmar_domain *domain;
1752 
1753 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1754 	if (!domain)
1755 		return NULL;
1756 
1757 	domain->nid = NUMA_NO_NODE;
1758 	if (first_level_by_default(type))
1759 		domain->use_first_level = true;
1760 	domain->has_iotlb_device = false;
1761 	INIT_LIST_HEAD(&domain->devices);
1762 	spin_lock_init(&domain->lock);
1763 	xa_init(&domain->iommu_array);
1764 
1765 	return domain;
1766 }
1767 
1768 static int domain_attach_iommu(struct dmar_domain *domain,
1769 			       struct intel_iommu *iommu)
1770 {
1771 	struct iommu_domain_info *info, *curr;
1772 	unsigned long ndomains;
1773 	int num, ret = -ENOSPC;
1774 
1775 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1776 	if (!info)
1777 		return -ENOMEM;
1778 
1779 	spin_lock(&iommu->lock);
1780 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1781 	if (curr) {
1782 		curr->refcnt++;
1783 		spin_unlock(&iommu->lock);
1784 		kfree(info);
1785 		return 0;
1786 	}
1787 
1788 	ndomains = cap_ndoms(iommu->cap);
1789 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1790 	if (num >= ndomains) {
1791 		pr_err("%s: No free domain ids\n", iommu->name);
1792 		goto err_unlock;
1793 	}
1794 
1795 	set_bit(num, iommu->domain_ids);
1796 	info->refcnt	= 1;
1797 	info->did	= num;
1798 	info->iommu	= iommu;
1799 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1800 			  NULL, info, GFP_ATOMIC);
1801 	if (curr) {
1802 		ret = xa_err(curr) ? : -EBUSY;
1803 		goto err_clear;
1804 	}
1805 	domain_update_iommu_cap(domain);
1806 
1807 	spin_unlock(&iommu->lock);
1808 	return 0;
1809 
1810 err_clear:
1811 	clear_bit(info->did, iommu->domain_ids);
1812 err_unlock:
1813 	spin_unlock(&iommu->lock);
1814 	kfree(info);
1815 	return ret;
1816 }
1817 
1818 static void domain_detach_iommu(struct dmar_domain *domain,
1819 				struct intel_iommu *iommu)
1820 {
1821 	struct iommu_domain_info *info;
1822 
1823 	spin_lock(&iommu->lock);
1824 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1825 	if (--info->refcnt == 0) {
1826 		clear_bit(info->did, iommu->domain_ids);
1827 		xa_erase(&domain->iommu_array, iommu->seq_id);
1828 		domain->nid = NUMA_NO_NODE;
1829 		domain_update_iommu_cap(domain);
1830 		kfree(info);
1831 	}
1832 	spin_unlock(&iommu->lock);
1833 }
1834 
/*
 * Round a guest address width up to the next value of the form 12 + 9 * k
 * and cap the result at 64. A width that already has that form is returned
 * unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1848 
1849 static void domain_exit(struct dmar_domain *domain)
1850 {
1851 	if (domain->pgd) {
1852 		LIST_HEAD(freelist);
1853 
1854 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1855 		put_pages_list(&freelist);
1856 	}
1857 
1858 	if (WARN_ON(!list_empty(&domain->devices)))
1859 		return;
1860 
1861 	kfree(domain);
1862 }
1863 
1864 /*
1865  * Get the PASID directory size for scalable mode context entry.
1866  * Value of X in the PDTS field of a scalable mode context entry
1867  * indicates PASID directory with 2^(X + 7) entries.
1868  */
1869 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1870 {
1871 	unsigned long pds, max_pde;
1872 
1873 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1874 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1875 	if (pds < 7)
1876 		return 0;
1877 
1878 	return pds - 7;
1879 }
1880 
1881 /*
1882  * Set the RID_PASID field of a scalable mode context entry. The
1883  * IOMMU hardware will use the PASID value set in this field for
1884  * DMA translations of DMA requests without PASID.
1885  */
1886 static inline void
1887 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1888 {
1889 	context->hi |= pasid & ((1 << 20) - 1);
1890 }
1891 
1892 /*
1893  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1894  * entry.
1895  */
1896 static inline void context_set_sm_dte(struct context_entry *context)
1897 {
1898 	context->lo |= (1 << 2);
1899 }
1900 
1901 /*
1902  * Set the PRE(Page Request Enable) field of a scalable mode context
1903  * entry.
1904  */
1905 static inline void context_set_sm_pre(struct context_entry *context)
1906 {
1907 	context->lo |= (1 << 4);
1908 }
1909 
/*
 * Convert value to context PASID directory size field coding: the low three
 * bits of @pds are kept and placed at bit position 9.
 */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
1912 
1913 static int domain_context_mapping_one(struct dmar_domain *domain,
1914 				      struct intel_iommu *iommu,
1915 				      struct pasid_table *table,
1916 				      u8 bus, u8 devfn)
1917 {
1918 	struct device_domain_info *info =
1919 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1920 	u16 did = domain_id_iommu(domain, iommu);
1921 	int translation = CONTEXT_TT_MULTI_LEVEL;
1922 	struct context_entry *context;
1923 	int ret;
1924 
1925 	WARN_ON(did == 0);
1926 
1927 	if (hw_pass_through && domain_type_is_si(domain))
1928 		translation = CONTEXT_TT_PASS_THROUGH;
1929 
1930 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1931 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1932 
1933 	BUG_ON(!domain->pgd);
1934 
1935 	spin_lock(&iommu->lock);
1936 	ret = -ENOMEM;
1937 	context = iommu_context_addr(iommu, bus, devfn, 1);
1938 	if (!context)
1939 		goto out_unlock;
1940 
1941 	ret = 0;
1942 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1943 		goto out_unlock;
1944 
1945 	/*
1946 	 * For kdump cases, old valid entries may be cached due to the
1947 	 * in-flight DMA and copied pgtable, but there is no unmapping
1948 	 * behaviour for them, thus we need an explicit cache flush for
1949 	 * the newly-mapped device. For kdump, at this point, the device
1950 	 * is supposed to finish reset at its driver probe stage, so no
1951 	 * in-flight DMA will exist, and we don't need to worry anymore
1952 	 * hereafter.
1953 	 */
1954 	if (context_copied(iommu, bus, devfn)) {
1955 		u16 did_old = context_domain_id(context);
1956 
1957 		if (did_old < cap_ndoms(iommu->cap)) {
1958 			iommu->flush.flush_context(iommu, did_old,
1959 						   (((u16)bus) << 8) | devfn,
1960 						   DMA_CCMD_MASK_NOBIT,
1961 						   DMA_CCMD_DEVICE_INVL);
1962 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1963 						 DMA_TLB_DSI_FLUSH);
1964 		}
1965 
1966 		clear_context_copied(iommu, bus, devfn);
1967 	}
1968 
1969 	context_clear_entry(context);
1970 
1971 	if (sm_supported(iommu)) {
1972 		unsigned long pds;
1973 
1974 		WARN_ON(!table);
1975 
1976 		/* Setup the PASID DIR pointer: */
1977 		pds = context_get_sm_pds(table);
1978 		context->lo = (u64)virt_to_phys(table->table) |
1979 				context_pdts(pds);
1980 
1981 		/* Setup the RID_PASID field: */
1982 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1983 
1984 		/*
1985 		 * Setup the Device-TLB enable bit and Page request
1986 		 * Enable bit:
1987 		 */
1988 		if (info && info->ats_supported)
1989 			context_set_sm_dte(context);
1990 		if (info && info->pri_supported)
1991 			context_set_sm_pre(context);
1992 		if (info && info->pasid_supported)
1993 			context_set_pasid(context);
1994 	} else {
1995 		struct dma_pte *pgd = domain->pgd;
1996 		int agaw;
1997 
1998 		context_set_domain_id(context, did);
1999 
2000 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2001 			/*
2002 			 * Skip top levels of page tables for iommu which has
2003 			 * less agaw than default. Unnecessary for PT mode.
2004 			 */
2005 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2006 				ret = -ENOMEM;
2007 				pgd = phys_to_virt(dma_pte_addr(pgd));
2008 				if (!dma_pte_present(pgd))
2009 					goto out_unlock;
2010 			}
2011 
2012 			if (info && info->ats_supported)
2013 				translation = CONTEXT_TT_DEV_IOTLB;
2014 			else
2015 				translation = CONTEXT_TT_MULTI_LEVEL;
2016 
2017 			context_set_address_root(context, virt_to_phys(pgd));
2018 			context_set_address_width(context, agaw);
2019 		} else {
2020 			/*
2021 			 * In pass through mode, AW must be programmed to
2022 			 * indicate the largest AGAW value supported by
2023 			 * hardware. And ASR is ignored by hardware.
2024 			 */
2025 			context_set_address_width(context, iommu->msagaw);
2026 		}
2027 
2028 		context_set_translation_type(context, translation);
2029 	}
2030 
2031 	context_set_fault_enable(context);
2032 	context_set_present(context);
2033 	if (!ecap_coherent(iommu->ecap))
2034 		clflush_cache_range(context, sizeof(*context));
2035 
2036 	/*
2037 	 * It's a non-present to present mapping. If hardware doesn't cache
2038 	 * non-present entry we only need to flush the write-buffer. If the
2039 	 * _does_ cache non-present entries, then it does so in the special
2040 	 * domain #0, which we have to flush:
2041 	 */
2042 	if (cap_caching_mode(iommu->cap)) {
2043 		iommu->flush.flush_context(iommu, 0,
2044 					   (((u16)bus) << 8) | devfn,
2045 					   DMA_CCMD_MASK_NOBIT,
2046 					   DMA_CCMD_DEVICE_INVL);
2047 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2048 	} else {
2049 		iommu_flush_write_buffer(iommu);
2050 	}
2051 
2052 	ret = 0;
2053 
2054 out_unlock:
2055 	spin_unlock(&iommu->lock);
2056 
2057 	return ret;
2058 }
2059 
/*
 * Arguments bundled for domain_context_mapping_cb(), which receives this
 * structure through its opaque pointer and forwards the three fields to
 * domain_context_mapping_one().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain to map the device into */
	struct intel_iommu *iommu;	/* IOMMU whose context table is written */
	struct pasid_table *table;	/* PASID table handed on for the device */
};
2065 
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067 				     u16 alias, void *opaque)
2068 {
2069 	struct domain_context_mapping_data *data = opaque;
2070 
2071 	return domain_context_mapping_one(data->domain, data->iommu,
2072 					  data->table, PCI_BUS_NUM(alias),
2073 					  alias & 0xff);
2074 }
2075 
2076 static int
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2078 {
2079 	struct domain_context_mapping_data data;
2080 	struct pasid_table *table;
2081 	struct intel_iommu *iommu;
2082 	u8 bus, devfn;
2083 
2084 	iommu = device_to_iommu(dev, &bus, &devfn);
2085 	if (!iommu)
2086 		return -ENODEV;
2087 
2088 	table = intel_pasid_get_table(dev);
2089 
2090 	if (!dev_is_pci(dev))
2091 		return domain_context_mapping_one(domain, iommu, table,
2092 						  bus, devfn);
2093 
2094 	data.domain = domain;
2095 	data.iommu = iommu;
2096 	data.table = table;
2097 
2098 	return pci_for_each_dma_alias(to_pci_dev(dev),
2099 				      &domain_context_mapping_cb, &data);
2100 }
2101 
2102 /* Returns a number of VTD pages, but aligned to MM page size */
2103 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2104 					    size_t size)
2105 {
2106 	host_addr &= ~PAGE_MASK;
2107 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2108 }
2109 
2110 /* Return largest possible superpage level for a given mapping */
2111 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2112 					  unsigned long iov_pfn,
2113 					  unsigned long phy_pfn,
2114 					  unsigned long pages)
2115 {
2116 	int support, level = 1;
2117 	unsigned long pfnmerge;
2118 
2119 	support = domain->iommu_superpage;
2120 
2121 	/* To use a large page, the virtual *and* physical addresses
2122 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2123 	   of them will mean we have to use smaller pages. So just
2124 	   merge them and check both at once. */
2125 	pfnmerge = iov_pfn | phy_pfn;
2126 
2127 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2128 		pages >>= VTD_STRIDE_SHIFT;
2129 		if (!pages)
2130 			break;
2131 		pfnmerge >>= VTD_STRIDE_SHIFT;
2132 		level++;
2133 		support--;
2134 	}
2135 	return level;
2136 }
2137 
2138 /*
2139  * Ensure that old small page tables are removed to make room for superpage(s).
2140  * We're going to add new large pages, so make sure we don't remove their parent
2141  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2142  */
2143 static void switch_to_super_page(struct dmar_domain *domain,
2144 				 unsigned long start_pfn,
2145 				 unsigned long end_pfn, int level)
2146 {
2147 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2148 	struct iommu_domain_info *info;
2149 	struct dma_pte *pte = NULL;
2150 	unsigned long i;
2151 
2152 	while (start_pfn <= end_pfn) {
2153 		if (!pte)
2154 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2155 					     GFP_ATOMIC);
2156 
2157 		if (dma_pte_present(pte)) {
2158 			dma_pte_free_pagetable(domain, start_pfn,
2159 					       start_pfn + lvl_pages - 1,
2160 					       level + 1);
2161 
2162 			xa_for_each(&domain->iommu_array, i, info)
2163 				iommu_flush_iotlb_psi(info->iommu, domain,
2164 						      start_pfn, lvl_pages,
2165 						      0, 0);
2166 		}
2167 
2168 		pte++;
2169 		start_pfn += lvl_pages;
2170 		if (first_pte_in_page(pte))
2171 			pte = NULL;
2172 	}
2173 }
2174 
2175 static int
2176 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2177 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2178 		 gfp_t gfp)
2179 {
2180 	struct dma_pte *first_pte = NULL, *pte = NULL;
2181 	unsigned int largepage_lvl = 0;
2182 	unsigned long lvl_pages = 0;
2183 	phys_addr_t pteval;
2184 	u64 attr;
2185 
2186 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2187 
2188 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2189 		return -EINVAL;
2190 
2191 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2192 	attr |= DMA_FL_PTE_PRESENT;
2193 	if (domain->use_first_level) {
2194 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2195 		if (prot & DMA_PTE_WRITE)
2196 			attr |= DMA_FL_PTE_DIRTY;
2197 	}
2198 
2199 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2200 
2201 	while (nr_pages > 0) {
2202 		uint64_t tmp;
2203 
2204 		if (!pte) {
2205 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2206 					phys_pfn, nr_pages);
2207 
2208 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2209 					     gfp);
2210 			if (!pte)
2211 				return -ENOMEM;
2212 			first_pte = pte;
2213 
2214 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2215 
2216 			/* It is large page*/
2217 			if (largepage_lvl > 1) {
2218 				unsigned long end_pfn;
2219 				unsigned long pages_to_remove;
2220 
2221 				pteval |= DMA_PTE_LARGE_PAGE;
2222 				pages_to_remove = min_t(unsigned long, nr_pages,
2223 							nr_pte_to_next_page(pte) * lvl_pages);
2224 				end_pfn = iov_pfn + pages_to_remove - 1;
2225 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2226 			} else {
2227 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2228 			}
2229 
2230 		}
2231 		/* We don't need lock here, nobody else
2232 		 * touches the iova range
2233 		 */
2234 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2235 		if (tmp) {
2236 			static int dumps = 5;
2237 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2238 				iov_pfn, tmp, (unsigned long long)pteval);
2239 			if (dumps) {
2240 				dumps--;
2241 				debug_dma_dump_mappings(NULL);
2242 			}
2243 			WARN_ON(1);
2244 		}
2245 
2246 		nr_pages -= lvl_pages;
2247 		iov_pfn += lvl_pages;
2248 		phys_pfn += lvl_pages;
2249 		pteval += lvl_pages * VTD_PAGE_SIZE;
2250 
2251 		/* If the next PTE would be the first in a new page, then we
2252 		 * need to flush the cache on the entries we've just written.
2253 		 * And then we'll need to recalculate 'pte', so clear it and
2254 		 * let it get set again in the if (!pte) block above.
2255 		 *
2256 		 * If we're done (!nr_pages) we need to flush the cache too.
2257 		 *
2258 		 * Also if we've been setting superpages, we may need to
2259 		 * recalculate 'pte' and switch back to smaller pages for the
2260 		 * end of the mapping, if the trailing size is not enough to
2261 		 * use another superpage (i.e. nr_pages < lvl_pages).
2262 		 */
2263 		pte++;
2264 		if (!nr_pages || first_pte_in_page(pte) ||
2265 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2266 			domain_flush_cache(domain, first_pte,
2267 					   (void *)pte - (void *)first_pte);
2268 			pte = NULL;
2269 		}
2270 	}
2271 
2272 	return 0;
2273 }
2274 
2275 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2276 {
2277 	struct intel_iommu *iommu = info->iommu;
2278 	struct context_entry *context;
2279 	u16 did_old;
2280 
2281 	if (!iommu)
2282 		return;
2283 
2284 	spin_lock(&iommu->lock);
2285 	context = iommu_context_addr(iommu, bus, devfn, 0);
2286 	if (!context) {
2287 		spin_unlock(&iommu->lock);
2288 		return;
2289 	}
2290 
2291 	if (sm_supported(iommu)) {
2292 		if (hw_pass_through && domain_type_is_si(info->domain))
2293 			did_old = FLPT_DEFAULT_DID;
2294 		else
2295 			did_old = domain_id_iommu(info->domain, iommu);
2296 	} else {
2297 		did_old = context_domain_id(context);
2298 	}
2299 
2300 	context_clear_entry(context);
2301 	__iommu_flush_cache(iommu, context, sizeof(*context));
2302 	spin_unlock(&iommu->lock);
2303 	iommu->flush.flush_context(iommu,
2304 				   did_old,
2305 				   (((u16)bus) << 8) | devfn,
2306 				   DMA_CCMD_MASK_NOBIT,
2307 				   DMA_CCMD_DEVICE_INVL);
2308 
2309 	if (sm_supported(iommu))
2310 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2311 
2312 	iommu->flush.flush_iotlb(iommu,
2313 				 did_old,
2314 				 0,
2315 				 0,
2316 				 DMA_TLB_DSI_FLUSH);
2317 
2318 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2319 }
2320 
2321 static int domain_setup_first_level(struct intel_iommu *iommu,
2322 				    struct dmar_domain *domain,
2323 				    struct device *dev,
2324 				    u32 pasid)
2325 {
2326 	struct dma_pte *pgd = domain->pgd;
2327 	int agaw, level;
2328 	int flags = 0;
2329 
2330 	/*
2331 	 * Skip top levels of page tables for iommu which has
2332 	 * less agaw than default. Unnecessary for PT mode.
2333 	 */
2334 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2335 		pgd = phys_to_virt(dma_pte_addr(pgd));
2336 		if (!dma_pte_present(pgd))
2337 			return -ENOMEM;
2338 	}
2339 
2340 	level = agaw_to_level(agaw);
2341 	if (level != 4 && level != 5)
2342 		return -EINVAL;
2343 
2344 	if (pasid != PASID_RID2PASID)
2345 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2346 	if (level == 5)
2347 		flags |= PASID_FLAG_FL5LP;
2348 
2349 	if (domain->force_snooping)
2350 		flags |= PASID_FLAG_PAGE_SNOOP;
2351 
2352 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2353 					     domain_id_iommu(domain, iommu),
2354 					     flags);
2355 }
2356 
2357 static bool dev_is_real_dma_subdevice(struct device *dev)
2358 {
2359 	return dev && dev_is_pci(dev) &&
2360 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2361 }
2362 
2363 static int iommu_domain_identity_map(struct dmar_domain *domain,
2364 				     unsigned long first_vpfn,
2365 				     unsigned long last_vpfn)
2366 {
2367 	/*
2368 	 * RMRR range might have overlap with physical memory range,
2369 	 * clear it first
2370 	 */
2371 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2372 
2373 	return __domain_mapping(domain, first_vpfn,
2374 				first_vpfn, last_vpfn - first_vpfn + 1,
2375 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2376 }
2377 
2378 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2379 
2380 static int __init si_domain_init(int hw)
2381 {
2382 	struct dmar_rmrr_unit *rmrr;
2383 	struct device *dev;
2384 	int i, nid, ret;
2385 
2386 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2387 	if (!si_domain)
2388 		return -EFAULT;
2389 
2390 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2391 		domain_exit(si_domain);
2392 		si_domain = NULL;
2393 		return -EFAULT;
2394 	}
2395 
2396 	if (hw)
2397 		return 0;
2398 
2399 	for_each_online_node(nid) {
2400 		unsigned long start_pfn, end_pfn;
2401 		int i;
2402 
2403 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2404 			ret = iommu_domain_identity_map(si_domain,
2405 					mm_to_dma_pfn(start_pfn),
2406 					mm_to_dma_pfn(end_pfn));
2407 			if (ret)
2408 				return ret;
2409 		}
2410 	}
2411 
2412 	/*
2413 	 * Identity map the RMRRs so that devices with RMRRs could also use
2414 	 * the si_domain.
2415 	 */
2416 	for_each_rmrr_units(rmrr) {
2417 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2418 					  i, dev) {
2419 			unsigned long long start = rmrr->base_address;
2420 			unsigned long long end = rmrr->end_address;
2421 
2422 			if (WARN_ON(end < start ||
2423 				    end >> agaw_to_width(si_domain->agaw)))
2424 				continue;
2425 
2426 			ret = iommu_domain_identity_map(si_domain,
2427 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2428 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2429 			if (ret)
2430 				return ret;
2431 		}
2432 	}
2433 
2434 	return 0;
2435 }
2436 
2437 static int dmar_domain_attach_device(struct dmar_domain *domain,
2438 				     struct device *dev)
2439 {
2440 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2441 	struct intel_iommu *iommu;
2442 	unsigned long flags;
2443 	u8 bus, devfn;
2444 	int ret;
2445 
2446 	iommu = device_to_iommu(dev, &bus, &devfn);
2447 	if (!iommu)
2448 		return -ENODEV;
2449 
2450 	ret = domain_attach_iommu(domain, iommu);
2451 	if (ret)
2452 		return ret;
2453 	info->domain = domain;
2454 	spin_lock_irqsave(&domain->lock, flags);
2455 	list_add(&info->link, &domain->devices);
2456 	spin_unlock_irqrestore(&domain->lock, flags);
2457 
2458 	/* PASID table is mandatory for a PCI device in scalable mode. */
2459 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2460 		/* Setup the PASID entry for requests without PASID: */
2461 		if (hw_pass_through && domain_type_is_si(domain))
2462 			ret = intel_pasid_setup_pass_through(iommu, domain,
2463 					dev, PASID_RID2PASID);
2464 		else if (domain->use_first_level)
2465 			ret = domain_setup_first_level(iommu, domain, dev,
2466 					PASID_RID2PASID);
2467 		else
2468 			ret = intel_pasid_setup_second_level(iommu, domain,
2469 					dev, PASID_RID2PASID);
2470 		if (ret) {
2471 			dev_err(dev, "Setup RID2PASID failed\n");
2472 			device_block_translation(dev);
2473 			return ret;
2474 		}
2475 	}
2476 
2477 	ret = domain_context_mapping(domain, dev);
2478 	if (ret) {
2479 		dev_err(dev, "Domain context map failed\n");
2480 		device_block_translation(dev);
2481 		return ret;
2482 	}
2483 
2484 	iommu_enable_pci_caps(info);
2485 
2486 	return 0;
2487 }
2488 
2489 static bool device_has_rmrr(struct device *dev)
2490 {
2491 	struct dmar_rmrr_unit *rmrr;
2492 	struct device *tmp;
2493 	int i;
2494 
2495 	rcu_read_lock();
2496 	for_each_rmrr_units(rmrr) {
2497 		/*
2498 		 * Return TRUE if this RMRR contains the device that
2499 		 * is passed in.
2500 		 */
2501 		for_each_active_dev_scope(rmrr->devices,
2502 					  rmrr->devices_cnt, i, tmp)
2503 			if (tmp == dev ||
2504 			    is_downstream_to_pci_bridge(dev, tmp)) {
2505 				rcu_read_unlock();
2506 				return true;
2507 			}
2508 	}
2509 	rcu_read_unlock();
2510 	return false;
2511 }
2512 
2513 /**
2514  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2515  * is relaxable (ie. is allowed to be not enforced under some conditions)
2516  * @dev: device handle
2517  *
2518  * We assume that PCI USB devices with RMRRs have them largely
2519  * for historical reasons and that the RMRR space is not actively used post
2520  * boot.  This exclusion may change if vendors begin to abuse it.
2521  *
2522  * The same exception is made for graphics devices, with the requirement that
2523  * any use of the RMRR regions will be torn down before assigning the device
2524  * to a guest.
2525  *
2526  * Return: true if the RMRR is relaxable, false otherwise
2527  */
2528 static bool device_rmrr_is_relaxable(struct device *dev)
2529 {
2530 	struct pci_dev *pdev;
2531 
2532 	if (!dev_is_pci(dev))
2533 		return false;
2534 
2535 	pdev = to_pci_dev(dev);
2536 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2537 		return true;
2538 	else
2539 		return false;
2540 }
2541 
2542 /*
2543  * There are a couple cases where we need to restrict the functionality of
2544  * devices associated with RMRRs.  The first is when evaluating a device for
2545  * identity mapping because problems exist when devices are moved in and out
2546  * of domains and their respective RMRR information is lost.  This means that
2547  * a device with associated RMRRs will never be in a "passthrough" domain.
2548  * The second is use of the device through the IOMMU API.  This interface
2549  * expects to have full control of the IOVA space for the device.  We cannot
2550  * satisfy both the requirement that RMRR access is maintained and have an
2551  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2552  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2553  * We therefore prevent devices associated with an RMRR from participating in
2554  * the IOMMU API, which eliminates them from device assignment.
2555  *
2556  * In both cases, devices which have relaxable RMRRs are not concerned by this
2557  * restriction. See device_rmrr_is_relaxable comment.
2558  */
2559 static bool device_is_rmrr_locked(struct device *dev)
2560 {
2561 	if (!device_has_rmrr(dev))
2562 		return false;
2563 
2564 	if (device_rmrr_is_relaxable(dev))
2565 		return false;
2566 
2567 	return true;
2568 }
2569 
/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device in query
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 *  - 0: both identity and dynamic domains work for this device
 */
2581 static int device_def_domain_type(struct device *dev)
2582 {
2583 	if (dev_is_pci(dev)) {
2584 		struct pci_dev *pdev = to_pci_dev(dev);
2585 
2586 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2587 			return IOMMU_DOMAIN_IDENTITY;
2588 
2589 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2590 			return IOMMU_DOMAIN_IDENTITY;
2591 	}
2592 
2593 	return 0;
2594 }
2595 
2596 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2597 {
2598 	/*
2599 	 * Start from the sane iommu hardware state.
2600 	 * If the queued invalidation is already initialized by us
2601 	 * (for example, while enabling interrupt-remapping) then
2602 	 * we got the things already rolling from a sane state.
2603 	 */
2604 	if (!iommu->qi) {
2605 		/*
2606 		 * Clear any previous faults.
2607 		 */
2608 		dmar_fault(-1, iommu);
2609 		/*
2610 		 * Disable queued invalidation if supported and already enabled
2611 		 * before OS handover.
2612 		 */
2613 		dmar_disable_qi(iommu);
2614 	}
2615 
2616 	if (dmar_enable_qi(iommu)) {
2617 		/*
2618 		 * Queued Invalidate not enabled, use Register Based Invalidate
2619 		 */
2620 		iommu->flush.flush_context = __iommu_flush_context;
2621 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2622 		pr_info("%s: Using Register based invalidation\n",
2623 			iommu->name);
2624 	} else {
2625 		iommu->flush.flush_context = qi_flush_context;
2626 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2627 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2628 	}
2629 }
2630 
2631 static int copy_context_table(struct intel_iommu *iommu,
2632 			      struct root_entry *old_re,
2633 			      struct context_entry **tbl,
2634 			      int bus, bool ext)
2635 {
2636 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2637 	struct context_entry *new_ce = NULL, ce;
2638 	struct context_entry *old_ce = NULL;
2639 	struct root_entry re;
2640 	phys_addr_t old_ce_phys;
2641 
2642 	tbl_idx = ext ? bus * 2 : bus;
2643 	memcpy(&re, old_re, sizeof(re));
2644 
2645 	for (devfn = 0; devfn < 256; devfn++) {
2646 		/* First calculate the correct index */
2647 		idx = (ext ? devfn * 2 : devfn) % 256;
2648 
2649 		if (idx == 0) {
2650 			/* First save what we may have and clean up */
2651 			if (new_ce) {
2652 				tbl[tbl_idx] = new_ce;
2653 				__iommu_flush_cache(iommu, new_ce,
2654 						    VTD_PAGE_SIZE);
2655 				pos = 1;
2656 			}
2657 
2658 			if (old_ce)
2659 				memunmap(old_ce);
2660 
2661 			ret = 0;
2662 			if (devfn < 0x80)
2663 				old_ce_phys = root_entry_lctp(&re);
2664 			else
2665 				old_ce_phys = root_entry_uctp(&re);
2666 
2667 			if (!old_ce_phys) {
2668 				if (ext && devfn == 0) {
2669 					/* No LCTP, try UCTP */
2670 					devfn = 0x7f;
2671 					continue;
2672 				} else {
2673 					goto out;
2674 				}
2675 			}
2676 
2677 			ret = -ENOMEM;
2678 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2679 					MEMREMAP_WB);
2680 			if (!old_ce)
2681 				goto out;
2682 
2683 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2684 			if (!new_ce)
2685 				goto out_unmap;
2686 
2687 			ret = 0;
2688 		}
2689 
2690 		/* Now copy the context entry */
2691 		memcpy(&ce, old_ce + idx, sizeof(ce));
2692 
2693 		if (!context_present(&ce))
2694 			continue;
2695 
2696 		did = context_domain_id(&ce);
2697 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2698 			set_bit(did, iommu->domain_ids);
2699 
2700 		set_context_copied(iommu, bus, devfn);
2701 		new_ce[idx] = ce;
2702 	}
2703 
2704 	tbl[tbl_idx + pos] = new_ce;
2705 
2706 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2707 
2708 out_unmap:
2709 	memunmap(old_ce);
2710 
2711 out:
2712 	return ret;
2713 }
2714 
2715 static int copy_translation_tables(struct intel_iommu *iommu)
2716 {
2717 	struct context_entry **ctxt_tbls;
2718 	struct root_entry *old_rt;
2719 	phys_addr_t old_rt_phys;
2720 	int ctxt_table_entries;
2721 	u64 rtaddr_reg;
2722 	int bus, ret;
2723 	bool new_ext, ext;
2724 
2725 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2726 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2727 	new_ext    = !!sm_supported(iommu);
2728 
2729 	/*
2730 	 * The RTT bit can only be changed when translation is disabled,
2731 	 * but disabling translation means to open a window for data
2732 	 * corruption. So bail out and don't copy anything if we would
2733 	 * have to change the bit.
2734 	 */
2735 	if (new_ext != ext)
2736 		return -EINVAL;
2737 
2738 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2739 	if (!iommu->copied_tables)
2740 		return -ENOMEM;
2741 
2742 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2743 	if (!old_rt_phys)
2744 		return -EINVAL;
2745 
2746 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2747 	if (!old_rt)
2748 		return -ENOMEM;
2749 
2750 	/* This is too big for the stack - allocate it from slab */
2751 	ctxt_table_entries = ext ? 512 : 256;
2752 	ret = -ENOMEM;
2753 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2754 	if (!ctxt_tbls)
2755 		goto out_unmap;
2756 
2757 	for (bus = 0; bus < 256; bus++) {
2758 		ret = copy_context_table(iommu, &old_rt[bus],
2759 					 ctxt_tbls, bus, ext);
2760 		if (ret) {
2761 			pr_err("%s: Failed to copy context table for bus %d\n",
2762 				iommu->name, bus);
2763 			continue;
2764 		}
2765 	}
2766 
2767 	spin_lock(&iommu->lock);
2768 
2769 	/* Context tables are copied, now write them to the root_entry table */
2770 	for (bus = 0; bus < 256; bus++) {
2771 		int idx = ext ? bus * 2 : bus;
2772 		u64 val;
2773 
2774 		if (ctxt_tbls[idx]) {
2775 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2776 			iommu->root_entry[bus].lo = val;
2777 		}
2778 
2779 		if (!ext || !ctxt_tbls[idx + 1])
2780 			continue;
2781 
2782 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2783 		iommu->root_entry[bus].hi = val;
2784 	}
2785 
2786 	spin_unlock(&iommu->lock);
2787 
2788 	kfree(ctxt_tbls);
2789 
2790 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2791 
2792 	ret = 0;
2793 
2794 out_unmap:
2795 	memunmap(old_rt);
2796 
2797 	return ret;
2798 }
2799 
2800 #ifdef CONFIG_INTEL_IOMMU_SVM
2801 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2802 {
2803 	struct intel_iommu *iommu = data;
2804 	ioasid_t ioasid;
2805 
2806 	if (!iommu)
2807 		return INVALID_IOASID;
2808 	/*
2809 	 * VT-d virtual command interface always uses the full 20 bit
2810 	 * PASID range. Host can partition guest PASID range based on
2811 	 * policies but it is out of guest's control.
2812 	 */
2813 	if (min < PASID_MIN || max > intel_pasid_max_id)
2814 		return INVALID_IOASID;
2815 
2816 	if (vcmd_alloc_pasid(iommu, &ioasid))
2817 		return INVALID_IOASID;
2818 
2819 	return ioasid;
2820 }
2821 
2822 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2823 {
2824 	struct intel_iommu *iommu = data;
2825 
2826 	if (!iommu)
2827 		return;
2828 	/*
2829 	 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
2830 	 * We can only free the PASID when all the devices are unbound.
2831 	 */
2832 	if (ioasid_find(NULL, ioasid, NULL)) {
2833 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2834 		return;
2835 	}
2836 	vcmd_free_pasid(iommu, ioasid);
2837 }
2838 
2839 static void register_pasid_allocator(struct intel_iommu *iommu)
2840 {
2841 	/*
2842 	 * If we are running in the host, no need for custom allocator
2843 	 * in that PASIDs are allocated from the host system-wide.
2844 	 */
2845 	if (!cap_caching_mode(iommu->cap))
2846 		return;
2847 
2848 	if (!sm_supported(iommu)) {
2849 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2850 		return;
2851 	}
2852 
2853 	/*
2854 	 * Register a custom PASID allocator if we are running in a guest,
2855 	 * guest PASID must be obtained via virtual command interface.
2856 	 * There can be multiple vIOMMUs in each guest but only one allocator
2857 	 * is active. All vIOMMU allocators will eventually be calling the same
2858 	 * host allocator.
2859 	 */
2860 	if (!vccap_pasid(iommu->vccap))
2861 		return;
2862 
2863 	pr_info("Register custom PASID allocator\n");
2864 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2865 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2866 	iommu->pasid_allocator.pdata = (void *)iommu;
2867 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2868 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2869 		/*
2870 		 * Disable scalable mode on this IOMMU if there
2871 		 * is no custom allocator. Mixing SM capable vIOMMU
2872 		 * and non-SM vIOMMU are not supported.
2873 		 */
2874 		intel_iommu_sm = 0;
2875 	}
2876 }
2877 #endif
2878 
/*
 * Boot-time initialisation of all DMA remapping units.
 *
 * In order: audit capabilities; for each non-ignored IOMMU set up the
 * invalidation interface, domain ids and root entry (copying the previous
 * kernel's tables when translation was left enabled in a kdump kernel);
 * program the root entries; create the static identity domain; finally
 * enable the page request queue (SVM builds) and the fault interrupt on
 * each unit.
 *
 * NOTE(review): the up_write()/down_write() pair around
 * intel_svm_enable_prq() implies the caller holds dmar_global_lock for
 * writing - confirm against the caller.
 *
 * Return: 0 on success.  On failure every active IOMMU is disabled and
 * freed, si_domain is torn down, and the error code is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only kept in a kdump kernel. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Hardware pass-through is only used if every unit supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
#ifdef CONFIG_INTEL_IOMMU_SVM
		register_pasid_allocator(iommu);
#endif
		iommu_set_root_entry(iommu);
	}

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	dmar_map_gfx = 0;
#endif

	/* Graphics devices that are not remapped use the identity domain. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Undo everything: tear down all active units and the identity domain. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
3038 
3039 static void __init init_no_remapping_devices(void)
3040 {
3041 	struct dmar_drhd_unit *drhd;
3042 	struct device *dev;
3043 	int i;
3044 
3045 	for_each_drhd_unit(drhd) {
3046 		if (!drhd->include_all) {
3047 			for_each_active_dev_scope(drhd->devices,
3048 						  drhd->devices_cnt, i, dev)
3049 				break;
3050 			/* ignore DMAR unit if no devices exist */
3051 			if (i == drhd->devices_cnt)
3052 				drhd->ignored = 1;
3053 		}
3054 	}
3055 
3056 	for_each_active_drhd_unit(drhd) {
3057 		if (drhd->include_all)
3058 			continue;
3059 
3060 		for_each_active_dev_scope(drhd->devices,
3061 					  drhd->devices_cnt, i, dev)
3062 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3063 				break;
3064 		if (i < drhd->devices_cnt)
3065 			continue;
3066 
3067 		/* This IOMMU has *only* gfx devices. Either bypass it or
3068 		   set the gfx_mapped flag, as appropriate */
3069 		drhd->gfx_dedicated = 1;
3070 		if (!dmar_map_gfx)
3071 			drhd->ignored = 1;
3072 	}
3073 }
3074 
3075 #ifdef CONFIG_SUSPEND
3076 static int init_iommu_hw(void)
3077 {
3078 	struct dmar_drhd_unit *drhd;
3079 	struct intel_iommu *iommu = NULL;
3080 
3081 	for_each_active_iommu(iommu, drhd)
3082 		if (iommu->qi)
3083 			dmar_reenable_qi(iommu);
3084 
3085 	for_each_iommu(iommu, drhd) {
3086 		if (drhd->ignored) {
3087 			/*
3088 			 * we always have to disable PMRs or DMA may fail on
3089 			 * this device
3090 			 */
3091 			if (force_on)
3092 				iommu_disable_protect_mem_regions(iommu);
3093 			continue;
3094 		}
3095 
3096 		iommu_flush_write_buffer(iommu);
3097 		iommu_set_root_entry(iommu);
3098 		iommu_enable_translation(iommu);
3099 		iommu_disable_protect_mem_regions(iommu);
3100 	}
3101 
3102 	return 0;
3103 }
3104 
3105 static void iommu_flush_all(void)
3106 {
3107 	struct dmar_drhd_unit *drhd;
3108 	struct intel_iommu *iommu;
3109 
3110 	for_each_active_iommu(iommu, drhd) {
3111 		iommu->flush.flush_context(iommu, 0, 0, 0,
3112 					   DMA_CCMD_GLOBAL_INVL);
3113 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3114 					 DMA_TLB_GLOBAL_FLUSH);
3115 	}
3116 }
3117 
3118 static int iommu_suspend(void)
3119 {
3120 	struct dmar_drhd_unit *drhd;
3121 	struct intel_iommu *iommu = NULL;
3122 	unsigned long flag;
3123 
3124 	for_each_active_iommu(iommu, drhd) {
3125 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3126 					     GFP_KERNEL);
3127 		if (!iommu->iommu_state)
3128 			goto nomem;
3129 	}
3130 
3131 	iommu_flush_all();
3132 
3133 	for_each_active_iommu(iommu, drhd) {
3134 		iommu_disable_translation(iommu);
3135 
3136 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3137 
3138 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3139 			readl(iommu->reg + DMAR_FECTL_REG);
3140 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3141 			readl(iommu->reg + DMAR_FEDATA_REG);
3142 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3143 			readl(iommu->reg + DMAR_FEADDR_REG);
3144 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3145 			readl(iommu->reg + DMAR_FEUADDR_REG);
3146 
3147 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3148 	}
3149 	return 0;
3150 
3151 nomem:
3152 	for_each_active_iommu(iommu, drhd)
3153 		kfree(iommu->iommu_state);
3154 
3155 	return -ENOMEM;
3156 }
3157 
3158 static void iommu_resume(void)
3159 {
3160 	struct dmar_drhd_unit *drhd;
3161 	struct intel_iommu *iommu = NULL;
3162 	unsigned long flag;
3163 
3164 	if (init_iommu_hw()) {
3165 		if (force_on)
3166 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3167 		else
3168 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3169 		return;
3170 	}
3171 
3172 	for_each_active_iommu(iommu, drhd) {
3173 
3174 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3175 
3176 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3177 			iommu->reg + DMAR_FECTL_REG);
3178 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3179 			iommu->reg + DMAR_FEDATA_REG);
3180 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3181 			iommu->reg + DMAR_FEADDR_REG);
3182 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3183 			iommu->reg + DMAR_FEUADDR_REG);
3184 
3185 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3186 	}
3187 
3188 	for_each_active_iommu(iommu, drhd)
3189 		kfree(iommu->iommu_state);
3190 }
3191 
3192 static struct syscore_ops iommu_syscore_ops = {
3193 	.resume		= iommu_resume,
3194 	.suspend	= iommu_suspend,
3195 };
3196 
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
3201 
3202 #else
/* Without CONFIG_PM there are no suspend/resume hooks to register. */
static inline void init_iommu_pm_ops(void) {}
3204 #endif	/* CONFIG_PM */
3205 
3206 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3207 {
3208 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3209 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3210 	    rmrr->end_address <= rmrr->base_address ||
3211 	    arch_rmrr_sanity_check(rmrr))
3212 		return -EINVAL;
3213 
3214 	return 0;
3215 }
3216 
3217 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3218 {
3219 	struct acpi_dmar_reserved_memory *rmrr;
3220 	struct dmar_rmrr_unit *rmrru;
3221 
3222 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3223 	if (rmrr_sanity_check(rmrr)) {
3224 		pr_warn(FW_BUG
3225 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3226 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3227 			   rmrr->base_address, rmrr->end_address,
3228 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3229 			   dmi_get_system_info(DMI_BIOS_VERSION),
3230 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3231 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3232 	}
3233 
3234 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3235 	if (!rmrru)
3236 		goto out;
3237 
3238 	rmrru->hdr = header;
3239 
3240 	rmrru->base_address = rmrr->base_address;
3241 	rmrru->end_address = rmrr->end_address;
3242 
3243 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3244 				((void *)rmrr) + rmrr->header.length,
3245 				&rmrru->devices_cnt);
3246 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3247 		goto free_rmrru;
3248 
3249 	list_add(&rmrru->list, &dmar_rmrr_units);
3250 
3251 	return 0;
3252 free_rmrru:
3253 	kfree(rmrru);
3254 out:
3255 	return -ENOMEM;
3256 }
3257 
3258 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3259 {
3260 	struct dmar_atsr_unit *atsru;
3261 	struct acpi_dmar_atsr *tmp;
3262 
3263 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3264 				dmar_rcu_check()) {
3265 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3266 		if (atsr->segment != tmp->segment)
3267 			continue;
3268 		if (atsr->header.length != tmp->header.length)
3269 			continue;
3270 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3271 			return atsru;
3272 	}
3273 
3274 	return NULL;
3275 }
3276 
3277 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3278 {
3279 	struct acpi_dmar_atsr *atsr;
3280 	struct dmar_atsr_unit *atsru;
3281 
3282 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3283 		return 0;
3284 
3285 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3286 	atsru = dmar_find_atsr(atsr);
3287 	if (atsru)
3288 		return 0;
3289 
3290 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3291 	if (!atsru)
3292 		return -ENOMEM;
3293 
3294 	/*
3295 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3296 	 * copy the memory content because the memory buffer will be freed
3297 	 * on return.
3298 	 */
3299 	atsru->hdr = (void *)(atsru + 1);
3300 	memcpy(atsru->hdr, hdr, hdr->length);
3301 	atsru->include_all = atsr->flags & 0x1;
3302 	if (!atsru->include_all) {
3303 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3304 				(void *)atsr + atsr->header.length,
3305 				&atsru->devices_cnt);
3306 		if (atsru->devices_cnt && atsru->devices == NULL) {
3307 			kfree(atsru);
3308 			return -ENOMEM;
3309 		}
3310 	}
3311 
3312 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3313 
3314 	return 0;
3315 }
3316 
/* Release an ATSR unit: first its device-scope array, then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
3322 
3323 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3324 {
3325 	struct acpi_dmar_atsr *atsr;
3326 	struct dmar_atsr_unit *atsru;
3327 
3328 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3329 	atsru = dmar_find_atsr(atsr);
3330 	if (atsru) {
3331 		list_del_rcu(&atsru->list);
3332 		synchronize_rcu();
3333 		intel_iommu_free_atsr(atsru);
3334 	}
3335 
3336 	return 0;
3337 }
3338 
3339 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3340 {
3341 	int i;
3342 	struct device *dev;
3343 	struct acpi_dmar_atsr *atsr;
3344 	struct dmar_atsr_unit *atsru;
3345 
3346 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3347 	atsru = dmar_find_atsr(atsr);
3348 	if (!atsru)
3349 		return 0;
3350 
3351 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3352 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3353 					  i, dev)
3354 			return -EBUSY;
3355 	}
3356 
3357 	return 0;
3358 }
3359 
3360 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3361 {
3362 	struct dmar_satc_unit *satcu;
3363 	struct acpi_dmar_satc *tmp;
3364 
3365 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3366 				dmar_rcu_check()) {
3367 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3368 		if (satc->segment != tmp->segment)
3369 			continue;
3370 		if (satc->header.length != tmp->header.length)
3371 			continue;
3372 		if (memcmp(satc, tmp, satc->header.length) == 0)
3373 			return satcu;
3374 	}
3375 
3376 	return NULL;
3377 }
3378 
3379 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3380 {
3381 	struct acpi_dmar_satc *satc;
3382 	struct dmar_satc_unit *satcu;
3383 
3384 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3385 		return 0;
3386 
3387 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3388 	satcu = dmar_find_satc(satc);
3389 	if (satcu)
3390 		return 0;
3391 
3392 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3393 	if (!satcu)
3394 		return -ENOMEM;
3395 
3396 	satcu->hdr = (void *)(satcu + 1);
3397 	memcpy(satcu->hdr, hdr, hdr->length);
3398 	satcu->atc_required = satc->flags & 0x1;
3399 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3400 					      (void *)satc + satc->header.length,
3401 					      &satcu->devices_cnt);
3402 	if (satcu->devices_cnt && !satcu->devices) {
3403 		kfree(satcu);
3404 		return -ENOMEM;
3405 	}
3406 	list_add_rcu(&satcu->list, &dmar_satc_units);
3407 
3408 	return 0;
3409 }
3410 
3411 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3412 {
3413 	int sp, ret;
3414 	struct intel_iommu *iommu = dmaru->iommu;
3415 
3416 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3417 	if (ret)
3418 		goto out;
3419 
3420 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3421 		pr_warn("%s: Doesn't support hardware pass through.\n",
3422 			iommu->name);
3423 		return -ENXIO;
3424 	}
3425 
3426 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3427 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3428 		pr_warn("%s: Doesn't support large page.\n",
3429 			iommu->name);
3430 		return -ENXIO;
3431 	}
3432 
3433 	/*
3434 	 * Disable translation if already enabled prior to OS handover.
3435 	 */
3436 	if (iommu->gcmd & DMA_GCMD_TE)
3437 		iommu_disable_translation(iommu);
3438 
3439 	ret = iommu_init_domains(iommu);
3440 	if (ret == 0)
3441 		ret = iommu_alloc_root_entry(iommu);
3442 	if (ret)
3443 		goto out;
3444 
3445 	intel_svm_check(iommu);
3446 
3447 	if (dmaru->ignored) {
3448 		/*
3449 		 * we always have to disable PMRs or DMA may fail on this device
3450 		 */
3451 		if (force_on)
3452 			iommu_disable_protect_mem_regions(iommu);
3453 		return 0;
3454 	}
3455 
3456 	intel_iommu_init_qi(iommu);
3457 	iommu_flush_write_buffer(iommu);
3458 
3459 #ifdef CONFIG_INTEL_IOMMU_SVM
3460 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3461 		ret = intel_svm_enable_prq(iommu);
3462 		if (ret)
3463 			goto disable_iommu;
3464 	}
3465 #endif
3466 	ret = dmar_set_interrupt(iommu);
3467 	if (ret)
3468 		goto disable_iommu;
3469 
3470 	iommu_set_root_entry(iommu);
3471 	iommu_enable_translation(iommu);
3472 
3473 	iommu_disable_protect_mem_regions(iommu);
3474 	return 0;
3475 
3476 disable_iommu:
3477 	disable_dmar_iommu(iommu);
3478 out:
3479 	free_dmar_iommu(iommu);
3480 	return ret;
3481 }
3482 
3483 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3484 {
3485 	int ret = 0;
3486 	struct intel_iommu *iommu = dmaru->iommu;
3487 
3488 	if (!intel_iommu_enabled)
3489 		return 0;
3490 	if (iommu == NULL)
3491 		return -EINVAL;
3492 
3493 	if (insert) {
3494 		ret = intel_iommu_add(dmaru);
3495 	} else {
3496 		disable_dmar_iommu(iommu);
3497 		free_dmar_iommu(iommu);
3498 	}
3499 
3500 	return ret;
3501 }
3502 
3503 static void intel_iommu_free_dmars(void)
3504 {
3505 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3506 	struct dmar_atsr_unit *atsru, *atsr_n;
3507 	struct dmar_satc_unit *satcu, *satc_n;
3508 
3509 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3510 		list_del(&rmrru->list);
3511 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3512 		kfree(rmrru);
3513 	}
3514 
3515 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3516 		list_del(&atsru->list);
3517 		intel_iommu_free_atsr(atsru);
3518 	}
3519 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3520 		list_del(&satcu->list);
3521 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3522 		kfree(satcu);
3523 	}
3524 }
3525 
3526 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3527 {
3528 	struct dmar_satc_unit *satcu;
3529 	struct acpi_dmar_satc *satc;
3530 	struct device *tmp;
3531 	int i;
3532 
3533 	dev = pci_physfn(dev);
3534 	rcu_read_lock();
3535 
3536 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3537 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3538 		if (satc->segment != pci_domain_nr(dev->bus))
3539 			continue;
3540 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3541 			if (to_pci_dev(tmp) == dev)
3542 				goto out;
3543 	}
3544 	satcu = NULL;
3545 out:
3546 	rcu_read_unlock();
3547 	return satcu;
3548 }
3549 
3550 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3551 {
3552 	int i, ret = 1;
3553 	struct pci_bus *bus;
3554 	struct pci_dev *bridge = NULL;
3555 	struct device *tmp;
3556 	struct acpi_dmar_atsr *atsr;
3557 	struct dmar_atsr_unit *atsru;
3558 	struct dmar_satc_unit *satcu;
3559 
3560 	dev = pci_physfn(dev);
3561 	satcu = dmar_find_matched_satc_unit(dev);
3562 	if (satcu)
3563 		/*
3564 		 * This device supports ATS as it is in SATC table.
3565 		 * When IOMMU is in legacy mode, enabling ATS is done
3566 		 * automatically by HW for the device that requires
3567 		 * ATS, hence OS should not enable this device ATS
3568 		 * to avoid duplicated TLB invalidation.
3569 		 */
3570 		return !(satcu->atc_required && !sm_supported(iommu));
3571 
3572 	for (bus = dev->bus; bus; bus = bus->parent) {
3573 		bridge = bus->self;
3574 		/* If it's an integrated device, allow ATS */
3575 		if (!bridge)
3576 			return 1;
3577 		/* Connected via non-PCIe: no ATS */
3578 		if (!pci_is_pcie(bridge) ||
3579 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3580 			return 0;
3581 		/* If we found the root port, look it up in the ATSR */
3582 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3583 			break;
3584 	}
3585 
3586 	rcu_read_lock();
3587 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3588 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3589 		if (atsr->segment != pci_domain_nr(dev->bus))
3590 			continue;
3591 
3592 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3593 			if (tmp == &bridge->dev)
3594 				goto out;
3595 
3596 		if (atsru->include_all)
3597 			goto out;
3598 	}
3599 	ret = 0;
3600 out:
3601 	rcu_read_unlock();
3602 
3603 	return ret;
3604 }
3605 
3606 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3607 {
3608 	int ret;
3609 	struct dmar_rmrr_unit *rmrru;
3610 	struct dmar_atsr_unit *atsru;
3611 	struct dmar_satc_unit *satcu;
3612 	struct acpi_dmar_atsr *atsr;
3613 	struct acpi_dmar_reserved_memory *rmrr;
3614 	struct acpi_dmar_satc *satc;
3615 
3616 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3617 		return 0;
3618 
3619 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3620 		rmrr = container_of(rmrru->hdr,
3621 				    struct acpi_dmar_reserved_memory, header);
3622 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3623 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3624 				((void *)rmrr) + rmrr->header.length,
3625 				rmrr->segment, rmrru->devices,
3626 				rmrru->devices_cnt);
3627 			if (ret < 0)
3628 				return ret;
3629 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3630 			dmar_remove_dev_scope(info, rmrr->segment,
3631 				rmrru->devices, rmrru->devices_cnt);
3632 		}
3633 	}
3634 
3635 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3636 		if (atsru->include_all)
3637 			continue;
3638 
3639 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3640 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3641 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3642 					(void *)atsr + atsr->header.length,
3643 					atsr->segment, atsru->devices,
3644 					atsru->devices_cnt);
3645 			if (ret > 0)
3646 				break;
3647 			else if (ret < 0)
3648 				return ret;
3649 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3650 			if (dmar_remove_dev_scope(info, atsr->segment,
3651 					atsru->devices, atsru->devices_cnt))
3652 				break;
3653 		}
3654 	}
3655 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3656 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3657 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3658 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3659 					(void *)satc + satc->header.length,
3660 					satc->segment, satcu->devices,
3661 					satcu->devices_cnt);
3662 			if (ret > 0)
3663 				break;
3664 			else if (ret < 0)
3665 				return ret;
3666 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3667 			if (dmar_remove_dev_scope(info, satc->segment,
3668 					satcu->devices, satcu->devices_cnt))
3669 				break;
3670 		}
3671 	}
3672 
3673 	return 0;
3674 }
3675 
3676 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3677 				       unsigned long val, void *v)
3678 {
3679 	struct memory_notify *mhp = v;
3680 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3681 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3682 			mhp->nr_pages - 1);
3683 
3684 	switch (val) {
3685 	case MEM_GOING_ONLINE:
3686 		if (iommu_domain_identity_map(si_domain,
3687 					      start_vpfn, last_vpfn)) {
3688 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3689 				start_vpfn, last_vpfn);
3690 			return NOTIFY_BAD;
3691 		}
3692 		break;
3693 
3694 	case MEM_OFFLINE:
3695 	case MEM_CANCEL_ONLINE:
3696 		{
3697 			struct dmar_drhd_unit *drhd;
3698 			struct intel_iommu *iommu;
3699 			LIST_HEAD(freelist);
3700 
3701 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3702 
3703 			rcu_read_lock();
3704 			for_each_active_iommu(iommu, drhd)
3705 				iommu_flush_iotlb_psi(iommu, si_domain,
3706 					start_vpfn, mhp->nr_pages,
3707 					list_empty(&freelist), 0);
3708 			rcu_read_unlock();
3709 			put_pages_list(&freelist);
3710 		}
3711 		break;
3712 	}
3713 
3714 	return NOTIFY_OK;
3715 }
3716 
3717 static struct notifier_block intel_iommu_memory_nb = {
3718 	.notifier_call = intel_iommu_memory_notifier,
3719 	.priority = 0
3720 };
3721 
3722 static void intel_disable_iommus(void)
3723 {
3724 	struct intel_iommu *iommu = NULL;
3725 	struct dmar_drhd_unit *drhd;
3726 
3727 	for_each_iommu(iommu, drhd)
3728 		iommu_disable_translation(iommu);
3729 }
3730 
3731 void intel_iommu_shutdown(void)
3732 {
3733 	struct dmar_drhd_unit *drhd;
3734 	struct intel_iommu *iommu = NULL;
3735 
3736 	if (no_iommu || dmar_disabled)
3737 		return;
3738 
3739 	down_write(&dmar_global_lock);
3740 
3741 	/* Disable PMRs explicitly here. */
3742 	for_each_iommu(iommu, drhd)
3743 		iommu_disable_protect_mem_regions(iommu);
3744 
3745 	/* Make sure the IOMMUs are switched off */
3746 	intel_disable_iommus();
3747 
3748 	up_write(&dmar_global_lock);
3749 }
3750 
3751 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3752 {
3753 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3754 
3755 	return container_of(iommu_dev, struct intel_iommu, iommu);
3756 }
3757 
3758 static ssize_t version_show(struct device *dev,
3759 			    struct device_attribute *attr, char *buf)
3760 {
3761 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3762 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3763 	return sprintf(buf, "%d:%d\n",
3764 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3765 }
3766 static DEVICE_ATTR_RO(version);
3767 
3768 static ssize_t address_show(struct device *dev,
3769 			    struct device_attribute *attr, char *buf)
3770 {
3771 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3772 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3773 }
3774 static DEVICE_ATTR_RO(address);
3775 
3776 static ssize_t cap_show(struct device *dev,
3777 			struct device_attribute *attr, char *buf)
3778 {
3779 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3780 	return sprintf(buf, "%llx\n", iommu->cap);
3781 }
3782 static DEVICE_ATTR_RO(cap);
3783 
3784 static ssize_t ecap_show(struct device *dev,
3785 			 struct device_attribute *attr, char *buf)
3786 {
3787 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3788 	return sprintf(buf, "%llx\n", iommu->ecap);
3789 }
3790 static DEVICE_ATTR_RO(ecap);
3791 
3792 static ssize_t domains_supported_show(struct device *dev,
3793 				      struct device_attribute *attr, char *buf)
3794 {
3795 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3796 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3797 }
3798 static DEVICE_ATTR_RO(domains_supported);
3799 
3800 static ssize_t domains_used_show(struct device *dev,
3801 				 struct device_attribute *attr, char *buf)
3802 {
3803 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3804 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3805 						  cap_ndoms(iommu->cap)));
3806 }
3807 static DEVICE_ATTR_RO(domains_used);
3808 
/* NULL-terminated list of the read-only sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3818 
/* Groups the attributes above under a directory named "intel-iommu". */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3823 
/* NULL-terminated group list passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3828 
3829 static inline bool has_external_pci(void)
3830 {
3831 	struct pci_dev *pdev = NULL;
3832 
3833 	for_each_pci_dev(pdev)
3834 		if (pdev->external_facing) {
3835 			pci_dev_put(pdev);
3836 			return true;
3837 		}
3838 
3839 	return false;
3840 }
3841 
3842 static int __init platform_optin_force_iommu(void)
3843 {
3844 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3845 		return 0;
3846 
3847 	if (no_iommu || dmar_disabled)
3848 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3849 
3850 	/*
3851 	 * If Intel-IOMMU is disabled by default, we will apply identity
3852 	 * map for all devices except those marked as being untrusted.
3853 	 */
3854 	if (dmar_disabled)
3855 		iommu_set_default_passthrough(false);
3856 
3857 	dmar_disabled = 0;
3858 	no_iommu = 0;
3859 
3860 	return 1;
3861 }
3862 
3863 static int __init probe_acpi_namespace_devices(void)
3864 {
3865 	struct dmar_drhd_unit *drhd;
3866 	/* To avoid a -Wunused-but-set-variable warning. */
3867 	struct intel_iommu *iommu __maybe_unused;
3868 	struct device *dev;
3869 	int i, ret = 0;
3870 
3871 	for_each_active_iommu(iommu, drhd) {
3872 		for_each_active_dev_scope(drhd->devices,
3873 					  drhd->devices_cnt, i, dev) {
3874 			struct acpi_device_physical_node *pn;
3875 			struct iommu_group *group;
3876 			struct acpi_device *adev;
3877 
3878 			if (dev->bus != &acpi_bus_type)
3879 				continue;
3880 
3881 			adev = to_acpi_device(dev);
3882 			mutex_lock(&adev->physical_node_lock);
3883 			list_for_each_entry(pn,
3884 					    &adev->physical_node_list, node) {
3885 				group = iommu_group_get(pn->dev);
3886 				if (group) {
3887 					iommu_group_put(group);
3888 					continue;
3889 				}
3890 
3891 				ret = iommu_probe_device(pn->dev);
3892 				if (ret)
3893 					break;
3894 			}
3895 			mutex_unlock(&adev->physical_node_lock);
3896 
3897 			if (ret)
3898 				return ret;
3899 		}
3900 	}
3901 
3902 	return 0;
3903 }
3904 
3905 static __init int tboot_force_iommu(void)
3906 {
3907 	if (!tboot_enabled())
3908 		return 0;
3909 
3910 	if (no_iommu || dmar_disabled)
3911 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3912 
3913 	dmar_disabled = 0;
3914 	no_iommu = 0;
3915 
3916 	return 1;
3917 }
3918 
/*
 * Driver entry point: parse the DMAR table and device scopes, initialise
 * the DMA remapping units, register them with the IOMMU core and sysfs,
 * and finally enable translation.
 *
 * Returns 0 on success.  On failure the parsed RMRR/ATSR/SATC units are
 * freed and the value of 'ret' is returned: -ENODEV for the early exits,
 * or the error from init_dmars().  When force_on is set, the table,
 * device-scope and init_dmars() failures panic instead.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	/* Registered with the global lock dropped. */
	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of the two calls below are ignored. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* The memory notifier is only registered for a software si_domain. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Every jump to this label holds dmar_global_lock for writing. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
4053 
4054 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4055 {
4056 	struct device_domain_info *info = opaque;
4057 
4058 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4059 	return 0;
4060 }
4061 
4062 /*
4063  * NB - intel-iommu lacks any sort of reference counting for the users of
4064  * dependent devices.  If multiple endpoints have intersecting dependent
4065  * devices, unbinding the driver from any one of them will possibly leave
4066  * the others unable to operate.
4067  */
4068 static void domain_context_clear(struct device_domain_info *info)
4069 {
4070 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4071 		return;
4072 
4073 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4074 			       &domain_context_clear_one_cb, info);
4075 }
4076 
4077 static void dmar_remove_one_dev_info(struct device *dev)
4078 {
4079 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4080 	struct dmar_domain *domain = info->domain;
4081 	struct intel_iommu *iommu = info->iommu;
4082 	unsigned long flags;
4083 
4084 	if (!dev_is_real_dma_subdevice(info->dev)) {
4085 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4086 			intel_pasid_tear_down_entry(iommu, info->dev,
4087 					PASID_RID2PASID, false);
4088 
4089 		iommu_disable_pci_caps(info);
4090 		domain_context_clear(info);
4091 	}
4092 
4093 	spin_lock_irqsave(&domain->lock, flags);
4094 	list_del(&info->link);
4095 	spin_unlock_irqrestore(&domain->lock, flags);
4096 
4097 	domain_detach_iommu(domain, iommu);
4098 	info->domain = NULL;
4099 }
4100 
4101 /*
4102  * Clear the page table pointer in context or pasid table entries so that
4103  * all DMA requests without PASID from the device are blocked. If the page
4104  * table has been set, clean up the data structures.
4105  */
4106 static void device_block_translation(struct device *dev)
4107 {
4108 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4109 	struct intel_iommu *iommu = info->iommu;
4110 	unsigned long flags;
4111 
4112 	iommu_disable_pci_caps(info);
4113 	if (!dev_is_real_dma_subdevice(dev)) {
4114 		if (sm_supported(iommu))
4115 			intel_pasid_tear_down_entry(iommu, dev,
4116 						    PASID_RID2PASID, false);
4117 		else
4118 			domain_context_clear(info);
4119 	}
4120 
4121 	if (!info->domain)
4122 		return;
4123 
4124 	spin_lock_irqsave(&info->domain->lock, flags);
4125 	list_del(&info->link);
4126 	spin_unlock_irqrestore(&info->domain->lock, flags);
4127 
4128 	domain_detach_iommu(info->domain, iommu);
4129 	info->domain = NULL;
4130 }
4131 
4132 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4133 {
4134 	int adjust_width;
4135 
4136 	/* calculate AGAW */
4137 	domain->gaw = guest_width;
4138 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4139 	domain->agaw = width_to_agaw(adjust_width);
4140 
4141 	domain->iommu_coherency = false;
4142 	domain->iommu_superpage = 0;
4143 	domain->max_addr = 0;
4144 
4145 	/* always allocate the top pgd */
4146 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4147 	if (!domain->pgd)
4148 		return -ENOMEM;
4149 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4150 	return 0;
4151 }
4152 
/*
 * attach_dev handler of the blocking domain: blocks translation for @dev.
 * The @domain argument is unused.  Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);
	return 0;
}
4159 
/*
 * Single static domain object returned for IOMMU_DOMAIN_BLOCKED requests.
 * It is never freed: intel_iommu_domain_free() skips it.
 */
static struct iommu_domain blocking_domain = {
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev	= blocking_domain_attach_dev,
		.free		= intel_iommu_domain_free
	}
};
4166 
4167 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4168 {
4169 	struct dmar_domain *dmar_domain;
4170 	struct iommu_domain *domain;
4171 
4172 	switch (type) {
4173 	case IOMMU_DOMAIN_BLOCKED:
4174 		return &blocking_domain;
4175 	case IOMMU_DOMAIN_DMA:
4176 	case IOMMU_DOMAIN_DMA_FQ:
4177 	case IOMMU_DOMAIN_UNMANAGED:
4178 		dmar_domain = alloc_domain(type);
4179 		if (!dmar_domain) {
4180 			pr_err("Can't allocate dmar_domain\n");
4181 			return NULL;
4182 		}
4183 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4184 			pr_err("Domain initialization failed\n");
4185 			domain_exit(dmar_domain);
4186 			return NULL;
4187 		}
4188 
4189 		domain = &dmar_domain->domain;
4190 		domain->geometry.aperture_start = 0;
4191 		domain->geometry.aperture_end   =
4192 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4193 		domain->geometry.force_aperture = true;
4194 
4195 		return domain;
4196 	case IOMMU_DOMAIN_IDENTITY:
4197 		return &si_domain->domain;
4198 	case IOMMU_DOMAIN_SVA:
4199 		return intel_svm_domain_alloc();
4200 	default:
4201 		return NULL;
4202 	}
4203 
4204 	return NULL;
4205 }
4206 
4207 static void intel_iommu_domain_free(struct iommu_domain *domain)
4208 {
4209 	if (domain != &si_domain->domain && domain != &blocking_domain)
4210 		domain_exit(to_dmar_domain(domain));
4211 }
4212 
4213 static int prepare_domain_attach_device(struct iommu_domain *domain,
4214 					struct device *dev)
4215 {
4216 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4217 	struct intel_iommu *iommu;
4218 	int addr_width;
4219 
4220 	iommu = device_to_iommu(dev, NULL, NULL);
4221 	if (!iommu)
4222 		return -ENODEV;
4223 
4224 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4225 		return -EINVAL;
4226 
4227 	/* check if this iommu agaw is sufficient for max mapped address */
4228 	addr_width = agaw_to_width(iommu->agaw);
4229 	if (addr_width > cap_mgaw(iommu->cap))
4230 		addr_width = cap_mgaw(iommu->cap);
4231 
4232 	if (dmar_domain->max_addr > (1LL << addr_width))
4233 		return -EINVAL;
4234 	dmar_domain->gaw = addr_width;
4235 
4236 	/*
4237 	 * Knock out extra levels of page tables if necessary
4238 	 */
4239 	while (iommu->agaw < dmar_domain->agaw) {
4240 		struct dma_pte *pte;
4241 
4242 		pte = dmar_domain->pgd;
4243 		if (dma_pte_present(pte)) {
4244 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4245 			free_pgtable_page(pte);
4246 		}
4247 		dmar_domain->agaw--;
4248 	}
4249 
4250 	return 0;
4251 }
4252 
4253 static int intel_iommu_attach_device(struct iommu_domain *domain,
4254 				     struct device *dev)
4255 {
4256 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4257 	int ret;
4258 
4259 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4260 	    device_is_rmrr_locked(dev)) {
4261 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4262 		return -EPERM;
4263 	}
4264 
4265 	if (info->domain)
4266 		device_block_translation(dev);
4267 
4268 	ret = prepare_domain_attach_device(domain, dev);
4269 	if (ret)
4270 		return ret;
4271 
4272 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4273 }
4274 
4275 static int intel_iommu_map(struct iommu_domain *domain,
4276 			   unsigned long iova, phys_addr_t hpa,
4277 			   size_t size, int iommu_prot, gfp_t gfp)
4278 {
4279 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4280 	u64 max_addr;
4281 	int prot = 0;
4282 
4283 	if (iommu_prot & IOMMU_READ)
4284 		prot |= DMA_PTE_READ;
4285 	if (iommu_prot & IOMMU_WRITE)
4286 		prot |= DMA_PTE_WRITE;
4287 	if (dmar_domain->set_pte_snp)
4288 		prot |= DMA_PTE_SNP;
4289 
4290 	max_addr = iova + size;
4291 	if (dmar_domain->max_addr < max_addr) {
4292 		u64 end;
4293 
4294 		/* check if minimum agaw is sufficient for mapped address */
4295 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4296 		if (end < max_addr) {
4297 			pr_err("%s: iommu width (%d) is not "
4298 			       "sufficient for the mapped address (%llx)\n",
4299 			       __func__, dmar_domain->gaw, max_addr);
4300 			return -EFAULT;
4301 		}
4302 		dmar_domain->max_addr = max_addr;
4303 	}
4304 	/* Round up size to next multiple of PAGE_SIZE, if it and
4305 	   the low bits of hpa would take us onto the next page */
4306 	size = aligned_nrpages(hpa, size);
4307 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4308 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4309 }
4310 
4311 static int intel_iommu_map_pages(struct iommu_domain *domain,
4312 				 unsigned long iova, phys_addr_t paddr,
4313 				 size_t pgsize, size_t pgcount,
4314 				 int prot, gfp_t gfp, size_t *mapped)
4315 {
4316 	unsigned long pgshift = __ffs(pgsize);
4317 	size_t size = pgcount << pgshift;
4318 	int ret;
4319 
4320 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4321 		return -EINVAL;
4322 
4323 	if (!IS_ALIGNED(iova | paddr, pgsize))
4324 		return -EINVAL;
4325 
4326 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4327 	if (!ret && mapped)
4328 		*mapped = size;
4329 
4330 	return ret;
4331 }
4332 
4333 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4334 				unsigned long iova, size_t size,
4335 				struct iommu_iotlb_gather *gather)
4336 {
4337 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4338 	unsigned long start_pfn, last_pfn;
4339 	int level = 0;
4340 
4341 	/* Cope with horrid API which requires us to unmap more than the
4342 	   size argument if it happens to be a large-page mapping. */
4343 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4344 			       GFP_ATOMIC));
4345 
4346 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4347 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4348 
4349 	start_pfn = iova >> VTD_PAGE_SHIFT;
4350 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4351 
4352 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4353 
4354 	if (dmar_domain->max_addr == iova + size)
4355 		dmar_domain->max_addr = iova;
4356 
4357 	/*
4358 	 * We do not use page-selective IOTLB invalidation in flush queue,
4359 	 * so there is no need to track page and sync iotlb.
4360 	 */
4361 	if (!iommu_iotlb_gather_queued(gather))
4362 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4363 
4364 	return size;
4365 }
4366 
4367 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4368 				      unsigned long iova,
4369 				      size_t pgsize, size_t pgcount,
4370 				      struct iommu_iotlb_gather *gather)
4371 {
4372 	unsigned long pgshift = __ffs(pgsize);
4373 	size_t size = pgcount << pgshift;
4374 
4375 	return intel_iommu_unmap(domain, iova, size, gather);
4376 }
4377 
4378 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4379 				 struct iommu_iotlb_gather *gather)
4380 {
4381 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4382 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4383 	size_t size = gather->end - gather->start;
4384 	struct iommu_domain_info *info;
4385 	unsigned long start_pfn;
4386 	unsigned long nrpages;
4387 	unsigned long i;
4388 
4389 	nrpages = aligned_nrpages(gather->start, size);
4390 	start_pfn = mm_to_dma_pfn(iova_pfn);
4391 
4392 	xa_for_each(&dmar_domain->iommu_array, i, info)
4393 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4394 				      start_pfn, nrpages,
4395 				      list_empty(&gather->freelist), 0);
4396 
4397 	put_pages_list(&gather->freelist);
4398 }
4399 
4400 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4401 					    dma_addr_t iova)
4402 {
4403 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4404 	struct dma_pte *pte;
4405 	int level = 0;
4406 	u64 phys = 0;
4407 
4408 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4409 			     GFP_ATOMIC);
4410 	if (pte && dma_pte_present(pte))
4411 		phys = dma_pte_addr(pte) +
4412 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4413 						VTD_PAGE_SHIFT) - 1));
4414 
4415 	return phys;
4416 }
4417 
4418 static bool domain_support_force_snooping(struct dmar_domain *domain)
4419 {
4420 	struct device_domain_info *info;
4421 	bool support = true;
4422 
4423 	assert_spin_locked(&domain->lock);
4424 	list_for_each_entry(info, &domain->devices, link) {
4425 		if (!ecap_sc_support(info->iommu->ecap)) {
4426 			support = false;
4427 			break;
4428 		}
4429 	}
4430 
4431 	return support;
4432 }
4433 
4434 static void domain_set_force_snooping(struct dmar_domain *domain)
4435 {
4436 	struct device_domain_info *info;
4437 
4438 	assert_spin_locked(&domain->lock);
4439 	/*
4440 	 * Second level page table supports per-PTE snoop control. The
4441 	 * iommu_map() interface will handle this by setting SNP bit.
4442 	 */
4443 	if (!domain->use_first_level) {
4444 		domain->set_pte_snp = true;
4445 		return;
4446 	}
4447 
4448 	list_for_each_entry(info, &domain->devices, link)
4449 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4450 						     PASID_RID2PASID);
4451 }
4452 
4453 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4454 {
4455 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4456 	unsigned long flags;
4457 
4458 	if (dmar_domain->force_snooping)
4459 		return true;
4460 
4461 	spin_lock_irqsave(&dmar_domain->lock, flags);
4462 	if (!domain_support_force_snooping(dmar_domain)) {
4463 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4464 		return false;
4465 	}
4466 
4467 	domain_set_force_snooping(dmar_domain);
4468 	dmar_domain->force_snooping = true;
4469 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4470 
4471 	return true;
4472 }
4473 
4474 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4475 {
4476 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4477 
4478 	switch (cap) {
4479 	case IOMMU_CAP_CACHE_COHERENCY:
4480 		return true;
4481 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4482 		return dmar_platform_optin();
4483 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4484 		return ecap_sc_support(info->iommu->ecap);
4485 	default:
4486 		return false;
4487 	}
4488 }
4489 
4490 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4491 {
4492 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4493 	struct device_domain_info *info;
4494 	struct intel_iommu *iommu;
4495 	u8 bus, devfn;
4496 	int ret;
4497 
4498 	iommu = device_to_iommu(dev, &bus, &devfn);
4499 	if (!iommu || !iommu->iommu.ops)
4500 		return ERR_PTR(-ENODEV);
4501 
4502 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4503 	if (!info)
4504 		return ERR_PTR(-ENOMEM);
4505 
4506 	if (dev_is_real_dma_subdevice(dev)) {
4507 		info->bus = pdev->bus->number;
4508 		info->devfn = pdev->devfn;
4509 		info->segment = pci_domain_nr(pdev->bus);
4510 	} else {
4511 		info->bus = bus;
4512 		info->devfn = devfn;
4513 		info->segment = iommu->segment;
4514 	}
4515 
4516 	info->dev = dev;
4517 	info->iommu = iommu;
4518 	if (dev_is_pci(dev)) {
4519 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4520 		    pci_ats_supported(pdev) &&
4521 		    dmar_ats_supported(pdev, iommu)) {
4522 			info->ats_supported = 1;
4523 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4524 		}
4525 		if (sm_supported(iommu)) {
4526 			if (pasid_supported(iommu)) {
4527 				int features = pci_pasid_features(pdev);
4528 
4529 				if (features >= 0)
4530 					info->pasid_supported = features | 1;
4531 			}
4532 
4533 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4534 			    pci_pri_supported(pdev))
4535 				info->pri_supported = 1;
4536 		}
4537 	}
4538 
4539 	dev_iommu_priv_set(dev, info);
4540 
4541 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4542 		ret = intel_pasid_alloc_table(dev);
4543 		if (ret) {
4544 			dev_err(dev, "PASID table allocation failed\n");
4545 			dev_iommu_priv_set(dev, NULL);
4546 			kfree(info);
4547 			return ERR_PTR(ret);
4548 		}
4549 	}
4550 
4551 	return &iommu->iommu;
4552 }
4553 
4554 static void intel_iommu_release_device(struct device *dev)
4555 {
4556 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4557 
4558 	dmar_remove_one_dev_info(dev);
4559 	intel_pasid_free_table(dev);
4560 	dev_iommu_priv_set(dev, NULL);
4561 	kfree(info);
4562 	set_dma_ops(dev, NULL);
4563 }
4564 
/*
 * Final probe step for @dev: clear any DMA ops currently installed on the
 * device, then call iommu_setup_dma_ops() with the range 0..U64_MAX.
 */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4570 
4571 static void intel_iommu_get_resv_regions(struct device *device,
4572 					 struct list_head *head)
4573 {
4574 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4575 	struct iommu_resv_region *reg;
4576 	struct dmar_rmrr_unit *rmrr;
4577 	struct device *i_dev;
4578 	int i;
4579 
4580 	rcu_read_lock();
4581 	for_each_rmrr_units(rmrr) {
4582 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4583 					  i, i_dev) {
4584 			struct iommu_resv_region *resv;
4585 			enum iommu_resv_type type;
4586 			size_t length;
4587 
4588 			if (i_dev != device &&
4589 			    !is_downstream_to_pci_bridge(device, i_dev))
4590 				continue;
4591 
4592 			length = rmrr->end_address - rmrr->base_address + 1;
4593 
4594 			type = device_rmrr_is_relaxable(device) ?
4595 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4596 
4597 			resv = iommu_alloc_resv_region(rmrr->base_address,
4598 						       length, prot, type,
4599 						       GFP_ATOMIC);
4600 			if (!resv)
4601 				break;
4602 
4603 			list_add_tail(&resv->list, head);
4604 		}
4605 	}
4606 	rcu_read_unlock();
4607 
4608 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4609 	if (dev_is_pci(device)) {
4610 		struct pci_dev *pdev = to_pci_dev(device);
4611 
4612 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4613 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4614 					IOMMU_RESV_DIRECT_RELAXABLE,
4615 					GFP_KERNEL);
4616 			if (reg)
4617 				list_add_tail(&reg->list, head);
4618 		}
4619 	}
4620 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4621 
4622 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4623 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4624 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4625 	if (!reg)
4626 		return;
4627 	list_add_tail(&reg->list, head);
4628 }
4629 
/*
 * Pick the iommu_group for @dev: PCI devices use pci_device_group(), all
 * other devices use generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4636 
4637 static int intel_iommu_enable_sva(struct device *dev)
4638 {
4639 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 	struct intel_iommu *iommu;
4641 	int ret;
4642 
4643 	if (!info || dmar_disabled)
4644 		return -EINVAL;
4645 
4646 	iommu = info->iommu;
4647 	if (!iommu)
4648 		return -EINVAL;
4649 
4650 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4651 		return -ENODEV;
4652 
4653 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4654 		return -EINVAL;
4655 
4656 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4657 	if (ret)
4658 		return ret;
4659 
4660 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4661 	if (ret)
4662 		iopf_queue_remove_device(iommu->iopf_queue, dev);
4663 
4664 	return ret;
4665 }
4666 
4667 static int intel_iommu_disable_sva(struct device *dev)
4668 {
4669 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4670 	struct intel_iommu *iommu = info->iommu;
4671 	int ret;
4672 
4673 	ret = iommu_unregister_device_fault_handler(dev);
4674 	if (ret)
4675 		return ret;
4676 
4677 	ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4678 	if (ret)
4679 		iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4680 
4681 	return ret;
4682 }
4683 
4684 static int intel_iommu_enable_iopf(struct device *dev)
4685 {
4686 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4687 
4688 	if (info && info->pri_supported)
4689 		return 0;
4690 
4691 	return -ENODEV;
4692 }
4693 
4694 static int
4695 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4696 {
4697 	switch (feat) {
4698 	case IOMMU_DEV_FEAT_IOPF:
4699 		return intel_iommu_enable_iopf(dev);
4700 
4701 	case IOMMU_DEV_FEAT_SVA:
4702 		return intel_iommu_enable_sva(dev);
4703 
4704 	default:
4705 		return -ENODEV;
4706 	}
4707 }
4708 
4709 static int
4710 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4711 {
4712 	switch (feat) {
4713 	case IOMMU_DEV_FEAT_IOPF:
4714 		return 0;
4715 
4716 	case IOMMU_DEV_FEAT_SVA:
4717 		return intel_iommu_disable_sva(dev);
4718 
4719 	default:
4720 		return -ENODEV;
4721 	}
4722 }
4723 
4724 static bool intel_iommu_is_attach_deferred(struct device *dev)
4725 {
4726 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4727 
4728 	return translation_pre_enabled(info->iommu) && !info->domain;
4729 }
4730 
4731 /*
4732  * Check that the device does not live on an external facing PCI port that is
4733  * marked as untrusted. Such devices should not be able to apply quirks and
4734  * thus not be able to bypass the IOMMU restrictions.
4735  */
4736 static bool risky_device(struct pci_dev *pdev)
4737 {
4738 	if (pdev->untrusted) {
4739 		pci_info(pdev,
4740 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4741 			 pdev->vendor, pdev->device);
4742 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4743 		return true;
4744 	}
4745 	return false;
4746 }
4747 
4748 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4749 				       unsigned long iova, size_t size)
4750 {
4751 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4752 	unsigned long pages = aligned_nrpages(iova, size);
4753 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4754 	struct iommu_domain_info *info;
4755 	unsigned long i;
4756 
4757 	xa_for_each(&dmar_domain->iommu_array, i, info)
4758 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4759 }
4760 
4761 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4762 {
4763 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4764 	struct iommu_domain *domain;
4765 
4766 	/* Domain type specific cleanup: */
4767 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4768 	if (domain) {
4769 		switch (domain->type) {
4770 		case IOMMU_DOMAIN_SVA:
4771 			intel_svm_remove_dev_pasid(dev, pasid);
4772 			break;
4773 		default:
4774 			/* should never reach here */
4775 			WARN_ON(1);
4776 			break;
4777 		}
4778 	}
4779 
4780 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4781 }
4782 
/*
 * IOMMU core callbacks implemented by this driver. The outer table holds
 * the device-level operations; default_domain_ops holds the per-domain
 * operations.
 */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	/*
	 * Only 4K is set here although intel_iommu_map_pages() also accepts
	 * 2M and 1G; presumably the bitmap is widened elsewhere - TODO confirm.
	 */
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	/* Page request responses are only wired up when SVM is built in. */
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4812 
4813 static void quirk_iommu_igfx(struct pci_dev *dev)
4814 {
4815 	if (risky_device(dev))
4816 		return;
4817 
4818 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4819 	dmar_map_gfx = 0;
4820 }
4821 
/*
 * Register quirk_iommu_igfx as a PCI header fixup for the Intel device IDs
 * listed below.
 *
 * G4x/GM45 integrated gfx dmar support is totally busted.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4856 
4857 static void quirk_iommu_rwbf(struct pci_dev *dev)
4858 {
4859 	if (risky_device(dev))
4860 		return;
4861 
4862 	/*
4863 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4864 	 * but needs it. Same seems to hold for the desktop versions.
4865 	 */
4866 	pci_info(dev, "Forcing write-buffer flush capability\n");
4867 	rwbf_quirk = 1;
4868 }
4869 
/*
 * Register quirk_iommu_rwbf as a PCI header fixup for these Intel device
 * IDs (the same IDs as the first quirk_iommu_igfx group).
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4877 
/*
 * PCI config-space offset of the 16-bit GGC word read by
 * quirk_calpella_no_shadow_gtt(), and the values of its bits 11:8.
 * NOTE(review): GGC presumably stands for the graphics control register -
 * confirm against the chipset datasheet.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
/* Bit 11 set: the *_VT encodings below all include this bit. */
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4887 
4888 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4889 {
4890 	unsigned short ggc;
4891 
4892 	if (risky_device(dev))
4893 		return;
4894 
4895 	if (pci_read_config_word(dev, GGC, &ggc))
4896 		return;
4897 
4898 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4899 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4900 		dmar_map_gfx = 0;
4901 	} else if (dmar_map_gfx) {
4902 		/* we have to ensure the gfx device is idle before we flush */
4903 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4904 		iommu_set_dma_strict();
4905 	}
4906 }
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4911 
4912 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4913 {
4914 	unsigned short ver;
4915 
4916 	if (!IS_GFX_DEVICE(dev))
4917 		return;
4918 
4919 	ver = (dev->device >> 8) & 0xff;
4920 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4921 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4922 	    ver != 0x9a && ver != 0xa7)
4923 		return;
4924 
4925 	if (risky_device(dev))
4926 		return;
4927 
4928 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4929 	iommu_skip_te_disable = 1;
4930 }
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4932 
4933 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4934    ISOCH DMAR unit for the Azalia sound device, but not give it any
4935    TLB entries, which causes it to deadlock. Check for that.  We do
4936    this in a function called from init_dmars(), instead of in a PCI
4937    quirk, because we don't want to print the obnoxious "BIOS broken"
4938    message if VT-d is actually disabled.
4939 */
4940 static void __init check_tylersburg_isoch(void)
4941 {
4942 	struct pci_dev *pdev;
4943 	uint32_t vtisochctrl;
4944 
4945 	/* If there's no Azalia in the system anyway, forget it. */
4946 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4947 	if (!pdev)
4948 		return;
4949 
4950 	if (risky_device(pdev)) {
4951 		pci_dev_put(pdev);
4952 		return;
4953 	}
4954 
4955 	pci_dev_put(pdev);
4956 
4957 	/* System Management Registers. Might be hidden, in which case
4958 	   we can't do the sanity check. But that's OK, because the
4959 	   known-broken BIOSes _don't_ actually hide it, so far. */
4960 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4961 	if (!pdev)
4962 		return;
4963 
4964 	if (risky_device(pdev)) {
4965 		pci_dev_put(pdev);
4966 		return;
4967 	}
4968 
4969 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4970 		pci_dev_put(pdev);
4971 		return;
4972 	}
4973 
4974 	pci_dev_put(pdev);
4975 
4976 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4977 	if (vtisochctrl & 1)
4978 		return;
4979 
4980 	/* Drop all bits other than the number of TLB entries */
4981 	vtisochctrl &= 0x1c;
4982 
4983 	/* If we have the recommended number of TLB entries (16), fine. */
4984 	if (vtisochctrl == 0x10)
4985 		return;
4986 
4987 	/* Zero TLB entries? You get to ride the short bus to school. */
4988 	if (!vtisochctrl) {
4989 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4990 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4991 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4992 		     dmi_get_system_info(DMI_BIOS_VERSION),
4993 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4994 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4995 		return;
4996 	}
4997 
4998 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4999 	       vtisochctrl);
5000 }
5001 
5002 /*
5003  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5004  * invalidation completion before posted writes initiated with translated address
5005  * that utilized translations matching the invalidation address range, violating
5006  * the invalidation completion ordering.
5007  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5008  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5009  * under the control of the trusted/privileged host device driver must use this
5010  * quirk.
5011  * Device TLBs are invalidated under the following six conditions:
5012  * 1. Device driver does DMA API unmap IOVA
5013  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5014  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5015  *    exit_mmap() due to crash
5016  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5017  *    VM has to free pages that were unmapped
5018  * 5. Userspace driver unmaps a DMA buffer
5019  * 6. Cache invalidation in vSVA usage (upcoming)
5020  *
5021  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5022  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5023  * invalidate TLB the same way as normal user unmap which will use this quirk.
5024  * The dTLB invalidation after PASID cache flush does not need this quirk.
5025  *
5026  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5027  */
5028 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5029 			       unsigned long address, unsigned long mask,
5030 			       u32 pasid, u16 qdep)
5031 {
5032 	u16 sid;
5033 
5034 	if (likely(!info->dtlb_extra_inval))
5035 		return;
5036 
5037 	sid = PCI_DEVID(info->bus, info->devfn);
5038 	if (pasid == PASID_RID2PASID) {
5039 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5040 				   qdep, address, mask);
5041 	} else {
5042 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5043 					 pasid, qdep, address, mask);
5044 	}
5045 }
5046 
/* Extract the status code held in bits 7:1 of an enhanced command response. */
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5048 
5049 /*
5050  * Function to submit a command to the enhanced command interface. The
5051  * valid enhanced command descriptions are defined in Table 47 of the
5052  * VT-d spec. The VT-d hardware implementation may support some but not
5053  * all commands, which can be determined by checking the Enhanced
5054  * Command Capability Register.
5055  *
5056  * Return values:
5057  *  - 0: Command successful without any error;
5058  *  - Negative: software error value;
5059  *  - Nonzero positive: failure status code defined in Table 48.
5060  */
5061 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5062 {
5063 	unsigned long flags;
5064 	u64 res;
5065 	int ret;
5066 
5067 	if (!cap_ecmds(iommu->cap))
5068 		return -ENODEV;
5069 
5070 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5071 
5072 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5073 	if (res & DMA_ECMD_ECRSP_IP) {
5074 		ret = -EBUSY;
5075 		goto err;
5076 	}
5077 
5078 	/*
5079 	 * Unconditionally write the operand B, because
5080 	 * - There is no side effect if an ecmd doesn't require an
5081 	 *   operand B, but we set the register to some value.
5082 	 * - It's not invoked in any critical path. The extra MMIO
5083 	 *   write doesn't bring any performance concerns.
5084 	 */
5085 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5086 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5087 
5088 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5089 		      !(res & DMA_ECMD_ECRSP_IP), res);
5090 
5091 	if (res & DMA_ECMD_ECRSP_IP) {
5092 		ret = -ETIMEDOUT;
5093 		goto err;
5094 	}
5095 
5096 	ret = ecmd_get_status_code(res);
5097 err:
5098 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5099 
5100 	return ret;
5101 }
5102