xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 95f7a972194ad20696c36523b54c19a3567e0697)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
58 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60 
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN		(1)
63 
64 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
65 
66 /* page table handling */
67 #define LEVEL_STRIDE		(9)
68 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69 
70 static inline int agaw_to_level(int agaw)
71 {
72 	return agaw + 2;
73 }
74 
75 static inline int agaw_to_width(int agaw)
76 {
77 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79 
80 static inline int width_to_agaw(int width)
81 {
82 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
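/*
 * Illustrative note on the AGAW arithmetic above: AGAW n corresponds to an
 * (n + 2)-level page table covering 30 + 9 * n bits of address space, so
 * AGAW 1 = 3-level/39-bit, AGAW 2 = 4-level/48-bit and AGAW 3 = 5-level/
 * 57-bit. For example, width_to_agaw(48) returns 2 and agaw_to_level(2)
 * returns 4.
 */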
84 
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 	return (level - 1) * LEVEL_STRIDE;
88 }
89 
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94 
95 static inline u64 level_mask(int level)
96 {
97 	return -1ULL << level_to_offset_bits(level);
98 }
99 
100 static inline u64 level_size(int level)
101 {
102 	return 1ULL << level_to_offset_bits(level);
103 }
104 
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 	return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109 
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
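/*
 * Illustrative note on the level helpers: with a 9-bit stride and 4KiB
 * VT-d pages, a leaf PTE maps 4KiB at level 1, 2MiB at level 2 (superpage)
 * and 1GiB at level 3. For example, level_size(2) is 512, so
 * lvl_to_nr_pages(2) reports that a level-2 superpage spans 512 base pages,
 * and align_to_level(pfn, 2) rounds pfn up to a 2MiB boundary.
 */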
114 
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123 	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
126 {
127 	return mm_to_dma_pfn_start(page_to_pfn(pg));
128 }
129 static inline unsigned long virt_to_dma_pfn(void *p)
130 {
131 	return page_to_dma_pfn(virt_to_page(p));
132 }
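/*
 * Note: with 4KiB kernel pages (the x86 default) PAGE_SHIFT equals
 * VTD_PAGE_SHIFT, so the mm-to-DMA pfn conversions above are effectively
 * 1:1; they only expand a range when the kernel page size is larger than
 * the 4KiB VT-d page.
 */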
133 
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136 
137 /*
138  * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
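/*
 * In scalable mode a root entry is split in two halves: the low 64 bits
 * (LCTP) point to the context table for devfn 0-127 and the high 64 bits
 * (UCTP) to the table for devfn 128-255. Bit 0 of each half is the present
 * bit, which is what the two helpers above test.
 */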
170 
171 static inline void context_set_present(struct context_entry *context)
172 {
173 	context->lo |= 1;
174 }
175 
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178 	context->lo &= (((u64)-1) << 2) | 1;
179 }
180 
181 static inline void context_set_translation_type(struct context_entry *context,
182 						unsigned long value)
183 {
184 	context->lo &= (((u64)-1) << 4) | 3;
185 	context->lo |= (value & 3) << 2;
186 }
187 
188 static inline void context_set_address_root(struct context_entry *context,
189 					    unsigned long value)
190 {
191 	context->lo &= ~VTD_PAGE_MASK;
192 	context->lo |= value & VTD_PAGE_MASK;
193 }
194 
195 static inline void context_set_address_width(struct context_entry *context,
196 					     unsigned long value)
197 {
198 	context->hi |= value & 7;
199 }
200 
201 static inline void context_set_domain_id(struct context_entry *context,
202 					 unsigned long value)
203 {
204 	context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206 
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209 	context->lo |= CONTEXT_PASIDE;
210 }
211 
212 static inline int context_domain_id(struct context_entry *c)
213 {
214 	return((c->hi >> 8) & 0xffff);
215 }
216 
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219 	context->lo = 0;
220 	context->hi = 0;
221 }
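/*
 * Summary of the legacy context-entry fields touched by the helpers above:
 * lo bit 0 is the present bit, lo bit 1 is the fault-processing-disable bit
 * (cleared by context_set_fault_enable()), lo bits 3:2 select the
 * translation type and lo bits 63:12 hold the page-table root address;
 * hi bits 2:0 encode the address width (AGAW) and hi bits 23:8 carry the
 * domain id.
 */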
222 
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 	if (!iommu->copied_tables)
226 		return false;
227 
228 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230 
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236 
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
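/*
 * The copied_tables bitmap is indexed by the 16-bit PCI source-id
 * (bus << 8 | devfn); for example bus 0x3a, devfn 0x10 maps to bit 0x3a10.
 * It tracks context entries inherited from a previous kernel (kdump) that
 * have not yet been replaced by this kernel's own tables.
 */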
242 
243 /*
244  * This domain is a static identity mapping domain.
245  *	1. This domain creates a static 1:1 mapping of all usable memory.
246  *	2. It maps to each iommu if successful.
247  *	3. Each iommu maps to this domain if successful.
248  */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251 
252 struct dmar_rmrr_unit {
253 	struct list_head list;		/* list of rmrr units	*/
254 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
255 	u64	base_address;		/* reserved base address*/
256 	u64	end_address;		/* reserved end address */
257 	struct dmar_dev_scope *devices;	/* target devices */
258 	int	devices_cnt;		/* target device count */
259 };
260 
261 struct dmar_atsr_unit {
262 	struct list_head list;		/* list of ATSR units */
263 	struct acpi_dmar_header *hdr;	/* ACPI header */
264 	struct dmar_dev_scope *devices;	/* target devices */
265 	int devices_cnt;		/* target device count */
266 	u8 include_all:1;		/* include all ports */
267 };
268 
269 struct dmar_satc_unit {
270 	struct list_head list;		/* list of SATC units */
271 	struct acpi_dmar_header *hdr;	/* ACPI header */
272 	struct dmar_dev_scope *devices;	/* target devices */
273 	struct intel_iommu *iommu;	/* the corresponding iommu */
274 	int devices_cnt;		/* target device count */
275 	u8 atc_required:1;		/* ATS is required */
276 };
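/*
 * The three structures above mirror optional DMAR ACPI sub-tables:
 * RMRR (Reserved Memory Region Reporting) describes memory that devices may
 * access before the OS takes over, ATSR (ATS Reporting) lists root ports
 * below which ATS may be enabled, and SATC (SoC Integrated Address
 * Translation Cache) lists SoC-integrated devices with address translation
 * caches.
 */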
277 
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281 
282 #define for_each_rmrr_units(rmrr) \
283 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284 
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287 
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290 
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293 
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298 
299 #define IDENTMAP_GFX		2
300 #define IDENTMAP_AZALIA		4
301 
302 const struct iommu_ops intel_iommu_ops;
303 
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308 
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313 
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316 	u32 gsts;
317 
318 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 	if (gsts & DMA_GSTS_TES)
320 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322 
323 static int __init intel_iommu_setup(char *str)
324 {
325 	if (!str)
326 		return -EINVAL;
327 
328 	while (*str) {
329 		if (!strncmp(str, "on", 2)) {
330 			dmar_disabled = 0;
331 			pr_info("IOMMU enabled\n");
332 		} else if (!strncmp(str, "off", 3)) {
333 			dmar_disabled = 1;
334 			no_platform_optin = 1;
335 			pr_info("IOMMU disabled\n");
336 		} else if (!strncmp(str, "igfx_off", 8)) {
337 			dmar_map_gfx = 0;
338 			pr_info("Disable GFX device mapping\n");
339 		} else if (!strncmp(str, "forcedac", 8)) {
340 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 			iommu_dma_forcedac = true;
342 		} else if (!strncmp(str, "strict", 6)) {
343 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 			iommu_set_dma_strict();
345 		} else if (!strncmp(str, "sp_off", 6)) {
346 			pr_info("Disable supported super page\n");
347 			intel_iommu_superpage = 0;
348 		} else if (!strncmp(str, "sm_on", 5)) {
349 			pr_info("Enable scalable mode if hardware supports\n");
350 			intel_iommu_sm = 1;
351 		} else if (!strncmp(str, "sm_off", 6)) {
352 			pr_info("Scalable mode is disallowed\n");
353 			intel_iommu_sm = 0;
354 		} else if (!strncmp(str, "tboot_noforce", 13)) {
355 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 			intel_iommu_tboot_noforce = 1;
357 		} else {
358 			pr_notice("Unknown option - '%s'\n", str);
359 		}
360 
361 		str += strcspn(str, ",");
362 		while (*str == ',')
363 			str++;
364 	}
365 
366 	return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
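/*
 * The parser above accepts a comma-separated option list on the kernel
 * command line, e.g. (illustrative only):
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * which enables the IOMMU, requests scalable mode and disables IOMMU
 * mapping for the integrated graphics device.
 */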
369 
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 	struct page *page;
373 	void *vaddr = NULL;
374 
375 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 	if (page)
377 		vaddr = page_address(page);
378 	return vaddr;
379 }
380 
381 void free_pgtable_page(void *vaddr)
382 {
383 	free_page((unsigned long)vaddr);
384 }
385 
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390 
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 				       unsigned long pfn)
393 {
394 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395 
396 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398 
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406 	unsigned long fl_sagaw, sl_sagaw;
407 
408 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 	sl_sagaw = cap_sagaw(iommu->cap);
410 
411 	/* Second level only. */
412 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 		return sl_sagaw;
414 
415 	/* First level only. */
416 	if (!ecap_slts(iommu->ecap))
417 		return fl_sagaw;
418 
419 	return fl_sagaw & sl_sagaw;
420 }
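/*
 * SAGAW bit encoding used above (per VT-d spec 11.4.2): bit 1 = 3-level
 * (39-bit), bit 2 = 4-level (48-bit), bit 3 = 5-level (57-bit). First-level
 * translation always supports 4-level tables and additionally 5-level when
 * FL5LP is set, hence fl_sagaw above is BIT(2), plus BIT(3) when FL5LP is
 * set.
 */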
421 
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423 {
424 	unsigned long sagaw;
425 	int agaw;
426 
427 	sagaw = __iommu_calculate_sagaw(iommu);
428 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429 		if (test_bit(agaw, &sagaw))
430 			break;
431 	}
432 
433 	return agaw;
434 }
435 
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443 
444 /*
445  * Calculate the AGAW for each IOMMU.
446  * "SAGAW" may differ across IOMMUs, so use a default AGAW and fall back
447  * to a smaller supported AGAW for IOMMUs that don't support the default.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453 
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 	return sm_supported(iommu) ?
457 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459 
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462 	struct iommu_domain_info *info;
463 	struct dmar_drhd_unit *drhd;
464 	struct intel_iommu *iommu;
465 	bool found = false;
466 	unsigned long i;
467 
468 	domain->iommu_coherency = true;
469 	xa_for_each(&domain->iommu_array, i, info) {
470 		found = true;
471 		if (!iommu_paging_structure_coherency(info->iommu)) {
472 			domain->iommu_coherency = false;
473 			break;
474 		}
475 	}
476 	if (found)
477 		return;
478 
479 	/* No hardware attached; use lowest common denominator */
480 	rcu_read_lock();
481 	for_each_active_iommu(iommu, drhd) {
482 		if (!iommu_paging_structure_coherency(iommu)) {
483 			domain->iommu_coherency = false;
484 			break;
485 		}
486 	}
487 	rcu_read_unlock();
488 }
489 
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 					 struct intel_iommu *skip)
492 {
493 	struct dmar_drhd_unit *drhd;
494 	struct intel_iommu *iommu;
495 	int mask = 0x3;
496 
497 	if (!intel_iommu_superpage)
498 		return 0;
499 
500 	/* set iommu_superpage to the smallest common denominator */
501 	rcu_read_lock();
502 	for_each_active_iommu(iommu, drhd) {
503 		if (iommu != skip) {
504 			if (domain && domain->use_first_level) {
505 				if (!cap_fl1gp_support(iommu->cap))
506 					mask = 0x1;
507 			} else {
508 				mask &= cap_super_page_val(iommu->cap);
509 			}
510 
511 			if (!mask)
512 				break;
513 		}
514 	}
515 	rcu_read_unlock();
516 
517 	return fls(mask);
518 }
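/*
 * The return value above is the number of supported superpage levels:
 * 0 means 4KiB pages only, 1 adds 2MiB and 2 adds 1GiB superpages. The
 * capability bits come from the SPS field (bit 0 = 2MiB, bit 1 = 1GiB);
 * for first-level tables 2MiB is always available and 1GiB depends on
 * FL1GP, which is why the mask is forced to 0x1 when FL1GP is absent.
 */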
519 
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522 	struct device_domain_info *info;
523 	int nid = NUMA_NO_NODE;
524 	unsigned long flags;
525 
526 	spin_lock_irqsave(&domain->lock, flags);
527 	list_for_each_entry(info, &domain->devices, link) {
528 		/*
529 		 * There could be multiple device NUMA nodes, as devices within
530 		 * the same domain may sit behind different IOMMUs. There is no
531 		 * perfect answer in such a situation, so we use a first-come,
532 		 * first-served policy.
533 		 */
534 		nid = dev_to_node(info->dev);
535 		if (nid != NUMA_NO_NODE)
536 			break;
537 	}
538 	spin_unlock_irqrestore(&domain->lock, flags);
539 
540 	return nid;
541 }
542 
543 static void domain_update_iotlb(struct dmar_domain *domain);
544 
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 	unsigned long bitmap = 0;
549 
550 	/*
551 	 * 1-level super page supports page size of 2MiB, 2-level super page
552 	 * supports page size of both 2MiB and 1GiB.
553 	 */
554 	if (domain->iommu_superpage == 1)
555 		bitmap |= SZ_2M;
556 	else if (domain->iommu_superpage == 2)
557 		bitmap |= SZ_2M | SZ_1G;
558 
559 	return bitmap;
560 }
561 
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565 	domain_update_iommu_coherency(domain);
566 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567 
568 	/*
569 	 * If RHSA is missing, we should default to the device NUMA domain
570 	 * as a fallback.
571 	 */
572 	if (domain->nid == NUMA_NO_NODE)
573 		domain->nid = domain_update_device_node(domain);
574 
575 	/*
576 	 * First-level translation restricts the input-address to a
577 	 * canonical address (i.e., address bits 63:N have the same
578 	 * value as address bit [N-1], where N is 48-bits with 4-level
579 	 * paging and 57-bits with 5-level paging). Hence, skip bit
580 	 * [N-1].
581 	 */
582 	if (domain->use_first_level)
583 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 	else
585 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586 
587 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 	domain_update_iotlb(domain);
589 }
590 
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 					 u8 devfn, int alloc)
593 {
594 	struct root_entry *root = &iommu->root_entry[bus];
595 	struct context_entry *context;
596 	u64 *entry;
597 
598 	/*
599 	 * Unless the caller requested allocation of a new entry,
600 	 * returning a copied context entry makes no sense.
601 	 */
602 	if (!alloc && context_copied(iommu, bus, devfn))
603 		return NULL;
604 
605 	entry = &root->lo;
606 	if (sm_supported(iommu)) {
607 		if (devfn >= 0x80) {
608 			devfn -= 0x80;
609 			entry = &root->hi;
610 		}
611 		devfn *= 2;
612 	}
613 	if (*entry & 1)
614 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 	else {
616 		unsigned long phy_addr;
617 		if (!alloc)
618 			return NULL;
619 
620 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621 		if (!context)
622 			return NULL;
623 
624 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 		phy_addr = virt_to_phys((void *)context);
626 		*entry = phy_addr | 1;
627 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
628 	}
629 	return &context[devfn];
630 }
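/*
 * Indexing note for iommu_context_addr(): in scalable mode each half of the
 * root entry covers 128 devfns and a context entry is 256 bits wide, i.e.
 * two struct context_entry slots, which is why devfn is reduced into the
 * 0-127 range and then doubled before indexing the table.
 */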
631 
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *				 sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 	struct pci_dev *pdev, *pbridge;
644 
645 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 		return false;
647 
648 	pdev = to_pci_dev(dev);
649 	pbridge = to_pci_dev(bridge);
650 
651 	if (pbridge->subordinate &&
652 	    pbridge->subordinate->number <= pdev->bus->number &&
653 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 		return true;
655 
656 	return false;
657 }
658 
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661 	struct dmar_drhd_unit *drhd;
662 	u32 vtbar;
663 	int rc;
664 
665 	/* We know that this device on this chipset has its own IOMMU.
666 	 * If we find it under a different IOMMU, then the BIOS is lying
667 	 * to us. Hope that the IOMMU for this device is actually
668 	 * disabled, and it needs no translation...
669 	 */
670 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 	if (rc) {
672 		/* "can't" happen */
673 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 		return false;
675 	}
676 	vtbar &= 0xffff0000;
677 
678 	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
679 	drhd = dmar_find_matched_drhd_unit(pdev);
680 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 		return true;
684 	}
685 
686 	return false;
687 }
688 
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 	if (!iommu || iommu->drhd->ignored)
692 		return true;
693 
694 	if (dev_is_pci(dev)) {
695 		struct pci_dev *pdev = to_pci_dev(dev);
696 
697 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 		    quirk_ioat_snb_local_iommu(pdev))
700 			return true;
701 	}
702 
703 	return false;
704 }
705 
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708 	struct dmar_drhd_unit *drhd = NULL;
709 	struct pci_dev *pdev = NULL;
710 	struct intel_iommu *iommu;
711 	struct device *tmp;
712 	u16 segment = 0;
713 	int i;
714 
715 	if (!dev)
716 		return NULL;
717 
718 	if (dev_is_pci(dev)) {
719 		struct pci_dev *pf_pdev;
720 
721 		pdev = pci_real_dma_dev(to_pci_dev(dev));
722 
723 		/* VFs aren't listed in scope tables; we need to look up
724 		 * the PF instead to find the IOMMU. */
725 		pf_pdev = pci_physfn(pdev);
726 		dev = &pf_pdev->dev;
727 		segment = pci_domain_nr(pdev->bus);
728 	} else if (has_acpi_companion(dev))
729 		dev = &ACPI_COMPANION(dev)->dev;
730 
731 	rcu_read_lock();
732 	for_each_iommu(iommu, drhd) {
733 		if (pdev && segment != drhd->segment)
734 			continue;
735 
736 		for_each_active_dev_scope(drhd->devices,
737 					  drhd->devices_cnt, i, tmp) {
738 			if (tmp == dev) {
739 				/* For a VF use its original BDF# not that of the PF
740 				 * which we used for the IOMMU lookup. Strictly speaking
741 				 * we could do this for all PCI devices; we only need to
742 				 * get the BDF# from the scope table for ACPI matches. */
743 				if (pdev && pdev->is_virtfn)
744 					goto got_pdev;
745 
746 				if (bus && devfn) {
747 					*bus = drhd->devices[i].bus;
748 					*devfn = drhd->devices[i].devfn;
749 				}
750 				goto out;
751 			}
752 
753 			if (is_downstream_to_pci_bridge(dev, tmp))
754 				goto got_pdev;
755 		}
756 
757 		if (pdev && drhd->include_all) {
758 got_pdev:
759 			if (bus && devfn) {
760 				*bus = pdev->bus->number;
761 				*devfn = pdev->devfn;
762 			}
763 			goto out;
764 		}
765 	}
766 	iommu = NULL;
767 out:
768 	if (iommu_is_dummy(iommu, dev))
769 		iommu = NULL;
770 
771 	rcu_read_unlock();
772 
773 	return iommu;
774 }
775 
776 static void domain_flush_cache(struct dmar_domain *domain,
777 			       void *addr, int size)
778 {
779 	if (!domain->iommu_coherency)
780 		clflush_cache_range(addr, size);
781 }
782 
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785 	struct context_entry *context;
786 	int i;
787 
788 	if (!iommu->root_entry)
789 		return;
790 
791 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 		context = iommu_context_addr(iommu, i, 0, 0);
793 		if (context)
794 			free_pgtable_page(context);
795 
796 		if (!sm_supported(iommu))
797 			continue;
798 
799 		context = iommu_context_addr(iommu, i, 0x80, 0);
800 		if (context)
801 			free_pgtable_page(context);
802 	}
803 
804 	free_pgtable_page(iommu->root_entry);
805 	iommu->root_entry = NULL;
806 }
807 
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812 	struct dma_pte *pte;
813 	int offset;
814 
815 	while (1) {
816 		offset = pfn_level_offset(pfn, level);
817 		pte = &parent[offset];
818 
819 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
820 
821 		if (!dma_pte_present(pte)) {
822 			pr_info("page table not present at level %d\n", level - 1);
823 			break;
824 		}
825 
826 		if (level == 1 || dma_pte_superpage(pte))
827 			break;
828 
829 		parent = phys_to_virt(dma_pte_addr(pte));
830 		level--;
831 	}
832 }
833 
834 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
835 			  unsigned long long addr, u32 pasid)
836 {
837 	struct pasid_dir_entry *dir, *pde;
838 	struct pasid_entry *entries, *pte;
839 	struct context_entry *ctx_entry;
840 	struct root_entry *rt_entry;
841 	int i, dir_index, index, level;
842 	u8 devfn = source_id & 0xff;
843 	u8 bus = source_id >> 8;
844 	struct dma_pte *pgtable;
845 
846 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
847 
848 	/* root entry dump */
849 	if (!iommu->root_entry) {
850 		pr_info("root table is not present\n");
851 		return;
852 	}
853 	rt_entry = &iommu->root_entry[bus];
854 
855 	if (sm_supported(iommu))
856 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
857 			rt_entry->hi, rt_entry->lo);
858 	else
859 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
860 
861 	/* context entry dump */
862 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
863 	if (!ctx_entry) {
864 		pr_info("context table is not present\n");
865 		return;
866 	}
867 
868 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
869 		ctx_entry->hi, ctx_entry->lo);
870 
871 	/* legacy mode does not require PASID entries */
872 	if (!sm_supported(iommu)) {
873 		if (!context_present(ctx_entry)) {
874 			pr_info("legacy mode page table is not present\n");
875 			return;
876 		}
877 		level = agaw_to_level(ctx_entry->hi & 7);
878 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 		goto pgtable_walk;
880 	}
881 
882 	if (!context_present(ctx_entry)) {
883 		pr_info("pasid directory table is not present\n");
884 		return;
885 	}
886 
887 	/* get the pointer to pasid directory entry */
888 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
889 
890 	/* For request-without-pasid, get the pasid from context entry */
891 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
892 		pasid = IOMMU_NO_PASID;
893 
894 	dir_index = pasid >> PASID_PDE_SHIFT;
895 	pde = &dir[dir_index];
896 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
897 
898 	/* get the pointer to the pasid table entry */
899 	entries = get_pasid_table_from_pde(pde);
900 	if (!entries) {
901 		pr_info("pasid table is not present\n");
902 		return;
903 	}
904 	index = pasid & PASID_PTE_MASK;
905 	pte = &entries[index];
906 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
907 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
908 
909 	if (!pasid_pte_is_present(pte)) {
910 		pr_info("scalable mode page table is not present\n");
911 		return;
912 	}
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928 				      unsigned long pfn, int *target_level,
929 				      gfp_t gfp)
930 {
931 	struct dma_pte *parent, *pte;
932 	int level = agaw_to_level(domain->agaw);
933 	int offset;
934 
935 	if (!domain_pfn_supported(domain, pfn))
936 		/* Address beyond IOMMU's addressing capabilities. */
937 		return NULL;
938 
939 	parent = domain->pgd;
940 
941 	while (1) {
942 		void *tmp_page;
943 
944 		offset = pfn_level_offset(pfn, level);
945 		pte = &parent[offset];
946 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
947 			break;
948 		if (level == *target_level)
949 			break;
950 
951 		if (!dma_pte_present(pte)) {
952 			uint64_t pteval;
953 
954 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
955 
956 			if (!tmp_page)
957 				return NULL;
958 
959 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
960 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
961 			if (domain->use_first_level)
962 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
963 
964 			if (cmpxchg64(&pte->val, 0ULL, pteval))
965 				/* Someone else set it while we were thinking; use theirs. */
966 				free_pgtable_page(tmp_page);
967 			else
968 				domain_flush_cache(domain, pte, sizeof(*pte));
969 		}
970 		if (level == 1)
971 			break;
972 
973 		parent = phys_to_virt(dma_pte_addr(pte));
974 		level--;
975 	}
976 
977 	if (!*target_level)
978 		*target_level = level;
979 
980 	return pte;
981 }
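/*
 * Note on *target_level in pfn_to_dma_pte(): a value of 0 walks without
 * allocating and returns the PTE at whatever level the existing mapping
 * ends (a superpage, a non-present entry, or level 1), while a non-zero
 * value stops the walk at that level, allocating intermediate tables as
 * needed; on return the argument holds the level of the PTE returned.
 */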
982 
983 /* return address's pte at specific level */
984 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
985 					 unsigned long pfn,
986 					 int level, int *large_page)
987 {
988 	struct dma_pte *parent, *pte;
989 	int total = agaw_to_level(domain->agaw);
990 	int offset;
991 
992 	parent = domain->pgd;
993 	while (level <= total) {
994 		offset = pfn_level_offset(pfn, total);
995 		pte = &parent[offset];
996 		if (level == total)
997 			return pte;
998 
999 		if (!dma_pte_present(pte)) {
1000 			*large_page = total;
1001 			break;
1002 		}
1003 
1004 		if (dma_pte_superpage(pte)) {
1005 			*large_page = total;
1006 			return pte;
1007 		}
1008 
1009 		parent = phys_to_virt(dma_pte_addr(pte));
1010 		total--;
1011 	}
1012 	return NULL;
1013 }
1014 
1015 /* clear last level pte, a tlb flush should be followed */
1016 static void dma_pte_clear_range(struct dmar_domain *domain,
1017 				unsigned long start_pfn,
1018 				unsigned long last_pfn)
1019 {
1020 	unsigned int large_page;
1021 	struct dma_pte *first_pte, *pte;
1022 
1023 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1024 	    WARN_ON(start_pfn > last_pfn))
1025 		return;
1026 
1027 	/* we don't need lock here; nobody else touches the iova range */
1028 	do {
1029 		large_page = 1;
1030 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1031 		if (!pte) {
1032 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1033 			continue;
1034 		}
1035 		do {
1036 			dma_clear_pte(pte);
1037 			start_pfn += lvl_to_nr_pages(large_page);
1038 			pte++;
1039 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1040 
1041 		domain_flush_cache(domain, first_pte,
1042 				   (void *)pte - (void *)first_pte);
1043 
1044 	} while (start_pfn && start_pfn <= last_pfn);
1045 }
1046 
1047 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1048 			       int retain_level, struct dma_pte *pte,
1049 			       unsigned long pfn, unsigned long start_pfn,
1050 			       unsigned long last_pfn)
1051 {
1052 	pfn = max(start_pfn, pfn);
1053 	pte = &pte[pfn_level_offset(pfn, level)];
1054 
1055 	do {
1056 		unsigned long level_pfn;
1057 		struct dma_pte *level_pte;
1058 
1059 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1060 			goto next;
1061 
1062 		level_pfn = pfn & level_mask(level);
1063 		level_pte = phys_to_virt(dma_pte_addr(pte));
1064 
1065 		if (level > 2) {
1066 			dma_pte_free_level(domain, level - 1, retain_level,
1067 					   level_pte, level_pfn, start_pfn,
1068 					   last_pfn);
1069 		}
1070 
1071 		/*
1072 		 * Free the page table if we're below the level we want to
1073 		 * retain and the range covers the entire table.
1074 		 */
1075 		if (level < retain_level && !(start_pfn > level_pfn ||
1076 		      last_pfn < level_pfn + level_size(level) - 1)) {
1077 			dma_clear_pte(pte);
1078 			domain_flush_cache(domain, pte, sizeof(*pte));
1079 			free_pgtable_page(level_pte);
1080 		}
1081 next:
1082 		pfn += level_size(level);
1083 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1084 }
1085 
1086 /*
1087  * clear last level (leaf) ptes and free page table pages below the
1088  * level we wish to keep intact.
1089  */
1090 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1091 				   unsigned long start_pfn,
1092 				   unsigned long last_pfn,
1093 				   int retain_level)
1094 {
1095 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1096 
1097 	/* We don't need lock here; nobody else touches the iova range */
1098 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1099 			   domain->pgd, 0, start_pfn, last_pfn);
1100 
1101 	/* free pgd */
1102 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1103 		free_pgtable_page(domain->pgd);
1104 		domain->pgd = NULL;
1105 	}
1106 }
1107 
1108 /* When a page at a given level is being unlinked from its parent, we don't
1109    need to *modify* it at all. All we need to do is make a list of all the
1110    pages which can be freed just as soon as we've flushed the IOTLB and we
1111    know the hardware page-walk will no longer touch them.
1112    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1113    be freed. */
1114 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1115 				    int level, struct dma_pte *pte,
1116 				    struct list_head *freelist)
1117 {
1118 	struct page *pg;
1119 
1120 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1121 	list_add_tail(&pg->lru, freelist);
1122 
1123 	if (level == 1)
1124 		return;
1125 
1126 	pte = page_address(pg);
1127 	do {
1128 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1129 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1130 		pte++;
1131 	} while (!first_pte_in_page(pte));
1132 }
1133 
1134 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1135 				struct dma_pte *pte, unsigned long pfn,
1136 				unsigned long start_pfn, unsigned long last_pfn,
1137 				struct list_head *freelist)
1138 {
1139 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1140 
1141 	pfn = max(start_pfn, pfn);
1142 	pte = &pte[pfn_level_offset(pfn, level)];
1143 
1144 	do {
1145 		unsigned long level_pfn = pfn & level_mask(level);
1146 
1147 		if (!dma_pte_present(pte))
1148 			goto next;
1149 
1150 		/* If range covers entire pagetable, free it */
1151 		if (start_pfn <= level_pfn &&
1152 		    last_pfn >= level_pfn + level_size(level) - 1) {
1153 			/* These subordinate page tables are going away entirely. Don't
1154 			   bother to clear them; we're just going to *free* them. */
1155 			if (level > 1 && !dma_pte_superpage(pte))
1156 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1157 
1158 			dma_clear_pte(pte);
1159 			if (!first_pte)
1160 				first_pte = pte;
1161 			last_pte = pte;
1162 		} else if (level > 1) {
1163 			/* Recurse down into a level that isn't *entirely* obsolete */
1164 			dma_pte_clear_level(domain, level - 1,
1165 					    phys_to_virt(dma_pte_addr(pte)),
1166 					    level_pfn, start_pfn, last_pfn,
1167 					    freelist);
1168 		}
1169 next:
1170 		pfn = level_pfn + level_size(level);
1171 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1172 
1173 	if (first_pte)
1174 		domain_flush_cache(domain, first_pte,
1175 				   (void *)++last_pte - (void *)first_pte);
1176 }
1177 
1178 /* We can't just free the pages because the IOMMU may still be walking
1179    the page tables, and may have cached the intermediate levels. The
1180    pages can only be freed after the IOTLB flush has been done. */
1181 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1182 			 unsigned long last_pfn, struct list_head *freelist)
1183 {
1184 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1185 	    WARN_ON(start_pfn > last_pfn))
1186 		return;
1187 
1188 	/* we don't need lock here; nobody else touches the iova range */
1189 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1190 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1191 
1192 	/* free pgd */
1193 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1194 		struct page *pgd_page = virt_to_page(domain->pgd);
1195 		list_add_tail(&pgd_page->lru, freelist);
1196 		domain->pgd = NULL;
1197 	}
1198 }
1199 
1200 /* iommu handling */
1201 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1202 {
1203 	struct root_entry *root;
1204 
1205 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1206 	if (!root) {
1207 		pr_err("Allocating root entry for %s failed\n",
1208 			iommu->name);
1209 		return -ENOMEM;
1210 	}
1211 
1212 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1213 	iommu->root_entry = root;
1214 
1215 	return 0;
1216 }
1217 
1218 static void iommu_set_root_entry(struct intel_iommu *iommu)
1219 {
1220 	u64 addr;
1221 	u32 sts;
1222 	unsigned long flag;
1223 
1224 	addr = virt_to_phys(iommu->root_entry);
1225 	if (sm_supported(iommu))
1226 		addr |= DMA_RTADDR_SMT;
1227 
1228 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1230 
1231 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1232 
1233 	/* Make sure the hardware completes it */
1234 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235 		      readl, (sts & DMA_GSTS_RTPS), sts);
1236 
1237 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 
1239 	/*
1240 	 * Hardware invalidates all DMA remapping hardware translation
1241 	 * caches as part of SRTP flow.
1242 	 */
1243 	if (cap_esrtps(iommu->cap))
1244 		return;
1245 
1246 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1247 	if (sm_supported(iommu))
1248 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1249 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1250 }
1251 
1252 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1253 {
1254 	u32 val;
1255 	unsigned long flag;
1256 
1257 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1258 		return;
1259 
1260 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1262 
1263 	/* Make sure the hardware completes it */
1264 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1265 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1266 
1267 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1268 }
1269 
1270 /* Flush context-cache entries of the requested granularity (global, domain or device) */
1271 static void __iommu_flush_context(struct intel_iommu *iommu,
1272 				  u16 did, u16 source_id, u8 function_mask,
1273 				  u64 type)
1274 {
1275 	u64 val = 0;
1276 	unsigned long flag;
1277 
1278 	switch (type) {
1279 	case DMA_CCMD_GLOBAL_INVL:
1280 		val = DMA_CCMD_GLOBAL_INVL;
1281 		break;
1282 	case DMA_CCMD_DOMAIN_INVL:
1283 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1284 		break;
1285 	case DMA_CCMD_DEVICE_INVL:
1286 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1287 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1288 		break;
1289 	default:
1290 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1291 			iommu->name, type);
1292 		return;
1293 	}
1294 	val |= DMA_CCMD_ICC;
1295 
1296 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1297 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1298 
1299 	/* Make sure the hardware completes it */
1300 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1301 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1302 
1303 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1304 }
1305 
1306 /* Flush IOTLB entries of the requested granularity (global, domain-selective or page-selective) */
1307 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1308 				u64 addr, unsigned int size_order, u64 type)
1309 {
1310 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1311 	u64 val = 0, val_iva = 0;
1312 	unsigned long flag;
1313 
1314 	switch (type) {
1315 	case DMA_TLB_GLOBAL_FLUSH:
1316 		/* global flush doesn't need to set IVA_REG */
1317 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1318 		break;
1319 	case DMA_TLB_DSI_FLUSH:
1320 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1321 		break;
1322 	case DMA_TLB_PSI_FLUSH:
1323 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1324 		/* IH bit is passed in as part of address */
1325 		val_iva = size_order | addr;
1326 		break;
1327 	default:
1328 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1329 			iommu->name, type);
1330 		return;
1331 	}
1332 
1333 	if (cap_write_drain(iommu->cap))
1334 		val |= DMA_TLB_WRITE_DRAIN;
1335 
1336 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337 	/* Note: Only uses first TLB reg currently */
1338 	if (val_iva)
1339 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1340 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1341 
1342 	/* Make sure the hardware completes it */
1343 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1344 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1345 
1346 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 
1348 	/* check IOTLB invalidation granularity */
1349 	if (DMA_TLB_IAIG(val) == 0)
1350 		pr_err("Flush IOTLB failed\n");
1351 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1352 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1353 			(unsigned long long)DMA_TLB_IIRG(type),
1354 			(unsigned long long)DMA_TLB_IAIG(val));
1355 }
1356 
1357 static struct device_domain_info *
1358 domain_lookup_dev_info(struct dmar_domain *domain,
1359 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1360 {
1361 	struct device_domain_info *info;
1362 	unsigned long flags;
1363 
1364 	spin_lock_irqsave(&domain->lock, flags);
1365 	list_for_each_entry(info, &domain->devices, link) {
1366 		if (info->iommu == iommu && info->bus == bus &&
1367 		    info->devfn == devfn) {
1368 			spin_unlock_irqrestore(&domain->lock, flags);
1369 			return info;
1370 		}
1371 	}
1372 	spin_unlock_irqrestore(&domain->lock, flags);
1373 
1374 	return NULL;
1375 }
1376 
1377 static void domain_update_iotlb(struct dmar_domain *domain)
1378 {
1379 	struct dev_pasid_info *dev_pasid;
1380 	struct device_domain_info *info;
1381 	bool has_iotlb_device = false;
1382 	unsigned long flags;
1383 
1384 	spin_lock_irqsave(&domain->lock, flags);
1385 	list_for_each_entry(info, &domain->devices, link) {
1386 		if (info->ats_enabled) {
1387 			has_iotlb_device = true;
1388 			break;
1389 		}
1390 	}
1391 
1392 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1393 		info = dev_iommu_priv_get(dev_pasid->dev);
1394 		if (info->ats_enabled) {
1395 			has_iotlb_device = true;
1396 			break;
1397 		}
1398 	}
1399 	domain->has_iotlb_device = has_iotlb_device;
1400 	spin_unlock_irqrestore(&domain->lock, flags);
1401 }
1402 
1403 /*
1404  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1405  * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
1406  * check because it applies only to the built-in QAT devices and it doesn't
1407  * grant additional privileges.
1408  */
1409 #define BUGGY_QAT_DEVID_MASK 0x4940
1410 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1411 {
1412 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1413 		return false;
1414 
1415 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1416 		return false;
1417 
1418 	return true;
1419 }
1420 
1421 static void iommu_enable_pci_caps(struct device_domain_info *info)
1422 {
1423 	struct pci_dev *pdev;
1424 
1425 	if (!dev_is_pci(info->dev))
1426 		return;
1427 
1428 	pdev = to_pci_dev(info->dev);
1429 
1430 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1431 	   the device if you enable PASID support after ATS support is
1432 	   undefined. So always enable PASID support on devices which
1433 	   have it, even if we can't yet know if we're ever going to
1434 	   use it. */
1435 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1436 		info->pasid_enabled = 1;
1437 
1438 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1439 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1440 		info->ats_enabled = 1;
1441 		domain_update_iotlb(info->domain);
1442 	}
1443 }
1444 
1445 static void iommu_disable_pci_caps(struct device_domain_info *info)
1446 {
1447 	struct pci_dev *pdev;
1448 
1449 	if (!dev_is_pci(info->dev))
1450 		return;
1451 
1452 	pdev = to_pci_dev(info->dev);
1453 
1454 	if (info->ats_enabled) {
1455 		pci_disable_ats(pdev);
1456 		info->ats_enabled = 0;
1457 		domain_update_iotlb(info->domain);
1458 	}
1459 
1460 	if (info->pasid_enabled) {
1461 		pci_disable_pasid(pdev);
1462 		info->pasid_enabled = 0;
1463 	}
1464 }
1465 
1466 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1467 				    u64 addr, unsigned int mask)
1468 {
1469 	u16 sid, qdep;
1470 
1471 	if (!info || !info->ats_enabled)
1472 		return;
1473 
1474 	sid = info->bus << 8 | info->devfn;
1475 	qdep = info->ats_qdep;
1476 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1477 			   qdep, addr, mask);
1478 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1479 }
1480 
1481 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1482 				  u64 addr, unsigned mask)
1483 {
1484 	struct dev_pasid_info *dev_pasid;
1485 	struct device_domain_info *info;
1486 	unsigned long flags;
1487 
1488 	if (!domain->has_iotlb_device)
1489 		return;
1490 
1491 	spin_lock_irqsave(&domain->lock, flags);
1492 	list_for_each_entry(info, &domain->devices, link)
1493 		__iommu_flush_dev_iotlb(info, addr, mask);
1494 
1495 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1496 		info = dev_iommu_priv_get(dev_pasid->dev);
1497 
1498 		if (!info->ats_enabled)
1499 			continue;
1500 
1501 		qi_flush_dev_iotlb_pasid(info->iommu,
1502 					 PCI_DEVID(info->bus, info->devfn),
1503 					 info->pfsid, dev_pasid->pasid,
1504 					 info->ats_qdep, addr,
1505 					 mask);
1506 	}
1507 	spin_unlock_irqrestore(&domain->lock, flags);
1508 }
1509 
1510 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1511 				     struct dmar_domain *domain, u64 addr,
1512 				     unsigned long npages, bool ih)
1513 {
1514 	u16 did = domain_id_iommu(domain, iommu);
1515 	struct dev_pasid_info *dev_pasid;
1516 	unsigned long flags;
1517 
1518 	spin_lock_irqsave(&domain->lock, flags);
1519 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1520 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1521 
1522 	if (!list_empty(&domain->devices))
1523 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1524 	spin_unlock_irqrestore(&domain->lock, flags);
1525 }
1526 
1527 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1528 				  struct dmar_domain *domain,
1529 				  unsigned long pfn, unsigned int pages,
1530 				  int ih, int map)
1531 {
1532 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1533 	unsigned int mask = ilog2(aligned_pages);
1534 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1535 	u16 did = domain_id_iommu(domain, iommu);
1536 
1537 	if (WARN_ON(!pages))
1538 		return;
1539 
1540 	if (ih)
1541 		ih = 1 << 6;
1542 
1543 	if (domain->use_first_level) {
1544 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1545 	} else {
1546 		unsigned long bitmask = aligned_pages - 1;
1547 
1548 		/*
1549 		 * PSI masks the low order bits of the base address. If the
1550 		 * address isn't aligned to the mask, then compute a mask value
1551 		 * needed to ensure the target range is flushed.
1552 		 */
1553 		if (unlikely(bitmask & pfn)) {
1554 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1555 
1556 			/*
1557 			 * Since end_pfn <= pfn + bitmask, the only way bits
1558 			 * higher than bitmask can differ in pfn and end_pfn is
1559 			 * by carrying. This means after masking out bitmask,
1560 			 * high bits starting with the first set bit in
1561 			 * shared_bits are all equal in both pfn and end_pfn.
1562 			 */
1563 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1564 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1565 		}
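		/*
		 * Worked example (illustrative): pfn = 3, pages = 2 gives
		 * aligned_pages = 2 and bitmask = 1, which overlaps pfn. Then
		 * end_pfn = 4 and pfn ^ end_pfn = 0b111, so shared_bits =
		 * ~0b111 & ~0b1 has its lowest set bit at position 3 and the
		 * flush uses mask = 3, i.e. 8 pages aligned at pfn 0, which
		 * covers both pfn 3 and pfn 4.
		 */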
1566 
1567 		/*
1568 		 * Fall back to domain-selective flush if there is no PSI support or
1569 		 * the size is too big.
1570 		 */
1571 		if (!cap_pgsel_inv(iommu->cap) ||
1572 		    mask > cap_max_amask_val(iommu->cap))
1573 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1574 							DMA_TLB_DSI_FLUSH);
1575 		else
1576 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1577 							DMA_TLB_PSI_FLUSH);
1578 	}
1579 
1580 	/*
1581 	 * In caching mode, changing pages from non-present to present requires a
1582 	 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1583 	 */
1584 	if (!cap_caching_mode(iommu->cap) || !map)
1585 		iommu_flush_dev_iotlb(domain, addr, mask);
1586 }
1587 
1588 /* Notification for newly created mappings */
1589 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1590 					struct dmar_domain *domain,
1591 					unsigned long pfn, unsigned int pages)
1592 {
1593 	/*
1594 	 * It's a non-present to present mapping. Only flush if caching mode
1595 	 * is enabled and second-level translation is in use.
1596 	 */
1597 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1598 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1599 	else
1600 		iommu_flush_write_buffer(iommu);
1601 }
1602 
1603 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1604 {
1605 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1606 	struct iommu_domain_info *info;
1607 	unsigned long idx;
1608 
1609 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1610 		struct intel_iommu *iommu = info->iommu;
1611 		u16 did = domain_id_iommu(dmar_domain, iommu);
1612 
1613 		if (dmar_domain->use_first_level)
1614 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1615 		else
1616 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1617 						 DMA_TLB_DSI_FLUSH);
1618 
1619 		if (!cap_caching_mode(iommu->cap))
1620 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1621 	}
1622 }
1623 
1624 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1625 {
1626 	u32 pmen;
1627 	unsigned long flags;
1628 
1629 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1630 		return;
1631 
1632 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1633 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1634 	pmen &= ~DMA_PMEN_EPM;
1635 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1636 
1637 	/* wait for the protected region status bit to clear */
1638 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1639 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1640 
1641 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643 
1644 static void iommu_enable_translation(struct intel_iommu *iommu)
1645 {
1646 	u32 sts;
1647 	unsigned long flags;
1648 
1649 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1650 	iommu->gcmd |= DMA_GCMD_TE;
1651 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652 
1653 	/* Make sure the hardware completes it */
1654 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 		      readl, (sts & DMA_GSTS_TES), sts);
1656 
1657 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1658 }
1659 
1660 static void iommu_disable_translation(struct intel_iommu *iommu)
1661 {
1662 	u32 sts;
1663 	unsigned long flag;
1664 
1665 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1666 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1667 		return;
1668 
1669 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1670 	iommu->gcmd &= ~DMA_GCMD_TE;
1671 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1672 
1673 	/* Make sure the hardware completes it */
1674 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1675 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1676 
1677 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1678 }
1679 
1680 static int iommu_init_domains(struct intel_iommu *iommu)
1681 {
1682 	u32 ndomains;
1683 
1684 	ndomains = cap_ndoms(iommu->cap);
1685 	pr_debug("%s: Number of Domains supported <%d>\n",
1686 		 iommu->name, ndomains);
1687 
1688 	spin_lock_init(&iommu->lock);
1689 
1690 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1691 	if (!iommu->domain_ids)
1692 		return -ENOMEM;
1693 
1694 	/*
1695 	 * If Caching mode is set, then invalid translations are tagged
1696 	 * with domain-id 0, hence we need to pre-allocate it. We also
1697 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1698 	 * make sure it is not used for a real domain.
1699 	 */
1700 	set_bit(0, iommu->domain_ids);
1701 
1702 	/*
1703 	 * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1704 	 * entry for first-level or pass-through translation modes should
1705 	 * be programmed with a domain id different from those used for
1706 	 * second-level or nested translation. We reserve a domain id for
1707 	 * this purpose. This domain id is also used for identity domain
1708 	 * in legacy mode.
1709 	 */
1710 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1711 
1712 	return 0;
1713 }
1714 
1715 static void disable_dmar_iommu(struct intel_iommu *iommu)
1716 {
1717 	if (!iommu->domain_ids)
1718 		return;
1719 
1720 	/*
1721 	 * All iommu domains must have been detached from the devices,
1722 	 * hence there should be no domain IDs in use.
1723 	 */
1724 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1725 		    > NUM_RESERVED_DID))
1726 		return;
1727 
1728 	if (iommu->gcmd & DMA_GCMD_TE)
1729 		iommu_disable_translation(iommu);
1730 }
1731 
1732 static void free_dmar_iommu(struct intel_iommu *iommu)
1733 {
1734 	if (iommu->domain_ids) {
1735 		bitmap_free(iommu->domain_ids);
1736 		iommu->domain_ids = NULL;
1737 	}
1738 
1739 	if (iommu->copied_tables) {
1740 		bitmap_free(iommu->copied_tables);
1741 		iommu->copied_tables = NULL;
1742 	}
1743 
1744 	/* free context mapping */
1745 	free_context_table(iommu);
1746 
1747 #ifdef CONFIG_INTEL_IOMMU_SVM
1748 	if (pasid_supported(iommu)) {
1749 		if (ecap_prs(iommu->ecap))
1750 			intel_svm_finish_prq(iommu);
1751 	}
1752 #endif
1753 }
1754 
1755 /*
1756  * Check and return whether first level is used by default for
1757  * DMA translation.
1758  */
1759 static bool first_level_by_default(unsigned int type)
1760 {
1761 	/* Only SL is available in legacy mode */
1762 	if (!scalable_mode_support())
1763 		return false;
1764 
1765 	/* Only one level (either FL or SL) is available, just use it */
1766 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1767 		return intel_cap_flts_sanity();
1768 
1769 	/* Both levels are available, decide it based on domain type */
1770 	return type != IOMMU_DOMAIN_UNMANAGED;
1771 }
1772 
1773 static struct dmar_domain *alloc_domain(unsigned int type)
1774 {
1775 	struct dmar_domain *domain;
1776 
1777 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1778 	if (!domain)
1779 		return NULL;
1780 
1781 	domain->nid = NUMA_NO_NODE;
1782 	if (first_level_by_default(type))
1783 		domain->use_first_level = true;
1784 	domain->has_iotlb_device = false;
1785 	INIT_LIST_HEAD(&domain->devices);
1786 	INIT_LIST_HEAD(&domain->dev_pasids);
1787 	spin_lock_init(&domain->lock);
1788 	xa_init(&domain->iommu_array);
1789 
1790 	return domain;
1791 }
1792 
1793 static int domain_attach_iommu(struct dmar_domain *domain,
1794 			       struct intel_iommu *iommu)
1795 {
1796 	struct iommu_domain_info *info, *curr;
1797 	unsigned long ndomains;
1798 	int num, ret = -ENOSPC;
1799 
1800 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1801 	if (!info)
1802 		return -ENOMEM;
1803 
1804 	spin_lock(&iommu->lock);
1805 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1806 	if (curr) {
1807 		curr->refcnt++;
1808 		spin_unlock(&iommu->lock);
1809 		kfree(info);
1810 		return 0;
1811 	}
1812 
1813 	ndomains = cap_ndoms(iommu->cap);
1814 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1815 	if (num >= ndomains) {
1816 		pr_err("%s: No free domain ids\n", iommu->name);
1817 		goto err_unlock;
1818 	}
1819 
1820 	set_bit(num, iommu->domain_ids);
1821 	info->refcnt	= 1;
1822 	info->did	= num;
1823 	info->iommu	= iommu;
1824 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1825 			  NULL, info, GFP_ATOMIC);
1826 	if (curr) {
1827 		ret = xa_err(curr) ? : -EBUSY;
1828 		goto err_clear;
1829 	}
1830 	domain_update_iommu_cap(domain);
1831 
1832 	spin_unlock(&iommu->lock);
1833 	return 0;
1834 
1835 err_clear:
1836 	clear_bit(info->did, iommu->domain_ids);
1837 err_unlock:
1838 	spin_unlock(&iommu->lock);
1839 	kfree(info);
1840 	return ret;
1841 }
1842 
1843 static void domain_detach_iommu(struct dmar_domain *domain,
1844 				struct intel_iommu *iommu)
1845 {
1846 	struct iommu_domain_info *info;
1847 
1848 	spin_lock(&iommu->lock);
1849 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1850 	if (--info->refcnt == 0) {
1851 		clear_bit(info->did, iommu->domain_ids);
1852 		xa_erase(&domain->iommu_array, iommu->seq_id);
1853 		domain->nid = NUMA_NO_NODE;
1854 		domain_update_iommu_cap(domain);
1855 		kfree(info);
1856 	}
1857 	spin_unlock(&iommu->lock);
1858 }
1859 
1860 static inline int guestwidth_to_adjustwidth(int gaw)
1861 {
1862 	int agaw;
1863 	int r = (gaw - 12) % 9;
1864 
1865 	if (r == 0)
1866 		agaw = gaw;
1867 	else
1868 		agaw = gaw + 9 - r;
1869 	if (agaw > 64)
1870 		agaw = 64;
1871 	return agaw;
1872 }
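/*
 * For example, guestwidth_to_adjustwidth(48) gives r = 0 and agaw = 48,
 * while a gaw of 40 gives r = 1 and agaw = 48 (rounded up to the next
 * 9-bit level boundary); results above 64 are capped at 64.
 */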
1873 
1874 static void domain_exit(struct dmar_domain *domain)
1875 {
1876 	if (domain->pgd) {
1877 		LIST_HEAD(freelist);
1878 
1879 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1880 		put_pages_list(&freelist);
1881 	}
1882 
1883 	if (WARN_ON(!list_empty(&domain->devices)))
1884 		return;
1885 
1886 	kfree(domain);
1887 }
1888 
1889 /*
1890  * Get the PASID directory size for scalable mode context entry.
1891  * Value of X in the PDTS field of a scalable mode context entry
1892  * indicates PASID directory with 2^(X + 7) entries.
1893  */
1894 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1895 {
1896 	unsigned long pds, max_pde;
1897 
1898 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1899 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1900 	if (pds < 7)
1901 		return 0;
1902 
1903 	return pds - 7;
1904 }
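/*
 * Worked example for context_get_sm_pds() (assuming PASID_PDE_SHIFT is 6,
 * i.e. 64 PASIDs per directory entry): max_pasid = 0x100000 (2^20) gives
 * max_pde = 2^14, find_first_bit() returns 14, and the function returns
 * 14 - 7 = 7, encoding a PASID directory of 2^(7 + 7) = 2^14 entries.
 */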
1905 
1906 /*
1907  * Set the RID_PASID field of a scalable mode context entry. The
1908  * IOMMU hardware will use the PASID value programmed in this field
1909  * when translating DMA requests that carry no PASID.
1910  */
1911 static inline void
1912 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1913 {
1914 	context->hi |= pasid & ((1 << 20) - 1);
1915 }
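/*
 * The RID_PASID value occupies the low 20 bits of context->hi; callers in
 * this file pass IOMMU_NO_PASID here so that requests without a PASID are
 * translated through the corresponding PASID table entry.
 */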
1916 
1917 /*
1918  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1919  * entry.
1920  */
1921 static inline void context_set_sm_dte(struct context_entry *context)
1922 {
1923 	context->lo |= BIT_ULL(2);
1924 }
1925 
1926 /*
1927  * Set the PRE (Page Request Enable) field of a scalable mode context
1928  * entry.
1929  */
1930 static inline void context_set_sm_pre(struct context_entry *context)
1931 {
1932 	context->lo |= BIT_ULL(4);
1933 }
1934 
1935 /* Convert value to context PASID directory size field coding. */
1936 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1937 
1938 static int domain_context_mapping_one(struct dmar_domain *domain,
1939 				      struct intel_iommu *iommu,
1940 				      struct pasid_table *table,
1941 				      u8 bus, u8 devfn)
1942 {
1943 	struct device_domain_info *info =
1944 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1945 	u16 did = domain_id_iommu(domain, iommu);
1946 	int translation = CONTEXT_TT_MULTI_LEVEL;
1947 	struct context_entry *context;
1948 	int ret;
1949 
1950 	if (hw_pass_through && domain_type_is_si(domain))
1951 		translation = CONTEXT_TT_PASS_THROUGH;
1952 
1953 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1954 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1955 
1956 	spin_lock(&iommu->lock);
1957 	ret = -ENOMEM;
1958 	context = iommu_context_addr(iommu, bus, devfn, 1);
1959 	if (!context)
1960 		goto out_unlock;
1961 
1962 	ret = 0;
1963 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1964 		goto out_unlock;
1965 
1966 	/*
1967 	 * In the kdump case, old valid entries may still be cached because
1968 	 * of in-flight DMA against the copied page tables, and nothing
1969 	 * ever unmaps them, so the newly-mapped device needs an explicit
1970 	 * cache flush. At this point in kdump, the device is expected to
1971 	 * have finished its reset during driver probe, so no in-flight
1972 	 * DMA will exist and there is nothing further to worry about
1973 	 * hereafter.
1974 	 */
1975 	if (context_copied(iommu, bus, devfn)) {
1976 		u16 did_old = context_domain_id(context);
1977 
1978 		if (did_old < cap_ndoms(iommu->cap)) {
1979 			iommu->flush.flush_context(iommu, did_old,
1980 						   (((u16)bus) << 8) | devfn,
1981 						   DMA_CCMD_MASK_NOBIT,
1982 						   DMA_CCMD_DEVICE_INVL);
1983 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1984 						 DMA_TLB_DSI_FLUSH);
1985 		}
1986 
1987 		clear_context_copied(iommu, bus, devfn);
1988 	}
1989 
1990 	context_clear_entry(context);
1991 
1992 	if (sm_supported(iommu)) {
1993 		unsigned long pds;
1994 
1995 		/* Setup the PASID DIR pointer: */
1996 		pds = context_get_sm_pds(table);
1997 		context->lo = (u64)virt_to_phys(table->table) |
1998 				context_pdts(pds);
1999 
2000 		/* Setup the RID_PASID field: */
2001 		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
2002 
2003 		/*
2004 		 * Setup the Device-TLB enable bit and Page request
2005 		 * Enable bit:
2006 		 */
2007 		if (info && info->ats_supported)
2008 			context_set_sm_dte(context);
2009 		if (info && info->pri_supported)
2010 			context_set_sm_pre(context);
2011 		if (info && info->pasid_supported)
2012 			context_set_pasid(context);
2013 	} else {
2014 		struct dma_pte *pgd = domain->pgd;
2015 		int agaw;
2016 
2017 		context_set_domain_id(context, did);
2018 
2019 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2020 			/*
2021 			 * Skip the top levels of the page tables for an IOMMU whose
2022 			 * agaw is smaller than the domain's. Unnecessary for PT mode.
2023 			 */
2024 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2025 				ret = -ENOMEM;
2026 				pgd = phys_to_virt(dma_pte_addr(pgd));
2027 				if (!dma_pte_present(pgd))
2028 					goto out_unlock;
2029 			}
2030 
2031 			if (info && info->ats_supported)
2032 				translation = CONTEXT_TT_DEV_IOTLB;
2033 			else
2034 				translation = CONTEXT_TT_MULTI_LEVEL;
2035 
2036 			context_set_address_root(context, virt_to_phys(pgd));
2037 			context_set_address_width(context, agaw);
2038 		} else {
2039 			/*
2040 			 * In pass-through mode, AW must be programmed with the
2041 			 * largest AGAW value supported by the hardware, and the
2042 			 * ASR field is ignored by the hardware.
2043 			 */
2044 			context_set_address_width(context, iommu->msagaw);
2045 		}
2046 
2047 		context_set_translation_type(context, translation);
2048 	}
2049 
2050 	context_set_fault_enable(context);
2051 	context_set_present(context);
2052 	if (!ecap_coherent(iommu->ecap))
2053 		clflush_cache_range(context, sizeof(*context));
2054 
2055 	/*
2056 	 * It's a non-present to present mapping. If the hardware doesn't
2057 	 * cache non-present entries we only need to flush the write-buffer.
2058 	 * If it _does_ cache non-present entries, then it does so in the
2059 	 * special domain #0, which we have to flush:
2060 	 */
2061 	if (cap_caching_mode(iommu->cap)) {
2062 		iommu->flush.flush_context(iommu, 0,
2063 					   (((u16)bus) << 8) | devfn,
2064 					   DMA_CCMD_MASK_NOBIT,
2065 					   DMA_CCMD_DEVICE_INVL);
2066 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2067 	} else {
2068 		iommu_flush_write_buffer(iommu);
2069 	}
2070 
2071 	ret = 0;
2072 
2073 out_unlock:
2074 	spin_unlock(&iommu->lock);
2075 
2076 	return ret;
2077 }
2078 
2079 struct domain_context_mapping_data {
2080 	struct dmar_domain *domain;
2081 	struct intel_iommu *iommu;
2082 	struct pasid_table *table;
2083 };
2084 
2085 static int domain_context_mapping_cb(struct pci_dev *pdev,
2086 				     u16 alias, void *opaque)
2087 {
2088 	struct domain_context_mapping_data *data = opaque;
2089 
2090 	return domain_context_mapping_one(data->domain, data->iommu,
2091 					  data->table, PCI_BUS_NUM(alias),
2092 					  alias & 0xff);
2093 }
2094 
2095 static int
2096 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2097 {
2098 	struct domain_context_mapping_data data;
2099 	struct pasid_table *table;
2100 	struct intel_iommu *iommu;
2101 	u8 bus, devfn;
2102 
2103 	iommu = device_to_iommu(dev, &bus, &devfn);
2104 	if (!iommu)
2105 		return -ENODEV;
2106 
2107 	table = intel_pasid_get_table(dev);
2108 
2109 	if (!dev_is_pci(dev))
2110 		return domain_context_mapping_one(domain, iommu, table,
2111 						  bus, devfn);
2112 
2113 	data.domain = domain;
2114 	data.iommu = iommu;
2115 	data.table = table;
2116 
2117 	return pci_for_each_dma_alias(to_pci_dev(dev),
2118 				      &domain_context_mapping_cb, &data);
2119 }
2120 
2121 /* Returns the number of VT-d pages, but aligned to the MM page size */
2122 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2123 					    size_t size)
2124 {
2125 	host_addr &= ~PAGE_MASK;
2126 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2127 }
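/*
 * For example, with 4KiB MM and VT-d pages, host_addr = 0x1234 and
 * size = 0x2000 give an in-page offset of 0x234, PAGE_ALIGN(0x2234) =
 * 0x3000, and therefore 3 VT-d pages.
 */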
2128 
2129 /* Return largest possible superpage level for a given mapping */
2130 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2131 					  unsigned long iov_pfn,
2132 					  unsigned long phy_pfn,
2133 					  unsigned long pages)
2134 {
2135 	int support, level = 1;
2136 	unsigned long pfnmerge;
2137 
2138 	support = domain->iommu_superpage;
2139 
2140 	/* To use a large page, the virtual *and* physical addresses
2141 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2142 	   of them will mean we have to use smaller pages. So just
2143 	   merge them and check both at once. */
2144 	pfnmerge = iov_pfn | phy_pfn;
2145 
2146 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2147 		pages >>= VTD_STRIDE_SHIFT;
2148 		if (!pages)
2149 			break;
2150 		pfnmerge >>= VTD_STRIDE_SHIFT;
2151 		level++;
2152 		support--;
2153 	}
2154 	return level;
2155 }
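/*
 * For example (4KiB base pages, 9-bit stride): iov_pfn = 0x200,
 * phy_pfn = 0x400, pages = 512 and iommu_superpage = 1 return level 2,
 * i.e. a single 2MiB superpage can be used for the mapping.
 */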
2156 
2157 /*
2158  * Ensure that old small page tables are removed to make room for superpage(s).
2159  * We're going to add new large pages, so make sure we don't remove their parent
2160  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2161  */
2162 static void switch_to_super_page(struct dmar_domain *domain,
2163 				 unsigned long start_pfn,
2164 				 unsigned long end_pfn, int level)
2165 {
2166 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2167 	struct iommu_domain_info *info;
2168 	struct dma_pte *pte = NULL;
2169 	unsigned long i;
2170 
2171 	while (start_pfn <= end_pfn) {
2172 		if (!pte)
2173 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2174 					     GFP_ATOMIC);
2175 
2176 		if (dma_pte_present(pte)) {
2177 			dma_pte_free_pagetable(domain, start_pfn,
2178 					       start_pfn + lvl_pages - 1,
2179 					       level + 1);
2180 
2181 			xa_for_each(&domain->iommu_array, i, info)
2182 				iommu_flush_iotlb_psi(info->iommu, domain,
2183 						      start_pfn, lvl_pages,
2184 						      0, 0);
2185 		}
2186 
2187 		pte++;
2188 		start_pfn += lvl_pages;
2189 		if (first_pte_in_page(pte))
2190 			pte = NULL;
2191 	}
2192 }
2193 
2194 static int
2195 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2196 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2197 		 gfp_t gfp)
2198 {
2199 	struct dma_pte *first_pte = NULL, *pte = NULL;
2200 	unsigned int largepage_lvl = 0;
2201 	unsigned long lvl_pages = 0;
2202 	phys_addr_t pteval;
2203 	u64 attr;
2204 
2205 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2206 		return -EINVAL;
2207 
2208 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2209 		return -EINVAL;
2210 
2211 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2212 	attr |= DMA_FL_PTE_PRESENT;
2213 	if (domain->use_first_level) {
2214 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2215 		if (prot & DMA_PTE_WRITE)
2216 			attr |= DMA_FL_PTE_DIRTY;
2217 	}
2218 
2219 	domain->has_mappings = true;
2220 
2221 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2222 
2223 	while (nr_pages > 0) {
2224 		uint64_t tmp;
2225 
2226 		if (!pte) {
2227 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2228 					phys_pfn, nr_pages);
2229 
2230 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2231 					     gfp);
2232 			if (!pte)
2233 				return -ENOMEM;
2234 			first_pte = pte;
2235 
2236 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2237 
2238 			/* It is a large page */
2239 			if (largepage_lvl > 1) {
2240 				unsigned long end_pfn;
2241 				unsigned long pages_to_remove;
2242 
2243 				pteval |= DMA_PTE_LARGE_PAGE;
2244 				pages_to_remove = min_t(unsigned long, nr_pages,
2245 							nr_pte_to_next_page(pte) * lvl_pages);
2246 				end_pfn = iov_pfn + pages_to_remove - 1;
2247 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2248 			} else {
2249 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2250 			}
2251 
2252 		}
2253 		/* We don't need a lock here; nobody else
2254 		 * touches this IOVA range.
2255 		 */
2256 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2257 		if (tmp) {
2258 			static int dumps = 5;
2259 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2260 				iov_pfn, tmp, (unsigned long long)pteval);
2261 			if (dumps) {
2262 				dumps--;
2263 				debug_dma_dump_mappings(NULL);
2264 			}
2265 			WARN_ON(1);
2266 		}
2267 
2268 		nr_pages -= lvl_pages;
2269 		iov_pfn += lvl_pages;
2270 		phys_pfn += lvl_pages;
2271 		pteval += lvl_pages * VTD_PAGE_SIZE;
2272 
2273 		/* If the next PTE would be the first in a new page, then we
2274 		 * need to flush the cache on the entries we've just written.
2275 		 * And then we'll need to recalculate 'pte', so clear it and
2276 		 * let it get set again in the if (!pte) block above.
2277 		 *
2278 		 * If we're done (!nr_pages) we need to flush the cache too.
2279 		 *
2280 		 * Also if we've been setting superpages, we may need to
2281 		 * recalculate 'pte' and switch back to smaller pages for the
2282 		 * end of the mapping, if the trailing size is not enough to
2283 		 * use another superpage (i.e. nr_pages < lvl_pages).
2284 		 */
2285 		pte++;
2286 		if (!nr_pages || first_pte_in_page(pte) ||
2287 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2288 			domain_flush_cache(domain, first_pte,
2289 					   (void *)pte - (void *)first_pte);
2290 			pte = NULL;
2291 		}
2292 	}
2293 
2294 	return 0;
2295 }
2296 
2297 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2298 {
2299 	struct intel_iommu *iommu = info->iommu;
2300 	struct context_entry *context;
2301 	u16 did_old;
2302 
2303 	if (!iommu)
2304 		return;
2305 
2306 	spin_lock(&iommu->lock);
2307 	context = iommu_context_addr(iommu, bus, devfn, 0);
2308 	if (!context) {
2309 		spin_unlock(&iommu->lock);
2310 		return;
2311 	}
2312 
2313 	if (sm_supported(iommu)) {
2314 		if (hw_pass_through && domain_type_is_si(info->domain))
2315 			did_old = FLPT_DEFAULT_DID;
2316 		else
2317 			did_old = domain_id_iommu(info->domain, iommu);
2318 	} else {
2319 		did_old = context_domain_id(context);
2320 	}
2321 
2322 	context_clear_entry(context);
2323 	__iommu_flush_cache(iommu, context, sizeof(*context));
2324 	spin_unlock(&iommu->lock);
2325 	iommu->flush.flush_context(iommu,
2326 				   did_old,
2327 				   (((u16)bus) << 8) | devfn,
2328 				   DMA_CCMD_MASK_NOBIT,
2329 				   DMA_CCMD_DEVICE_INVL);
2330 
2331 	if (sm_supported(iommu))
2332 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2333 
2334 	iommu->flush.flush_iotlb(iommu,
2335 				 did_old,
2336 				 0,
2337 				 0,
2338 				 DMA_TLB_DSI_FLUSH);
2339 
2340 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2341 }
2342 
2343 static int domain_setup_first_level(struct intel_iommu *iommu,
2344 				    struct dmar_domain *domain,
2345 				    struct device *dev,
2346 				    u32 pasid)
2347 {
2348 	struct dma_pte *pgd = domain->pgd;
2349 	int agaw, level;
2350 	int flags = 0;
2351 
2352 	/*
2353 	 * Skip the top levels of the page tables for an IOMMU whose
2354 	 * agaw is smaller than the domain's. Unnecessary for PT mode.
2355 	 */
2356 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2357 		pgd = phys_to_virt(dma_pte_addr(pgd));
2358 		if (!dma_pte_present(pgd))
2359 			return -ENOMEM;
2360 	}
2361 
2362 	level = agaw_to_level(agaw);
2363 	if (level != 4 && level != 5)
2364 		return -EINVAL;
2365 
2366 	if (level == 5)
2367 		flags |= PASID_FLAG_FL5LP;
2368 
2369 	if (domain->force_snooping)
2370 		flags |= PASID_FLAG_PAGE_SNOOP;
2371 
2372 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2373 					     domain_id_iommu(domain, iommu),
2374 					     flags);
2375 }
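/*
 * After skipping any extra top levels above, only a remaining agaw of 2
 * (a 4-level, 48-bit table) or 3 (a 5-level, 57-bit table) is accepted by
 * domain_setup_first_level(); the latter additionally sets
 * PASID_FLAG_FL5LP.
 */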
2376 
2377 static bool dev_is_real_dma_subdevice(struct device *dev)
2378 {
2379 	return dev && dev_is_pci(dev) &&
2380 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2381 }
2382 
2383 static int iommu_domain_identity_map(struct dmar_domain *domain,
2384 				     unsigned long first_vpfn,
2385 				     unsigned long last_vpfn)
2386 {
2387 	/*
2388 	 * The RMRR range might overlap with the physical memory range,
2389 	 * so clear it first.
2390 	 */
2391 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2392 
2393 	return __domain_mapping(domain, first_vpfn,
2394 				first_vpfn, last_vpfn - first_vpfn + 1,
2395 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2396 }
2397 
2398 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2399 
2400 static int __init si_domain_init(int hw)
2401 {
2402 	struct dmar_rmrr_unit *rmrr;
2403 	struct device *dev;
2404 	int i, nid, ret;
2405 
2406 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2407 	if (!si_domain)
2408 		return -EFAULT;
2409 
2410 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2411 		domain_exit(si_domain);
2412 		si_domain = NULL;
2413 		return -EFAULT;
2414 	}
2415 
2416 	if (hw)
2417 		return 0;
2418 
2419 	for_each_online_node(nid) {
2420 		unsigned long start_pfn, end_pfn;
2421 		int i;
2422 
2423 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2424 			ret = iommu_domain_identity_map(si_domain,
2425 					mm_to_dma_pfn_start(start_pfn),
2426 					mm_to_dma_pfn_end(end_pfn-1));
2427 			if (ret)
2428 				return ret;
2429 		}
2430 	}
2431 
2432 	/*
2433 	 * Identity map the RMRRs so that devices with RMRRs can also use
2434 	 * the si_domain.
2435 	 */
2436 	for_each_rmrr_units(rmrr) {
2437 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2438 					  i, dev) {
2439 			unsigned long long start = rmrr->base_address;
2440 			unsigned long long end = rmrr->end_address;
2441 
2442 			if (WARN_ON(end < start ||
2443 				    end >> agaw_to_width(si_domain->agaw)))
2444 				continue;
2445 
2446 			ret = iommu_domain_identity_map(si_domain,
2447 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2448 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2449 			if (ret)
2450 				return ret;
2451 		}
2452 	}
2453 
2454 	return 0;
2455 }
2456 
2457 static int dmar_domain_attach_device(struct dmar_domain *domain,
2458 				     struct device *dev)
2459 {
2460 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2461 	struct intel_iommu *iommu;
2462 	unsigned long flags;
2463 	u8 bus, devfn;
2464 	int ret;
2465 
2466 	iommu = device_to_iommu(dev, &bus, &devfn);
2467 	if (!iommu)
2468 		return -ENODEV;
2469 
2470 	ret = domain_attach_iommu(domain, iommu);
2471 	if (ret)
2472 		return ret;
2473 	info->domain = domain;
2474 	spin_lock_irqsave(&domain->lock, flags);
2475 	list_add(&info->link, &domain->devices);
2476 	spin_unlock_irqrestore(&domain->lock, flags);
2477 
2478 	/* PASID table is mandatory for a PCI device in scalable mode. */
2479 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2480 		/* Setup the PASID entry for requests without PASID: */
2481 		if (hw_pass_through && domain_type_is_si(domain))
2482 			ret = intel_pasid_setup_pass_through(iommu, domain,
2483 					dev, IOMMU_NO_PASID);
2484 		else if (domain->use_first_level)
2485 			ret = domain_setup_first_level(iommu, domain, dev,
2486 					IOMMU_NO_PASID);
2487 		else
2488 			ret = intel_pasid_setup_second_level(iommu, domain,
2489 					dev, IOMMU_NO_PASID);
2490 		if (ret) {
2491 			dev_err(dev, "Setup RID2PASID failed\n");
2492 			device_block_translation(dev);
2493 			return ret;
2494 		}
2495 	}
2496 
2497 	ret = domain_context_mapping(domain, dev);
2498 	if (ret) {
2499 		dev_err(dev, "Domain context map failed\n");
2500 		device_block_translation(dev);
2501 		return ret;
2502 	}
2503 
2504 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2505 		iommu_enable_pci_caps(info);
2506 
2507 	return 0;
2508 }
2509 
2510 /**
2511  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2512  * is relaxable (i.e. is allowed to go unenforced under some conditions)
2513  * @dev: device handle
2514  *
2515  * We assume that PCI USB devices with RMRRs have them largely
2516  * for historical reasons and that the RMRR space is not actively used post
2517  * boot.  This exclusion may change if vendors begin to abuse it.
2518  *
2519  * The same exception is made for graphics devices, with the requirement that
2520  * any use of the RMRR regions will be torn down before assigning the device
2521  * to a guest.
2522  *
2523  * Return: true if the RMRR is relaxable, false otherwise
2524  */
2525 static bool device_rmrr_is_relaxable(struct device *dev)
2526 {
2527 	struct pci_dev *pdev;
2528 
2529 	if (!dev_is_pci(dev))
2530 		return false;
2531 
2532 	pdev = to_pci_dev(dev);
2533 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2534 		return true;
2535 	else
2536 		return false;
2537 }
2538 
2539 /*
2540  * Return the required default domain type for a specific device.
2541  *
2542  * @dev: the device in query
2543  *
2544  * Returns:
2545  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2546  *  - 0: both identity and dynamic domains work for this device
2547  *
2548  * Only the iommu_identity_mapping policy flags and the device class are checked.
2549  */
2550 static int device_def_domain_type(struct device *dev)
2551 {
2552 	if (dev_is_pci(dev)) {
2553 		struct pci_dev *pdev = to_pci_dev(dev);
2554 
2555 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2556 			return IOMMU_DOMAIN_IDENTITY;
2557 
2558 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2559 			return IOMMU_DOMAIN_IDENTITY;
2560 	}
2561 
2562 	return 0;
2563 }
2564 
2565 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2566 {
2567 	/*
2568 	 * Start from a sane IOMMU hardware state.
2569 	 * If queued invalidation has already been initialized by us
2570 	 * (for example, while enabling interrupt remapping), then
2571 	 * things are already rolling from a sane state.
2572 	 */
2573 	if (!iommu->qi) {
2574 		/*
2575 		 * Clear any previous faults.
2576 		 */
2577 		dmar_fault(-1, iommu);
2578 		/*
2579 		 * Disable queued invalidation if supported and already enabled
2580 		 * before OS handover.
2581 		 */
2582 		dmar_disable_qi(iommu);
2583 	}
2584 
2585 	if (dmar_enable_qi(iommu)) {
2586 		/*
2587 		 * Queued invalidation could not be enabled; use register-based invalidation
2588 		 */
2589 		iommu->flush.flush_context = __iommu_flush_context;
2590 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2591 		pr_info("%s: Using Register based invalidation\n",
2592 			iommu->name);
2593 	} else {
2594 		iommu->flush.flush_context = qi_flush_context;
2595 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2596 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2597 	}
2598 }
2599 
2600 static int copy_context_table(struct intel_iommu *iommu,
2601 			      struct root_entry *old_re,
2602 			      struct context_entry **tbl,
2603 			      int bus, bool ext)
2604 {
2605 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2606 	struct context_entry *new_ce = NULL, ce;
2607 	struct context_entry *old_ce = NULL;
2608 	struct root_entry re;
2609 	phys_addr_t old_ce_phys;
2610 
2611 	tbl_idx = ext ? bus * 2 : bus;
2612 	memcpy(&re, old_re, sizeof(re));
2613 
2614 	for (devfn = 0; devfn < 256; devfn++) {
2615 		/* First calculate the correct index */
2616 		idx = (ext ? devfn * 2 : devfn) % 256;
2617 
2618 		if (idx == 0) {
2619 			/* First save what we may have and clean up */
2620 			if (new_ce) {
2621 				tbl[tbl_idx] = new_ce;
2622 				__iommu_flush_cache(iommu, new_ce,
2623 						    VTD_PAGE_SIZE);
2624 				pos = 1;
2625 			}
2626 
2627 			if (old_ce)
2628 				memunmap(old_ce);
2629 
2630 			ret = 0;
2631 			if (devfn < 0x80)
2632 				old_ce_phys = root_entry_lctp(&re);
2633 			else
2634 				old_ce_phys = root_entry_uctp(&re);
2635 
2636 			if (!old_ce_phys) {
2637 				if (ext && devfn == 0) {
2638 					/* No LCTP, try UCTP */
2639 					devfn = 0x7f;
2640 					continue;
2641 				} else {
2642 					goto out;
2643 				}
2644 			}
2645 
2646 			ret = -ENOMEM;
2647 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2648 					MEMREMAP_WB);
2649 			if (!old_ce)
2650 				goto out;
2651 
2652 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2653 			if (!new_ce)
2654 				goto out_unmap;
2655 
2656 			ret = 0;
2657 		}
2658 
2659 		/* Now copy the context entry */
2660 		memcpy(&ce, old_ce + idx, sizeof(ce));
2661 
2662 		if (!context_present(&ce))
2663 			continue;
2664 
2665 		did = context_domain_id(&ce);
2666 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2667 			set_bit(did, iommu->domain_ids);
2668 
2669 		set_context_copied(iommu, bus, devfn);
2670 		new_ce[idx] = ce;
2671 	}
2672 
2673 	tbl[tbl_idx + pos] = new_ce;
2674 
2675 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2676 
2677 out_unmap:
2678 	memunmap(old_ce);
2679 
2680 out:
2681 	return ret;
2682 }
2683 
2684 static int copy_translation_tables(struct intel_iommu *iommu)
2685 {
2686 	struct context_entry **ctxt_tbls;
2687 	struct root_entry *old_rt;
2688 	phys_addr_t old_rt_phys;
2689 	int ctxt_table_entries;
2690 	u64 rtaddr_reg;
2691 	int bus, ret;
2692 	bool new_ext, ext;
2693 
2694 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2695 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2696 	new_ext    = !!sm_supported(iommu);
2697 
2698 	/*
2699 	 * The RTT bit can only be changed when translation is disabled,
2700 	 * but disabling translation would open a window for data
2701 	 * corruption. So bail out and don't copy anything if we would
2702 	 * have to change the bit.
2703 	 */
2704 	if (new_ext != ext)
2705 		return -EINVAL;
2706 
2707 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2708 	if (!iommu->copied_tables)
2709 		return -ENOMEM;
2710 
2711 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2712 	if (!old_rt_phys)
2713 		return -EINVAL;
2714 
2715 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2716 	if (!old_rt)
2717 		return -ENOMEM;
2718 
2719 	/* This is too big for the stack - allocate it from slab */
2720 	ctxt_table_entries = ext ? 512 : 256;
2721 	ret = -ENOMEM;
2722 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2723 	if (!ctxt_tbls)
2724 		goto out_unmap;
2725 
2726 	for (bus = 0; bus < 256; bus++) {
2727 		ret = copy_context_table(iommu, &old_rt[bus],
2728 					 ctxt_tbls, bus, ext);
2729 		if (ret) {
2730 			pr_err("%s: Failed to copy context table for bus %d\n",
2731 				iommu->name, bus);
2732 			continue;
2733 		}
2734 	}
2735 
2736 	spin_lock(&iommu->lock);
2737 
2738 	/* Context tables are copied, now write them to the root_entry table */
2739 	for (bus = 0; bus < 256; bus++) {
2740 		int idx = ext ? bus * 2 : bus;
2741 		u64 val;
2742 
2743 		if (ctxt_tbls[idx]) {
2744 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2745 			iommu->root_entry[bus].lo = val;
2746 		}
2747 
2748 		if (!ext || !ctxt_tbls[idx + 1])
2749 			continue;
2750 
2751 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2752 		iommu->root_entry[bus].hi = val;
2753 	}
2754 
2755 	spin_unlock(&iommu->lock);
2756 
2757 	kfree(ctxt_tbls);
2758 
2759 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2760 
2761 	ret = 0;
2762 
2763 out_unmap:
2764 	memunmap(old_rt);
2765 
2766 	return ret;
2767 }
2768 
2769 static int __init init_dmars(void)
2770 {
2771 	struct dmar_drhd_unit *drhd;
2772 	struct intel_iommu *iommu;
2773 	int ret;
2774 
2775 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2776 	if (ret)
2777 		goto free_iommu;
2778 
2779 	for_each_iommu(iommu, drhd) {
2780 		if (drhd->ignored) {
2781 			iommu_disable_translation(iommu);
2782 			continue;
2783 		}
2784 
2785 		/*
2786 		 * Find the max PASID size of all IOMMUs in the system.
2787 		 * We need to ensure the system-wide PASID table is no bigger
2788 		 * than the smallest supported size.
2789 		 */
2790 		if (pasid_supported(iommu)) {
2791 			u32 temp = 2 << ecap_pss(iommu->ecap);
2792 
2793 			intel_pasid_max_id = min_t(u32, temp,
2794 						   intel_pasid_max_id);
2795 		}
2796 
2797 		intel_iommu_init_qi(iommu);
2798 
2799 		ret = iommu_init_domains(iommu);
2800 		if (ret)
2801 			goto free_iommu;
2802 
2803 		init_translation_status(iommu);
2804 
2805 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2806 			iommu_disable_translation(iommu);
2807 			clear_translation_pre_enabled(iommu);
2808 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2809 				iommu->name);
2810 		}
2811 
2812 		/*
2813 		 * TBD:
2814 		 * We could share the same root & context tables
2815 		 * among all IOMMUs. This needs to be split out later.
2816 		 */
2817 		ret = iommu_alloc_root_entry(iommu);
2818 		if (ret)
2819 			goto free_iommu;
2820 
2821 		if (translation_pre_enabled(iommu)) {
2822 			pr_info("Translation already enabled - trying to copy translation structures\n");
2823 
2824 			ret = copy_translation_tables(iommu);
2825 			if (ret) {
2826 				/*
2827 				 * We found the IOMMU with translation
2828 				 * enabled - but failed to copy over the
2829 				 * old root-entry table. Try to proceed
2830 				 * by disabling translation now and
2831 				 * allocating a clean root-entry table.
2832 				 * This might cause DMAR faults, but
2833 				 * probably the dump will still succeed.
2834 				 */
2835 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2836 				       iommu->name);
2837 				iommu_disable_translation(iommu);
2838 				clear_translation_pre_enabled(iommu);
2839 			} else {
2840 				pr_info("Copied translation tables from previous kernel for %s\n",
2841 					iommu->name);
2842 			}
2843 		}
2844 
2845 		if (!ecap_pass_through(iommu->ecap))
2846 			hw_pass_through = 0;
2847 		intel_svm_check(iommu);
2848 	}
2849 
2850 	/*
2851 	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
2852 	 * caches. This is required on some Intel X58 chipsets; otherwise the
2853 	 * flush_context function will loop forever and the boot hangs.
2854 	 */
2855 	for_each_active_iommu(iommu, drhd) {
2856 		iommu_flush_write_buffer(iommu);
2857 		iommu_set_root_entry(iommu);
2858 	}
2859 
2860 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2861 	dmar_map_gfx = 0;
2862 #endif
2863 
2864 	if (!dmar_map_gfx)
2865 		iommu_identity_mapping |= IDENTMAP_GFX;
2866 
2867 	check_tylersburg_isoch();
2868 
2869 	ret = si_domain_init(hw_pass_through);
2870 	if (ret)
2871 		goto free_iommu;
2872 
2873 	/*
2874 	 * for each drhd
2875 	 *   enable fault log
2876 	 *   global invalidate context cache
2877 	 *   global invalidate iotlb
2878 	 *   enable translation
2879 	 */
2880 	for_each_iommu(iommu, drhd) {
2881 		if (drhd->ignored) {
2882 			/*
2883 			 * we always have to disable PMRs or DMA may fail on
2884 			 * this device
2885 			 */
2886 			if (force_on)
2887 				iommu_disable_protect_mem_regions(iommu);
2888 			continue;
2889 		}
2890 
2891 		iommu_flush_write_buffer(iommu);
2892 
2893 #ifdef CONFIG_INTEL_IOMMU_SVM
2894 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2895 			/*
2896 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2897 			 * could cause a lock race condition, so drop the lock here.
2898 			 */
2899 			up_write(&dmar_global_lock);
2900 			ret = intel_svm_enable_prq(iommu);
2901 			down_write(&dmar_global_lock);
2902 			if (ret)
2903 				goto free_iommu;
2904 		}
2905 #endif
2906 		ret = dmar_set_interrupt(iommu);
2907 		if (ret)
2908 			goto free_iommu;
2909 	}
2910 
2911 	return 0;
2912 
2913 free_iommu:
2914 	for_each_active_iommu(iommu, drhd) {
2915 		disable_dmar_iommu(iommu);
2916 		free_dmar_iommu(iommu);
2917 	}
2918 	if (si_domain) {
2919 		domain_exit(si_domain);
2920 		si_domain = NULL;
2921 	}
2922 
2923 	return ret;
2924 }
2925 
2926 static void __init init_no_remapping_devices(void)
2927 {
2928 	struct dmar_drhd_unit *drhd;
2929 	struct device *dev;
2930 	int i;
2931 
2932 	for_each_drhd_unit(drhd) {
2933 		if (!drhd->include_all) {
2934 			for_each_active_dev_scope(drhd->devices,
2935 						  drhd->devices_cnt, i, dev)
2936 				break;
2937 			/* ignore DMAR unit if no devices exist */
2938 			if (i == drhd->devices_cnt)
2939 				drhd->ignored = 1;
2940 		}
2941 	}
2942 
2943 	for_each_active_drhd_unit(drhd) {
2944 		if (drhd->include_all)
2945 			continue;
2946 
2947 		for_each_active_dev_scope(drhd->devices,
2948 					  drhd->devices_cnt, i, dev)
2949 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2950 				break;
2951 		if (i < drhd->devices_cnt)
2952 			continue;
2953 
2954 		/* This IOMMU has *only* gfx devices. Mark it as dedicated to
2955 		   graphics and, if gfx mapping is disabled, bypass it entirely. */
2956 		drhd->gfx_dedicated = 1;
2957 		if (!dmar_map_gfx)
2958 			drhd->ignored = 1;
2959 	}
2960 }
2961 
2962 #ifdef CONFIG_SUSPEND
2963 static int init_iommu_hw(void)
2964 {
2965 	struct dmar_drhd_unit *drhd;
2966 	struct intel_iommu *iommu = NULL;
2967 	int ret;
2968 
2969 	for_each_active_iommu(iommu, drhd) {
2970 		if (iommu->qi) {
2971 			ret = dmar_reenable_qi(iommu);
2972 			if (ret)
2973 				return ret;
2974 		}
2975 	}
2976 
2977 	for_each_iommu(iommu, drhd) {
2978 		if (drhd->ignored) {
2979 			/*
2980 			 * we always have to disable PMRs or DMA may fail on
2981 			 * this device
2982 			 */
2983 			if (force_on)
2984 				iommu_disable_protect_mem_regions(iommu);
2985 			continue;
2986 		}
2987 
2988 		iommu_flush_write_buffer(iommu);
2989 		iommu_set_root_entry(iommu);
2990 		iommu_enable_translation(iommu);
2991 		iommu_disable_protect_mem_regions(iommu);
2992 	}
2993 
2994 	return 0;
2995 }
2996 
2997 static void iommu_flush_all(void)
2998 {
2999 	struct dmar_drhd_unit *drhd;
3000 	struct intel_iommu *iommu;
3001 
3002 	for_each_active_iommu(iommu, drhd) {
3003 		iommu->flush.flush_context(iommu, 0, 0, 0,
3004 					   DMA_CCMD_GLOBAL_INVL);
3005 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3006 					 DMA_TLB_GLOBAL_FLUSH);
3007 	}
3008 }
3009 
3010 static int iommu_suspend(void)
3011 {
3012 	struct dmar_drhd_unit *drhd;
3013 	struct intel_iommu *iommu = NULL;
3014 	unsigned long flag;
3015 
3016 	iommu_flush_all();
3017 
3018 	for_each_active_iommu(iommu, drhd) {
3019 		iommu_disable_translation(iommu);
3020 
3021 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3022 
3023 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3024 			readl(iommu->reg + DMAR_FECTL_REG);
3025 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3026 			readl(iommu->reg + DMAR_FEDATA_REG);
3027 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3028 			readl(iommu->reg + DMAR_FEADDR_REG);
3029 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3030 			readl(iommu->reg + DMAR_FEUADDR_REG);
3031 
3032 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3033 	}
3034 	return 0;
3035 }
3036 
3037 static void iommu_resume(void)
3038 {
3039 	struct dmar_drhd_unit *drhd;
3040 	struct intel_iommu *iommu = NULL;
3041 	unsigned long flag;
3042 
3043 	if (init_iommu_hw()) {
3044 		if (force_on)
3045 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3046 		else
3047 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3048 		return;
3049 	}
3050 
3051 	for_each_active_iommu(iommu, drhd) {
3052 
3053 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3054 
3055 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3056 			iommu->reg + DMAR_FECTL_REG);
3057 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3058 			iommu->reg + DMAR_FEDATA_REG);
3059 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3060 			iommu->reg + DMAR_FEADDR_REG);
3061 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3062 			iommu->reg + DMAR_FEUADDR_REG);
3063 
3064 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3065 	}
3066 }
3067 
3068 static struct syscore_ops iommu_syscore_ops = {
3069 	.resume		= iommu_resume,
3070 	.suspend	= iommu_suspend,
3071 };
3072 
3073 static void __init init_iommu_pm_ops(void)
3074 {
3075 	register_syscore_ops(&iommu_syscore_ops);
3076 }
3077 
3078 #else
3079 static inline void init_iommu_pm_ops(void) {}
3080 #endif	/* CONFIG_SUSPEND */
3081 
3082 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3083 {
3084 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3085 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3086 	    rmrr->end_address <= rmrr->base_address ||
3087 	    arch_rmrr_sanity_check(rmrr))
3088 		return -EINVAL;
3089 
3090 	return 0;
3091 }
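/*
 * For example, with 4KiB pages an RMRR of [0x7c000000 - 0x7c0fffff] passes
 * these checks (base and end_address + 1 are page aligned, end above base),
 * subject to arch_rmrr_sanity_check(); an RMRR ending at 0x7c0ffffe would
 * fail the alignment check.
 */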
3092 
3093 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3094 {
3095 	struct acpi_dmar_reserved_memory *rmrr;
3096 	struct dmar_rmrr_unit *rmrru;
3097 
3098 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3099 	if (rmrr_sanity_check(rmrr)) {
3100 		pr_warn(FW_BUG
3101 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3102 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3103 			   rmrr->base_address, rmrr->end_address,
3104 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3105 			   dmi_get_system_info(DMI_BIOS_VERSION),
3106 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3107 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3108 	}
3109 
3110 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3111 	if (!rmrru)
3112 		goto out;
3113 
3114 	rmrru->hdr = header;
3115 
3116 	rmrru->base_address = rmrr->base_address;
3117 	rmrru->end_address = rmrr->end_address;
3118 
3119 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3120 				((void *)rmrr) + rmrr->header.length,
3121 				&rmrru->devices_cnt);
3122 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3123 		goto free_rmrru;
3124 
3125 	list_add(&rmrru->list, &dmar_rmrr_units);
3126 
3127 	return 0;
3128 free_rmrru:
3129 	kfree(rmrru);
3130 out:
3131 	return -ENOMEM;
3132 }
3133 
3134 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3135 {
3136 	struct dmar_atsr_unit *atsru;
3137 	struct acpi_dmar_atsr *tmp;
3138 
3139 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3140 				dmar_rcu_check()) {
3141 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3142 		if (atsr->segment != tmp->segment)
3143 			continue;
3144 		if (atsr->header.length != tmp->header.length)
3145 			continue;
3146 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3147 			return atsru;
3148 	}
3149 
3150 	return NULL;
3151 }
3152 
3153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3154 {
3155 	struct acpi_dmar_atsr *atsr;
3156 	struct dmar_atsr_unit *atsru;
3157 
3158 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3159 		return 0;
3160 
3161 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3162 	atsru = dmar_find_atsr(atsr);
3163 	if (atsru)
3164 		return 0;
3165 
3166 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3167 	if (!atsru)
3168 		return -ENOMEM;
3169 
3170 	/*
3171 	 * If the memory was allocated from the slab by an ACPI _DSM method,
3172 	 * we need to copy its contents because the memory buffer will be
3173 	 * freed on return.
3174 	 */
3175 	atsru->hdr = (void *)(atsru + 1);
3176 	memcpy(atsru->hdr, hdr, hdr->length);
3177 	atsru->include_all = atsr->flags & 0x1;
3178 	if (!atsru->include_all) {
3179 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3180 				(void *)atsr + atsr->header.length,
3181 				&atsru->devices_cnt);
3182 		if (atsru->devices_cnt && atsru->devices == NULL) {
3183 			kfree(atsru);
3184 			return -ENOMEM;
3185 		}
3186 	}
3187 
3188 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3189 
3190 	return 0;
3191 }
3192 
3193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3194 {
3195 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3196 	kfree(atsru);
3197 }
3198 
3199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3200 {
3201 	struct acpi_dmar_atsr *atsr;
3202 	struct dmar_atsr_unit *atsru;
3203 
3204 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3205 	atsru = dmar_find_atsr(atsr);
3206 	if (atsru) {
3207 		list_del_rcu(&atsru->list);
3208 		synchronize_rcu();
3209 		intel_iommu_free_atsr(atsru);
3210 	}
3211 
3212 	return 0;
3213 }
3214 
3215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3216 {
3217 	int i;
3218 	struct device *dev;
3219 	struct acpi_dmar_atsr *atsr;
3220 	struct dmar_atsr_unit *atsru;
3221 
3222 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3223 	atsru = dmar_find_atsr(atsr);
3224 	if (!atsru)
3225 		return 0;
3226 
3227 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3228 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3229 					  i, dev)
3230 			return -EBUSY;
3231 	}
3232 
3233 	return 0;
3234 }
3235 
3236 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3237 {
3238 	struct dmar_satc_unit *satcu;
3239 	struct acpi_dmar_satc *tmp;
3240 
3241 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3242 				dmar_rcu_check()) {
3243 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3244 		if (satc->segment != tmp->segment)
3245 			continue;
3246 		if (satc->header.length != tmp->header.length)
3247 			continue;
3248 		if (memcmp(satc, tmp, satc->header.length) == 0)
3249 			return satcu;
3250 	}
3251 
3252 	return NULL;
3253 }
3254 
3255 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3256 {
3257 	struct acpi_dmar_satc *satc;
3258 	struct dmar_satc_unit *satcu;
3259 
3260 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3261 		return 0;
3262 
3263 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3264 	satcu = dmar_find_satc(satc);
3265 	if (satcu)
3266 		return 0;
3267 
3268 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3269 	if (!satcu)
3270 		return -ENOMEM;
3271 
3272 	satcu->hdr = (void *)(satcu + 1);
3273 	memcpy(satcu->hdr, hdr, hdr->length);
3274 	satcu->atc_required = satc->flags & 0x1;
3275 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3276 					      (void *)satc + satc->header.length,
3277 					      &satcu->devices_cnt);
3278 	if (satcu->devices_cnt && !satcu->devices) {
3279 		kfree(satcu);
3280 		return -ENOMEM;
3281 	}
3282 	list_add_rcu(&satcu->list, &dmar_satc_units);
3283 
3284 	return 0;
3285 }
3286 
3287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3288 {
3289 	int sp, ret;
3290 	struct intel_iommu *iommu = dmaru->iommu;
3291 
3292 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3293 	if (ret)
3294 		goto out;
3295 
3296 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3297 		pr_warn("%s: Doesn't support hardware pass through.\n",
3298 			iommu->name);
3299 		return -ENXIO;
3300 	}
3301 
3302 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3303 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3304 		pr_warn("%s: Doesn't support large page.\n",
3305 			iommu->name);
3306 		return -ENXIO;
3307 	}
3308 
3309 	/*
3310 	 * Disable translation if already enabled prior to OS handover.
3311 	 */
3312 	if (iommu->gcmd & DMA_GCMD_TE)
3313 		iommu_disable_translation(iommu);
3314 
3315 	ret = iommu_init_domains(iommu);
3316 	if (ret == 0)
3317 		ret = iommu_alloc_root_entry(iommu);
3318 	if (ret)
3319 		goto out;
3320 
3321 	intel_svm_check(iommu);
3322 
3323 	if (dmaru->ignored) {
3324 		/*
3325 		 * we always have to disable PMRs or DMA may fail on this device
3326 		 */
3327 		if (force_on)
3328 			iommu_disable_protect_mem_regions(iommu);
3329 		return 0;
3330 	}
3331 
3332 	intel_iommu_init_qi(iommu);
3333 	iommu_flush_write_buffer(iommu);
3334 
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337 		ret = intel_svm_enable_prq(iommu);
3338 		if (ret)
3339 			goto disable_iommu;
3340 	}
3341 #endif
3342 	ret = dmar_set_interrupt(iommu);
3343 	if (ret)
3344 		goto disable_iommu;
3345 
3346 	iommu_set_root_entry(iommu);
3347 	iommu_enable_translation(iommu);
3348 
3349 	iommu_disable_protect_mem_regions(iommu);
3350 	return 0;
3351 
3352 disable_iommu:
3353 	disable_dmar_iommu(iommu);
3354 out:
3355 	free_dmar_iommu(iommu);
3356 	return ret;
3357 }
3358 
3359 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3360 {
3361 	int ret = 0;
3362 	struct intel_iommu *iommu = dmaru->iommu;
3363 
3364 	if (!intel_iommu_enabled)
3365 		return 0;
3366 	if (iommu == NULL)
3367 		return -EINVAL;
3368 
3369 	if (insert) {
3370 		ret = intel_iommu_add(dmaru);
3371 	} else {
3372 		disable_dmar_iommu(iommu);
3373 		free_dmar_iommu(iommu);
3374 	}
3375 
3376 	return ret;
3377 }
3378 
3379 static void intel_iommu_free_dmars(void)
3380 {
3381 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3382 	struct dmar_atsr_unit *atsru, *atsr_n;
3383 	struct dmar_satc_unit *satcu, *satc_n;
3384 
3385 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3386 		list_del(&rmrru->list);
3387 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3388 		kfree(rmrru);
3389 	}
3390 
3391 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3392 		list_del(&atsru->list);
3393 		intel_iommu_free_atsr(atsru);
3394 	}
3395 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3396 		list_del(&satcu->list);
3397 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3398 		kfree(satcu);
3399 	}
3400 }
3401 
3402 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3403 {
3404 	struct dmar_satc_unit *satcu;
3405 	struct acpi_dmar_satc *satc;
3406 	struct device *tmp;
3407 	int i;
3408 
3409 	dev = pci_physfn(dev);
3410 	rcu_read_lock();
3411 
3412 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3413 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3414 		if (satc->segment != pci_domain_nr(dev->bus))
3415 			continue;
3416 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3417 			if (to_pci_dev(tmp) == dev)
3418 				goto out;
3419 	}
3420 	satcu = NULL;
3421 out:
3422 	rcu_read_unlock();
3423 	return satcu;
3424 }
3425 
3426 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3427 {
3428 	int i, ret = 1;
3429 	struct pci_bus *bus;
3430 	struct pci_dev *bridge = NULL;
3431 	struct device *tmp;
3432 	struct acpi_dmar_atsr *atsr;
3433 	struct dmar_atsr_unit *atsru;
3434 	struct dmar_satc_unit *satcu;
3435 
3436 	dev = pci_physfn(dev);
3437 	satcu = dmar_find_matched_satc_unit(dev);
3438 	if (satcu)
3439 		/*
3440 		 * This device supports ATS because it is listed in the SATC
3441 		 * table. When the IOMMU is in legacy mode, the hardware
3442 		 * enables ATS automatically for devices that require it,
3443 		 * so the OS should not enable ATS on this device again,
3444 		 * which would cause duplicated TLB invalidations.
3445 		 */
3446 		return !(satcu->atc_required && !sm_supported(iommu));
3447 
3448 	for (bus = dev->bus; bus; bus = bus->parent) {
3449 		bridge = bus->self;
3450 		/* If it's an integrated device, allow ATS */
3451 		if (!bridge)
3452 			return 1;
3453 		/* Connected via non-PCIe: no ATS */
3454 		if (!pci_is_pcie(bridge) ||
3455 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3456 			return 0;
3457 		/* If we found the root port, look it up in the ATSR */
3458 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3459 			break;
3460 	}
3461 
3462 	rcu_read_lock();
3463 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3464 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3465 		if (atsr->segment != pci_domain_nr(dev->bus))
3466 			continue;
3467 
3468 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3469 			if (tmp == &bridge->dev)
3470 				goto out;
3471 
3472 		if (atsru->include_all)
3473 			goto out;
3474 	}
3475 	ret = 0;
3476 out:
3477 	rcu_read_unlock();
3478 
3479 	return ret;
3480 }
3481 
3482 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3483 {
3484 	int ret;
3485 	struct dmar_rmrr_unit *rmrru;
3486 	struct dmar_atsr_unit *atsru;
3487 	struct dmar_satc_unit *satcu;
3488 	struct acpi_dmar_atsr *atsr;
3489 	struct acpi_dmar_reserved_memory *rmrr;
3490 	struct acpi_dmar_satc *satc;
3491 
3492 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3493 		return 0;
3494 
3495 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3496 		rmrr = container_of(rmrru->hdr,
3497 				    struct acpi_dmar_reserved_memory, header);
3498 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3499 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3500 				((void *)rmrr) + rmrr->header.length,
3501 				rmrr->segment, rmrru->devices,
3502 				rmrru->devices_cnt);
3503 			if (ret < 0)
3504 				return ret;
3505 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3506 			dmar_remove_dev_scope(info, rmrr->segment,
3507 				rmrru->devices, rmrru->devices_cnt);
3508 		}
3509 	}
3510 
3511 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3512 		if (atsru->include_all)
3513 			continue;
3514 
3515 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3516 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3517 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3518 					(void *)atsr + atsr->header.length,
3519 					atsr->segment, atsru->devices,
3520 					atsru->devices_cnt);
3521 			if (ret > 0)
3522 				break;
3523 			else if (ret < 0)
3524 				return ret;
3525 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3526 			if (dmar_remove_dev_scope(info, atsr->segment,
3527 					atsru->devices, atsru->devices_cnt))
3528 				break;
3529 		}
3530 	}
3531 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3532 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3533 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3534 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3535 					(void *)satc + satc->header.length,
3536 					satc->segment, satcu->devices,
3537 					satcu->devices_cnt);
3538 			if (ret > 0)
3539 				break;
3540 			else if (ret < 0)
3541 				return ret;
3542 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3543 			if (dmar_remove_dev_scope(info, satc->segment,
3544 					satcu->devices, satcu->devices_cnt))
3545 				break;
3546 		}
3547 	}
3548 
3549 	return 0;
3550 }
3551 
3552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3553 				       unsigned long val, void *v)
3554 {
3555 	struct memory_notify *mhp = v;
3556 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3557 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3558 			mhp->nr_pages - 1);
3559 
3560 	switch (val) {
3561 	case MEM_GOING_ONLINE:
3562 		if (iommu_domain_identity_map(si_domain,
3563 					      start_vpfn, last_vpfn)) {
3564 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3565 				start_vpfn, last_vpfn);
3566 			return NOTIFY_BAD;
3567 		}
3568 		break;
3569 
3570 	case MEM_OFFLINE:
3571 	case MEM_CANCEL_ONLINE:
3572 		{
3573 			struct dmar_drhd_unit *drhd;
3574 			struct intel_iommu *iommu;
3575 			LIST_HEAD(freelist);
3576 
3577 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3578 
3579 			rcu_read_lock();
3580 			for_each_active_iommu(iommu, drhd)
3581 				iommu_flush_iotlb_psi(iommu, si_domain,
3582 					start_vpfn, mhp->nr_pages,
3583 					list_empty(&freelist), 0);
3584 			rcu_read_unlock();
3585 			put_pages_list(&freelist);
3586 		}
3587 		break;
3588 	}
3589 
3590 	return NOTIFY_OK;
3591 }
3592 
3593 static struct notifier_block intel_iommu_memory_nb = {
3594 	.notifier_call = intel_iommu_memory_notifier,
3595 	.priority = 0
3596 };
3597 
3598 static void intel_disable_iommus(void)
3599 {
3600 	struct intel_iommu *iommu = NULL;
3601 	struct dmar_drhd_unit *drhd;
3602 
3603 	for_each_iommu(iommu, drhd)
3604 		iommu_disable_translation(iommu);
3605 }
3606 
3607 void intel_iommu_shutdown(void)
3608 {
3609 	struct dmar_drhd_unit *drhd;
3610 	struct intel_iommu *iommu = NULL;
3611 
3612 	if (no_iommu || dmar_disabled)
3613 		return;
3614 
3615 	down_write(&dmar_global_lock);
3616 
3617 	/* Disable PMRs explicitly here. */
3618 	for_each_iommu(iommu, drhd)
3619 		iommu_disable_protect_mem_regions(iommu);
3620 
3621 	/* Make sure the IOMMUs are switched off */
3622 	intel_disable_iommus();
3623 
3624 	up_write(&dmar_global_lock);
3625 }
3626 
3627 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3628 {
3629 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3630 
3631 	return container_of(iommu_dev, struct intel_iommu, iommu);
3632 }
3633 
3634 static ssize_t version_show(struct device *dev,
3635 			    struct device_attribute *attr, char *buf)
3636 {
3637 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3638 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3639 	return sysfs_emit(buf, "%d:%d\n",
3640 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3641 }
3642 static DEVICE_ATTR_RO(version);
3643 
3644 static ssize_t address_show(struct device *dev,
3645 			    struct device_attribute *attr, char *buf)
3646 {
3647 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3648 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3649 }
3650 static DEVICE_ATTR_RO(address);
3651 
3652 static ssize_t cap_show(struct device *dev,
3653 			struct device_attribute *attr, char *buf)
3654 {
3655 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3656 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3657 }
3658 static DEVICE_ATTR_RO(cap);
3659 
3660 static ssize_t ecap_show(struct device *dev,
3661 			 struct device_attribute *attr, char *buf)
3662 {
3663 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3664 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3665 }
3666 static DEVICE_ATTR_RO(ecap);
3667 
3668 static ssize_t domains_supported_show(struct device *dev,
3669 				      struct device_attribute *attr, char *buf)
3670 {
3671 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3672 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3673 }
3674 static DEVICE_ATTR_RO(domains_supported);
3675 
3676 static ssize_t domains_used_show(struct device *dev,
3677 				 struct device_attribute *attr, char *buf)
3678 {
3679 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3680 	return sysfs_emit(buf, "%d\n",
3681 			  bitmap_weight(iommu->domain_ids,
3682 					cap_ndoms(iommu->cap)));
3683 }
3684 static DEVICE_ATTR_RO(domains_used);
3685 
3686 static struct attribute *intel_iommu_attrs[] = {
3687 	&dev_attr_version.attr,
3688 	&dev_attr_address.attr,
3689 	&dev_attr_cap.attr,
3690 	&dev_attr_ecap.attr,
3691 	&dev_attr_domains_supported.attr,
3692 	&dev_attr_domains_used.attr,
3693 	NULL,
3694 };
3695 
3696 static struct attribute_group intel_iommu_group = {
3697 	.name = "intel-iommu",
3698 	.attrs = intel_iommu_attrs,
3699 };
3700 
3701 const struct attribute_group *intel_iommu_groups[] = {
3702 	&intel_iommu_group,
3703 	NULL,
3704 };
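/*
 * These attributes are typically exposed under
 * /sys/class/iommu/dmar<N>/intel-iommu/; e.g. reading "domains_used" there
 * reports how many domain IDs are currently allocated on that unit.
 */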
3705 
3706 static inline bool has_external_pci(void)
3707 {
3708 	struct pci_dev *pdev = NULL;
3709 
3710 	for_each_pci_dev(pdev)
3711 		if (pdev->external_facing) {
3712 			pci_dev_put(pdev);
3713 			return true;
3714 		}
3715 
3716 	return false;
3717 }
3718 
3719 static int __init platform_optin_force_iommu(void)
3720 {
3721 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3722 		return 0;
3723 
3724 	if (no_iommu || dmar_disabled)
3725 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3726 
3727 	/*
3728 	 * If the Intel IOMMU is disabled by default, we will apply an identity
3729 	 * map for all devices except those marked as untrusted.
3730 	 */
3731 	if (dmar_disabled)
3732 		iommu_set_default_passthrough(false);
3733 
3734 	dmar_disabled = 0;
3735 	no_iommu = 0;
3736 
3737 	return 1;
3738 }
3739 
3740 static int __init probe_acpi_namespace_devices(void)
3741 {
3742 	struct dmar_drhd_unit *drhd;
3743 	/* To avoid a -Wunused-but-set-variable warning. */
3744 	struct intel_iommu *iommu __maybe_unused;
3745 	struct device *dev;
3746 	int i, ret = 0;
3747 
3748 	for_each_active_iommu(iommu, drhd) {
3749 		for_each_active_dev_scope(drhd->devices,
3750 					  drhd->devices_cnt, i, dev) {
3751 			struct acpi_device_physical_node *pn;
3752 			struct acpi_device *adev;
3753 
3754 			if (dev->bus != &acpi_bus_type)
3755 				continue;
3756 
3757 			adev = to_acpi_device(dev);
3758 			mutex_lock(&adev->physical_node_lock);
3759 			list_for_each_entry(pn,
3760 					    &adev->physical_node_list, node) {
3761 				ret = iommu_probe_device(pn->dev);
3762 				if (ret)
3763 					break;
3764 			}
3765 			mutex_unlock(&adev->physical_node_lock);
3766 
3767 			if (ret)
3768 				return ret;
3769 		}
3770 	}
3771 
3772 	return 0;
3773 }
3774 
3775 static __init int tboot_force_iommu(void)
3776 {
3777 	if (!tboot_enabled())
3778 		return 0;
3779 
3780 	if (no_iommu || dmar_disabled)
3781 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3782 
3783 	dmar_disabled = 0;
3784 	no_iommu = 0;
3785 
3786 	return 1;
3787 }
3788 
3789 int __init intel_iommu_init(void)
3790 {
3791 	int ret = -ENODEV;
3792 	struct dmar_drhd_unit *drhd;
3793 	struct intel_iommu *iommu;
3794 
3795 	/*
3796 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3797 	 * opt in, so enforce that.
3798 	 */
3799 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3800 		    platform_optin_force_iommu();
3801 
3802 	down_write(&dmar_global_lock);
3803 	if (dmar_table_init()) {
3804 		if (force_on)
3805 			panic("tboot: Failed to initialize DMAR table\n");
3806 		goto out_free_dmar;
3807 	}
3808 
3809 	if (dmar_dev_scope_init() < 0) {
3810 		if (force_on)
3811 			panic("tboot: Failed to initialize DMAR device scope\n");
3812 		goto out_free_dmar;
3813 	}
3814 
3815 	up_write(&dmar_global_lock);
3816 
3817 	/*
3818 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3819 	 * complain later when we register it under the lock.
3820 	 */
3821 	dmar_register_bus_notifier();
3822 
3823 	down_write(&dmar_global_lock);
3824 
3825 	if (!no_iommu)
3826 		intel_iommu_debugfs_init();
3827 
3828 	if (no_iommu || dmar_disabled) {
3829 		/*
3830 		 * We exit the function here to ensure the IOMMU's remapping and
3831 		 * mempool aren't set up, which means the IOMMU's PMRs won't be
3832 		 * disabled via the call to init_dmars(). So disable them
3833 		 * explicitly here. The PMRs were set up by tboot prior to
3834 		 * calling SENTER, but the kernel is expected to reset/tear
3835 		 * down the PMRs.
3836 		 */
3837 		if (intel_iommu_tboot_noforce) {
3838 			for_each_iommu(iommu, drhd)
3839 				iommu_disable_protect_mem_regions(iommu);
3840 		}
3841 
3842 		/*
3843 		 * Make sure the IOMMUs are switched off, even when we
3844 		 * boot into a kexec kernel and the previous kernel left
3845 		 * them enabled
3846 		 */
3847 		intel_disable_iommus();
3848 		goto out_free_dmar;
3849 	}
3850 
3851 	if (list_empty(&dmar_rmrr_units))
3852 		pr_info("No RMRR found\n");
3853 
3854 	if (list_empty(&dmar_atsr_units))
3855 		pr_info("No ATSR found\n");
3856 
3857 	if (list_empty(&dmar_satc_units))
3858 		pr_info("No SATC found\n");
3859 
3860 	init_no_remapping_devices();
3861 
3862 	ret = init_dmars();
3863 	if (ret) {
3864 		if (force_on)
3865 			panic("tboot: Failed to initialize DMARs\n");
3866 		pr_err("Initialization failed\n");
3867 		goto out_free_dmar;
3868 	}
3869 	up_write(&dmar_global_lock);
3870 
3871 	init_iommu_pm_ops();
3872 
3873 	down_read(&dmar_global_lock);
3874 	for_each_active_iommu(iommu, drhd) {
3875 		/*
3876 		 * The flush queue implementation does not perform
3877 		 * page-selective invalidations that are required for efficient
3878 		 * TLB flushes in virtual environments.  The benefit of batching
3879 		 * is likely to be much lower than the overhead of synchronizing
3880 		 * the virtual and physical IOMMU page-tables.
3881 		 */
3882 		if (cap_caching_mode(iommu->cap) &&
3883 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3884 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3885 			iommu_set_dma_strict();
3886 		}
3887 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3888 				       intel_iommu_groups,
3889 				       "%s", iommu->name);
3890 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3891 
3892 		iommu_pmu_register(iommu);
3893 	}
3894 	up_read(&dmar_global_lock);
3895 
3896 	if (si_domain && !hw_pass_through)
3897 		register_memory_notifier(&intel_iommu_memory_nb);
3898 
3899 	down_read(&dmar_global_lock);
3900 	if (probe_acpi_namespace_devices())
3901 		pr_warn("ACPI name space devices didn't probe correctly\n");
3902 
3903 	/* Finally, we enable the DMA remapping hardware. */
3904 	for_each_iommu(iommu, drhd) {
3905 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3906 			iommu_enable_translation(iommu);
3907 
3908 		iommu_disable_protect_mem_regions(iommu);
3909 	}
3910 	up_read(&dmar_global_lock);
3911 
3912 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3913 
3914 	intel_iommu_enabled = 1;
3915 
3916 	return 0;
3917 
3918 out_free_dmar:
3919 	intel_iommu_free_dmars();
3920 	up_write(&dmar_global_lock);
3921 	return ret;
3922 }
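
/*
 * Editor's note (illustrative): when initialization succeeds, the pr_info()
 * calls above typically show up in dmesg with the driver's "DMAR: " prefix,
 * for example:
 *
 *	DMAR: No ATSR found
 *	DMAR: No SATC found
 *	DMAR: Intel(R) Virtualization Technology for Directed I/O
 *
 * The exact set of messages depends on the ACPI DMAR tables provided by the
 * firmware and on the kernel command line.
 */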
3923 
3924 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3925 {
3926 	struct device_domain_info *info = opaque;
3927 
3928 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3929 	return 0;
3930 }
3931 
3932 /*
3933  * NB - intel-iommu lacks any sort of reference counting for the users of
3934  * dependent devices.  If multiple endpoints have intersecting dependent
3935  * devices, unbinding the driver from any one of them will possibly leave
3936  * the others unable to operate.
3937  */
3938 static void domain_context_clear(struct device_domain_info *info)
3939 {
3940 	if (!dev_is_pci(info->dev)) {
3941 		domain_context_clear_one(info, info->bus, info->devfn);
3942 		return;
3943 	}
3944 
3945 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3946 			       &domain_context_clear_one_cb, info);
3947 }
3948 
3949 static void dmar_remove_one_dev_info(struct device *dev)
3950 {
3951 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3952 	struct dmar_domain *domain = info->domain;
3953 	struct intel_iommu *iommu = info->iommu;
3954 	unsigned long flags;
3955 
3956 	if (!dev_is_real_dma_subdevice(info->dev)) {
3957 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3958 			intel_pasid_tear_down_entry(iommu, info->dev,
3959 					IOMMU_NO_PASID, false);
3960 
3961 		iommu_disable_pci_caps(info);
3962 		domain_context_clear(info);
3963 	}
3964 
3965 	spin_lock_irqsave(&domain->lock, flags);
3966 	list_del(&info->link);
3967 	spin_unlock_irqrestore(&domain->lock, flags);
3968 
3969 	domain_detach_iommu(domain, iommu);
3970 	info->domain = NULL;
3971 }
3972 
3973 /*
3974  * Clear the page table pointer in context or pasid table entries so that
3975  * all DMA requests without PASID from the device are blocked. If the page
3976  * table has been set, clean up the data structures.
3977  */
3978 static void device_block_translation(struct device *dev)
3979 {
3980 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3981 	struct intel_iommu *iommu = info->iommu;
3982 	unsigned long flags;
3983 
3984 	iommu_disable_pci_caps(info);
3985 	if (!dev_is_real_dma_subdevice(dev)) {
3986 		if (sm_supported(iommu))
3987 			intel_pasid_tear_down_entry(iommu, dev,
3988 						    IOMMU_NO_PASID, false);
3989 		else
3990 			domain_context_clear(info);
3991 	}
3992 
3993 	if (!info->domain)
3994 		return;
3995 
3996 	spin_lock_irqsave(&info->domain->lock, flags);
3997 	list_del(&info->link);
3998 	spin_unlock_irqrestore(&info->domain->lock, flags);
3999 
4000 	domain_detach_iommu(info->domain, iommu);
4001 	info->domain = NULL;
4002 }
4003 
4004 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4005 {
4006 	int adjust_width;
4007 
4008 	/* calculate AGAW */
4009 	domain->gaw = guest_width;
4010 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4011 	domain->agaw = width_to_agaw(adjust_width);
4012 
4013 	domain->iommu_coherency = false;
4014 	domain->iommu_superpage = 0;
4015 	domain->max_addr = 0;
4016 
4017 	/* always allocate the top pgd */
4018 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4019 	if (!domain->pgd)
4020 		return -ENOMEM;
4021 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4022 	return 0;
4023 }
4024 
4025 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4026 				      struct device *dev)
4027 {
4028 	device_block_translation(dev);
4029 	return 0;
4030 }
4031 
4032 static struct iommu_domain blocking_domain = {
4033 	.ops = &(const struct iommu_domain_ops) {
4034 		.attach_dev	= blocking_domain_attach_dev,
4035 		.free		= intel_iommu_domain_free
4036 	}
4037 };
4038 
4039 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4040 {
4041 	struct dmar_domain *dmar_domain;
4042 	struct iommu_domain *domain;
4043 
4044 	switch (type) {
4045 	case IOMMU_DOMAIN_BLOCKED:
4046 		return &blocking_domain;
4047 	case IOMMU_DOMAIN_DMA:
4048 	case IOMMU_DOMAIN_UNMANAGED:
4049 		dmar_domain = alloc_domain(type);
4050 		if (!dmar_domain) {
4051 			pr_err("Can't allocate dmar_domain\n");
4052 			return NULL;
4053 		}
4054 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4055 			pr_err("Domain initialization failed\n");
4056 			domain_exit(dmar_domain);
4057 			return NULL;
4058 		}
4059 
4060 		domain = &dmar_domain->domain;
4061 		domain->geometry.aperture_start = 0;
4062 		domain->geometry.aperture_end   =
4063 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4064 		domain->geometry.force_aperture = true;
4065 
4066 		return domain;
4067 	case IOMMU_DOMAIN_IDENTITY:
4068 		return &si_domain->domain;
4069 	case IOMMU_DOMAIN_SVA:
4070 		return intel_svm_domain_alloc();
4071 	default:
4072 		return NULL;
4073 	}
4074 
4075 	return NULL;
4076 }
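
/*
 * Editor's sketch (assumed caller, not part of this file): an unmanaged
 * domain normally comes from the generic IOMMU API, which ends up in the
 * IOMMU_DOMAIN_UNMANAGED case above, e.g.:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	...
 *	iommu_domain_free(dom);
 *
 * dom->geometry.aperture_end then reflects __DOMAIN_MAX_ADDR(dmar_domain->gaw)
 * as set above. The identity and blocking domains returned for
 * IOMMU_DOMAIN_IDENTITY/IOMMU_DOMAIN_BLOCKED are shared singletons, which is
 * why intel_iommu_domain_free() below never frees them.
 */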
4077 
4078 static void intel_iommu_domain_free(struct iommu_domain *domain)
4079 {
4080 	if (domain != &si_domain->domain && domain != &blocking_domain)
4081 		domain_exit(to_dmar_domain(domain));
4082 }
4083 
4084 static int prepare_domain_attach_device(struct iommu_domain *domain,
4085 					struct device *dev)
4086 {
4087 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4088 	struct intel_iommu *iommu;
4089 	int addr_width;
4090 
4091 	iommu = device_to_iommu(dev, NULL, NULL);
4092 	if (!iommu)
4093 		return -ENODEV;
4094 
4095 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4096 		return -EINVAL;
4097 
4098 	/* check if this iommu agaw is sufficient for max mapped address */
4099 	addr_width = agaw_to_width(iommu->agaw);
4100 	if (addr_width > cap_mgaw(iommu->cap))
4101 		addr_width = cap_mgaw(iommu->cap);
4102 
4103 	if (dmar_domain->max_addr > (1LL << addr_width))
4104 		return -EINVAL;
4105 	dmar_domain->gaw = addr_width;
4106 
4107 	/*
4108 	 * Knock out extra levels of page tables if necessary
4109 	 */
4110 	while (iommu->agaw < dmar_domain->agaw) {
4111 		struct dma_pte *pte;
4112 
4113 		pte = dmar_domain->pgd;
4114 		if (dma_pte_present(pte)) {
4115 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4116 			free_pgtable_page(pte);
4117 		}
4118 		dmar_domain->agaw--;
4119 	}
4120 
4121 	return 0;
4122 }
4123 
4124 static int intel_iommu_attach_device(struct iommu_domain *domain,
4125 				     struct device *dev)
4126 {
4127 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4128 	int ret;
4129 
4130 	if (info->domain)
4131 		device_block_translation(dev);
4132 
4133 	ret = prepare_domain_attach_device(domain, dev);
4134 	if (ret)
4135 		return ret;
4136 
4137 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4138 }
4139 
4140 static int intel_iommu_map(struct iommu_domain *domain,
4141 			   unsigned long iova, phys_addr_t hpa,
4142 			   size_t size, int iommu_prot, gfp_t gfp)
4143 {
4144 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4145 	u64 max_addr;
4146 	int prot = 0;
4147 
4148 	if (iommu_prot & IOMMU_READ)
4149 		prot |= DMA_PTE_READ;
4150 	if (iommu_prot & IOMMU_WRITE)
4151 		prot |= DMA_PTE_WRITE;
4152 	if (dmar_domain->set_pte_snp)
4153 		prot |= DMA_PTE_SNP;
4154 
4155 	max_addr = iova + size;
4156 	if (dmar_domain->max_addr < max_addr) {
4157 		u64 end;
4158 
4159 		/* check if minimum agaw is sufficient for mapped address */
4160 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4161 		if (end < max_addr) {
4162 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4163 			       __func__, dmar_domain->gaw,
4164 			       max_addr);
4165 			return -EFAULT;
4166 		}
4167 		dmar_domain->max_addr = max_addr;
4168 	}
4169 	/* Round up size to next multiple of PAGE_SIZE, if it and
4170 	   the low bits of hpa would take us onto the next page */
4171 	size = aligned_nrpages(hpa, size);
4172 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4173 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4174 }
4175 
4176 static int intel_iommu_map_pages(struct iommu_domain *domain,
4177 				 unsigned long iova, phys_addr_t paddr,
4178 				 size_t pgsize, size_t pgcount,
4179 				 int prot, gfp_t gfp, size_t *mapped)
4180 {
4181 	unsigned long pgshift = __ffs(pgsize);
4182 	size_t size = pgcount << pgshift;
4183 	int ret;
4184 
4185 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4186 		return -EINVAL;
4187 
4188 	if (!IS_ALIGNED(iova | paddr, pgsize))
4189 		return -EINVAL;
4190 
4191 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4192 	if (!ret && mapped)
4193 		*mapped = size;
4194 
4195 	return ret;
4196 }
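
/*
 * Editor's sketch (assumed caller): this path is reached through the generic
 * iommu_map() interface. Only 4K, 2M and 1G page sizes are accepted here,
 * and iova/paddr must be aligned to the page size, e.g.:
 *
 *	int ret = iommu_map(dom, iova, paddr, SZ_2M,
 *			    IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
 *
 * Since intel_iommu_ops below advertises pgsize_bitmap = SZ_4K, the core
 * hands such a request to intel_iommu_map_pages() as a run of 4K pages;
 * __domain_mapping() may still build superpage PTEs when the hardware and
 * the alignment allow it.
 */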
4197 
4198 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4199 				unsigned long iova, size_t size,
4200 				struct iommu_iotlb_gather *gather)
4201 {
4202 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 	unsigned long start_pfn, last_pfn;
4204 	int level = 0;
4205 
4206 	/* Cope with horrid API which requires us to unmap more than the
4207 	   size argument if it happens to be a large-page mapping. */
4208 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4209 				     &level, GFP_ATOMIC)))
4210 		return 0;
4211 
4212 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4213 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4214 
4215 	start_pfn = iova >> VTD_PAGE_SHIFT;
4216 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4217 
4218 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4219 
4220 	if (dmar_domain->max_addr == iova + size)
4221 		dmar_domain->max_addr = iova;
4222 
4223 	/*
4224 	 * We do not use page-selective IOTLB invalidation in the flush
4225 	 * queue, so there is no need to track pages and sync the iotlb.
4226 	 */
4227 	if (!iommu_iotlb_gather_queued(gather))
4228 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4229 
4230 	return size;
4231 }
4232 
4233 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4234 				      unsigned long iova,
4235 				      size_t pgsize, size_t pgcount,
4236 				      struct iommu_iotlb_gather *gather)
4237 {
4238 	unsigned long pgshift = __ffs(pgsize);
4239 	size_t size = pgcount << pgshift;
4240 
4241 	return intel_iommu_unmap(domain, iova, size, gather);
4242 }
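
/*
 * Editor's sketch (assumed caller): unmapping mirrors the map path and goes
 * through the generic helper, e.g.:
 *
 *	size_t unmapped = iommu_unmap(dom, iova, SZ_2M);
 *
 * The core collects the affected range in an iotlb_gather that is flushed in
 * intel_iommu_tlb_sync() below. Per the comment in intel_iommu_unmap(), a
 * request that lands inside a large-page mapping is rounded up to cover the
 * whole superpage.
 */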
4243 
4244 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4245 				 struct iommu_iotlb_gather *gather)
4246 {
4247 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4248 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4249 	size_t size = gather->end - gather->start;
4250 	struct iommu_domain_info *info;
4251 	unsigned long start_pfn;
4252 	unsigned long nrpages;
4253 	unsigned long i;
4254 
4255 	nrpages = aligned_nrpages(gather->start, size);
4256 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4257 
4258 	xa_for_each(&dmar_domain->iommu_array, i, info)
4259 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4260 				      start_pfn, nrpages,
4261 				      list_empty(&gather->freelist), 0);
4262 
4263 	put_pages_list(&gather->freelist);
4264 }
4265 
4266 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4267 					    dma_addr_t iova)
4268 {
4269 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4270 	struct dma_pte *pte;
4271 	int level = 0;
4272 	u64 phys = 0;
4273 
4274 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4275 			     GFP_ATOMIC);
4276 	if (pte && dma_pte_present(pte))
4277 		phys = dma_pte_addr(pte) +
4278 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4279 						VTD_PAGE_SHIFT) - 1));
4280 
4281 	return phys;
4282 }
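
/*
 * Editor's sketch (assumed caller): this lookup backs the generic helper,
 * e.g.:
 *
 *	phys_addr_t pa = iommu_iova_to_phys(dom, iova);
 *
 * A return value of 0 means no present PTE covers the IOVA.
 */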
4283 
4284 static bool domain_support_force_snooping(struct dmar_domain *domain)
4285 {
4286 	struct device_domain_info *info;
4287 	bool support = true;
4288 
4289 	assert_spin_locked(&domain->lock);
4290 	list_for_each_entry(info, &domain->devices, link) {
4291 		if (!ecap_sc_support(info->iommu->ecap)) {
4292 			support = false;
4293 			break;
4294 		}
4295 	}
4296 
4297 	return support;
4298 }
4299 
4300 static void domain_set_force_snooping(struct dmar_domain *domain)
4301 {
4302 	struct device_domain_info *info;
4303 
4304 	assert_spin_locked(&domain->lock);
4305 	/*
4306 	 * Second level page table supports per-PTE snoop control. The
4307 	 * iommu_map() interface will handle this by setting SNP bit.
4308 	 */
4309 	if (!domain->use_first_level) {
4310 		domain->set_pte_snp = true;
4311 		return;
4312 	}
4313 
4314 	list_for_each_entry(info, &domain->devices, link)
4315 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4316 						     IOMMU_NO_PASID);
4317 }
4318 
4319 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4320 {
4321 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4322 	unsigned long flags;
4323 
4324 	if (dmar_domain->force_snooping)
4325 		return true;
4326 
4327 	spin_lock_irqsave(&dmar_domain->lock, flags);
4328 	if (!domain_support_force_snooping(dmar_domain) ||
4329 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4330 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4331 		return false;
4332 	}
4333 
4334 	domain_set_force_snooping(dmar_domain);
4335 	dmar_domain->force_snooping = true;
4336 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4337 
4338 	return true;
4339 }
4340 
4341 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4342 {
4343 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4344 
4345 	switch (cap) {
4346 	case IOMMU_CAP_CACHE_COHERENCY:
4347 	case IOMMU_CAP_DEFERRED_FLUSH:
4348 		return true;
4349 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4350 		return dmar_platform_optin();
4351 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4352 		return ecap_sc_support(info->iommu->ecap);
4353 	default:
4354 		return false;
4355 	}
4356 }
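
/*
 * Editor's sketch (assumed caller): capabilities are queried through the
 * generic helper, e.g. a VFIO-style check before relying on
 * enforce_cache_coherency():
 *
 *	if (device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY))
 *		pr_debug("snoop control available\n");
 */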
4357 
4358 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4359 {
4360 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4361 	struct device_domain_info *info;
4362 	struct intel_iommu *iommu;
4363 	u8 bus, devfn;
4364 	int ret;
4365 
4366 	iommu = device_to_iommu(dev, &bus, &devfn);
4367 	if (!iommu || !iommu->iommu.ops)
4368 		return ERR_PTR(-ENODEV);
4369 
4370 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4371 	if (!info)
4372 		return ERR_PTR(-ENOMEM);
4373 
4374 	if (dev_is_real_dma_subdevice(dev)) {
4375 		info->bus = pdev->bus->number;
4376 		info->devfn = pdev->devfn;
4377 		info->segment = pci_domain_nr(pdev->bus);
4378 	} else {
4379 		info->bus = bus;
4380 		info->devfn = devfn;
4381 		info->segment = iommu->segment;
4382 	}
4383 
4384 	info->dev = dev;
4385 	info->iommu = iommu;
4386 	if (dev_is_pci(dev)) {
4387 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4388 		    pci_ats_supported(pdev) &&
4389 		    dmar_ats_supported(pdev, iommu)) {
4390 			info->ats_supported = 1;
4391 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4392 
4393 			/*
4394 			 * For an IOMMU that supports device IOTLB throttling
4395 			 * (DIT), we assign a PFSID to the invalidation desc
4396 			 * of a VF so that the IOMMU HW can gauge queue depth
4397 			 * at the PF level. If DIT is not set, the PFSID is
4398 			 * treated as reserved and should be set to 0.
4399 			 */
4400 			if (ecap_dit(iommu->ecap))
4401 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4402 			info->ats_qdep = pci_ats_queue_depth(pdev);
4403 		}
4404 		if (sm_supported(iommu)) {
4405 			if (pasid_supported(iommu)) {
4406 				int features = pci_pasid_features(pdev);
4407 
4408 				if (features >= 0)
4409 					info->pasid_supported = features | 1;
4410 			}
4411 
4412 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4413 			    pci_pri_supported(pdev))
4414 				info->pri_supported = 1;
4415 		}
4416 	}
4417 
4418 	dev_iommu_priv_set(dev, info);
4419 
4420 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4421 		ret = intel_pasid_alloc_table(dev);
4422 		if (ret) {
4423 			dev_err(dev, "PASID table allocation failed\n");
4424 			dev_iommu_priv_set(dev, NULL);
4425 			kfree(info);
4426 			return ERR_PTR(ret);
4427 		}
4428 	}
4429 
4430 	return &iommu->iommu;
4431 }
4432 
4433 static void intel_iommu_release_device(struct device *dev)
4434 {
4435 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4436 
4437 	dmar_remove_one_dev_info(dev);
4438 	intel_pasid_free_table(dev);
4439 	dev_iommu_priv_set(dev, NULL);
4440 	kfree(info);
4441 	set_dma_ops(dev, NULL);
4442 }
4443 
4444 static void intel_iommu_probe_finalize(struct device *dev)
4445 {
4446 	set_dma_ops(dev, NULL);
4447 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4448 }
4449 
4450 static void intel_iommu_get_resv_regions(struct device *device,
4451 					 struct list_head *head)
4452 {
4453 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4454 	struct iommu_resv_region *reg;
4455 	struct dmar_rmrr_unit *rmrr;
4456 	struct device *i_dev;
4457 	int i;
4458 
4459 	rcu_read_lock();
4460 	for_each_rmrr_units(rmrr) {
4461 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4462 					  i, i_dev) {
4463 			struct iommu_resv_region *resv;
4464 			enum iommu_resv_type type;
4465 			size_t length;
4466 
4467 			if (i_dev != device &&
4468 			    !is_downstream_to_pci_bridge(device, i_dev))
4469 				continue;
4470 
4471 			length = rmrr->end_address - rmrr->base_address + 1;
4472 
4473 			type = device_rmrr_is_relaxable(device) ?
4474 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4475 
4476 			resv = iommu_alloc_resv_region(rmrr->base_address,
4477 						       length, prot, type,
4478 						       GFP_ATOMIC);
4479 			if (!resv)
4480 				break;
4481 
4482 			list_add_tail(&resv->list, head);
4483 		}
4484 	}
4485 	rcu_read_unlock();
4486 
4487 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4488 	if (dev_is_pci(device)) {
4489 		struct pci_dev *pdev = to_pci_dev(device);
4490 
4491 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4492 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4493 					IOMMU_RESV_DIRECT_RELAXABLE,
4494 					GFP_KERNEL);
4495 			if (reg)
4496 				list_add_tail(&reg->list, head);
4497 		}
4498 	}
4499 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4500 
4501 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4502 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4503 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4504 	if (!reg)
4505 		return;
4506 	list_add_tail(&reg->list, head);
4507 }
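
/*
 * Editor's note (illustrative): the regions built here are what user space
 * sees in /sys/kernel/iommu_groups/<N>/reserved_regions, typically one MSI
 * entry for the IOAPIC range plus a direct (or direct-relaxable) entry per
 * matching RMRR, e.g. a line along the lines of:
 *
 *	0x00000000fee00000 0x00000000feefffff msi
 *
 * The exact formatting is produced by the IOMMU core, not by this driver.
 */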
4508 
4509 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4510 {
4511 	if (dev_is_pci(dev))
4512 		return pci_device_group(dev);
4513 	return generic_device_group(dev);
4514 }
4515 
4516 static int intel_iommu_enable_sva(struct device *dev)
4517 {
4518 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4519 	struct intel_iommu *iommu;
4520 
4521 	if (!info || dmar_disabled)
4522 		return -EINVAL;
4523 
4524 	iommu = info->iommu;
4525 	if (!iommu)
4526 		return -EINVAL;
4527 
4528 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4529 		return -ENODEV;
4530 
4531 	if (!info->pasid_enabled || !info->ats_enabled)
4532 		return -EINVAL;
4533 
4534 	/*
4535 	 * Devices with device-specific I/O fault handling should not
4536 	 * advertise PCI/PRI. The IOMMU side has no means to check for
4537 	 * device-specific IOPF capability, so it can only assume that if
4538 	 * the device driver enables SVA on a non-PRI device, the driver
4539 	 * will handle IOPF in its own way.
4540 	 */
4541 	if (!info->pri_supported)
4542 		return 0;
4543 
4544 	/* Devices supporting PRI should have it enabled. */
4545 	if (!info->pri_enabled)
4546 		return -EINVAL;
4547 
4548 	return 0;
4549 }
4550 
4551 static int intel_iommu_enable_iopf(struct device *dev)
4552 {
4553 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4554 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4555 	struct intel_iommu *iommu;
4556 	int ret;
4557 
4558 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4559 		return -ENODEV;
4560 
4561 	if (info->pri_enabled)
4562 		return -EBUSY;
4563 
4564 	iommu = info->iommu;
4565 	if (!iommu)
4566 		return -EINVAL;
4567 
4568 	/* PASID is required in PRG Response Message. */
4569 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4570 		return -EINVAL;
4571 
4572 	ret = pci_reset_pri(pdev);
4573 	if (ret)
4574 		return ret;
4575 
4576 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4577 	if (ret)
4578 		return ret;
4579 
4580 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4581 	if (ret)
4582 		goto iopf_remove_device;
4583 
4584 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4585 	if (ret)
4586 		goto iopf_unregister_handler;
4587 	info->pri_enabled = 1;
4588 
4589 	return 0;
4590 
4591 iopf_unregister_handler:
4592 	iommu_unregister_device_fault_handler(dev);
4593 iopf_remove_device:
4594 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4595 
4596 	return ret;
4597 }
4598 
4599 static int intel_iommu_disable_iopf(struct device *dev)
4600 {
4601 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4602 	struct intel_iommu *iommu = info->iommu;
4603 
4604 	if (!info->pri_enabled)
4605 		return -EINVAL;
4606 
4607 	/*
4608 	 * The PCIe spec states that after the PRI enable bit is cleared,
4609 	 * the Page Request Interface will not issue new page requests, but
4610 	 * it may still have outstanding page requests that have been
4611 	 * transmitted or are queued for transmission. This is supposed to
4612 	 * be called after the device driver has stopped DMA, all PASIDs
4613 	 * have been unbound and the outstanding PRQs have been drained.
4614 	 */
4615 	pci_disable_pri(to_pci_dev(dev));
4616 	info->pri_enabled = 0;
4617 
4618 	/*
4619 	 * With PRI disabled and outstanding PRQs drained, unregistering
4620 	 * fault handler and removing device from iopf queue should never
4621 	 * fail.
4622 	 */
4623 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4624 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4625 
4626 	return 0;
4627 }
4628 
4629 static int
4630 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4631 {
4632 	switch (feat) {
4633 	case IOMMU_DEV_FEAT_IOPF:
4634 		return intel_iommu_enable_iopf(dev);
4635 
4636 	case IOMMU_DEV_FEAT_SVA:
4637 		return intel_iommu_enable_sva(dev);
4638 
4639 	default:
4640 		return -ENODEV;
4641 	}
4642 }
4643 
4644 static int
4645 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4646 {
4647 	switch (feat) {
4648 	case IOMMU_DEV_FEAT_IOPF:
4649 		return intel_iommu_disable_iopf(dev);
4650 
4651 	case IOMMU_DEV_FEAT_SVA:
4652 		return 0;
4653 
4654 	default:
4655 		return -ENODEV;
4656 	}
4657 }
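
/*
 * Editor's sketch (assumed caller, hedged): a device driver that wants SVA
 * typically goes through the generic feature API, which lands in the two
 * helpers above, roughly:
 *
 *	if (!iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF) &&
 *	    !iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA))
 *		handle = iommu_sva_bind_device(dev, current->mm);
 *
 * Enabling IOPF first matters for PRI-capable devices, because
 * intel_iommu_enable_sva() refuses to proceed until PRI has been enabled.
 */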
4658 
4659 static bool intel_iommu_is_attach_deferred(struct device *dev)
4660 {
4661 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4662 
4663 	return translation_pre_enabled(info->iommu) && !info->domain;
4664 }
4665 
4666 /*
4667  * Check that the device does not live on an external facing PCI port that is
4668  * marked as untrusted. Such devices should not be able to apply quirks and
4669  * thus not be able to bypass the IOMMU restrictions.
4670  */
4671 static bool risky_device(struct pci_dev *pdev)
4672 {
4673 	if (pdev->untrusted) {
4674 		pci_info(pdev,
4675 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4676 			 pdev->vendor, pdev->device);
4677 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4678 		return true;
4679 	}
4680 	return false;
4681 }
4682 
4683 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4684 				       unsigned long iova, size_t size)
4685 {
4686 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4687 	unsigned long pages = aligned_nrpages(iova, size);
4688 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4689 	struct iommu_domain_info *info;
4690 	unsigned long i;
4691 
4692 	xa_for_each(&dmar_domain->iommu_array, i, info)
4693 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4694 }
4695 
4696 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4697 {
4698 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4699 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4700 	struct dmar_domain *dmar_domain;
4701 	struct iommu_domain *domain;
4702 	unsigned long flags;
4703 
4704 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4705 	if (WARN_ON_ONCE(!domain))
4706 		goto out_tear_down;
4707 
4708 	/*
4709 	 * The SVA implementation needs to handle its own work, such as mm
4710 	 * notifications. Until that code is consolidated into the iommu
4711 	 * core, let the Intel SVA code handle it.
4712 	 */
4713 	if (domain->type == IOMMU_DOMAIN_SVA) {
4714 		intel_svm_remove_dev_pasid(dev, pasid);
4715 		goto out_tear_down;
4716 	}
4717 
4718 	dmar_domain = to_dmar_domain(domain);
4719 	spin_lock_irqsave(&dmar_domain->lock, flags);
4720 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4721 		if (curr->dev == dev && curr->pasid == pasid) {
4722 			list_del(&curr->link_domain);
4723 			dev_pasid = curr;
4724 			break;
4725 		}
4726 	}
4727 	WARN_ON_ONCE(!dev_pasid);
4728 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4729 
4730 	domain_detach_iommu(dmar_domain, iommu);
4731 	kfree(dev_pasid);
4732 out_tear_down:
4733 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4734 	intel_drain_pasid_prq(dev, pasid);
4735 }
4736 
4737 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4738 				     struct device *dev, ioasid_t pasid)
4739 {
4740 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4741 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4742 	struct intel_iommu *iommu = info->iommu;
4743 	struct dev_pasid_info *dev_pasid;
4744 	unsigned long flags;
4745 	int ret;
4746 
4747 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4748 		return -EOPNOTSUPP;
4749 
4750 	if (context_copied(iommu, info->bus, info->devfn))
4751 		return -EBUSY;
4752 
4753 	ret = prepare_domain_attach_device(domain, dev);
4754 	if (ret)
4755 		return ret;
4756 
4757 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4758 	if (!dev_pasid)
4759 		return -ENOMEM;
4760 
4761 	ret = domain_attach_iommu(dmar_domain, iommu);
4762 	if (ret)
4763 		goto out_free;
4764 
4765 	if (domain_type_is_si(dmar_domain))
4766 		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4767 						     dev, pasid);
4768 	else if (dmar_domain->use_first_level)
4769 		ret = domain_setup_first_level(iommu, dmar_domain,
4770 					       dev, pasid);
4771 	else
4772 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4773 						     dev, pasid);
4774 	if (ret)
4775 		goto out_detach_iommu;
4776 
4777 	dev_pasid->dev = dev;
4778 	dev_pasid->pasid = pasid;
4779 	spin_lock_irqsave(&dmar_domain->lock, flags);
4780 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4781 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4782 
4783 	return 0;
4784 out_detach_iommu:
4785 	domain_detach_iommu(dmar_domain, iommu);
4786 out_free:
4787 	kfree(dev_pasid);
4788 	return ret;
4789 }
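
/*
 * Editor's sketch (assumed caller): set_dev_pasid/remove_dev_pasid are not
 * called directly; they back the generic PASID attach API, roughly:
 *
 *	ret = iommu_attach_device_pasid(domain, dev, pasid);
 *	...
 *	iommu_detach_device_pasid(domain, dev, pasid);
 *
 * where the detach side ends up in intel_iommu_remove_dev_pasid() above.
 */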
4790 
4791 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4792 {
4793 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4794 	struct intel_iommu *iommu = info->iommu;
4795 	struct iommu_hw_info_vtd *vtd;
4796 
4797 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4798 	if (!vtd)
4799 		return ERR_PTR(-ENOMEM);
4800 
4801 	vtd->cap_reg = iommu->cap;
4802 	vtd->ecap_reg = iommu->ecap;
4803 	*length = sizeof(*vtd);
4804 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4805 	return vtd;
4806 }
4807 
4808 const struct iommu_ops intel_iommu_ops = {
4809 	.capable		= intel_iommu_capable,
4810 	.hw_info		= intel_iommu_hw_info,
4811 	.domain_alloc		= intel_iommu_domain_alloc,
4812 	.probe_device		= intel_iommu_probe_device,
4813 	.probe_finalize		= intel_iommu_probe_finalize,
4814 	.release_device		= intel_iommu_release_device,
4815 	.get_resv_regions	= intel_iommu_get_resv_regions,
4816 	.device_group		= intel_iommu_device_group,
4817 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4818 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4819 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4820 	.def_domain_type	= device_def_domain_type,
4821 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4822 	.pgsize_bitmap		= SZ_4K,
4823 #ifdef CONFIG_INTEL_IOMMU_SVM
4824 	.page_response		= intel_svm_page_response,
4825 #endif
4826 	.default_domain_ops = &(const struct iommu_domain_ops) {
4827 		.attach_dev		= intel_iommu_attach_device,
4828 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4829 		.map_pages		= intel_iommu_map_pages,
4830 		.unmap_pages		= intel_iommu_unmap_pages,
4831 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4832 		.flush_iotlb_all        = intel_flush_iotlb_all,
4833 		.iotlb_sync		= intel_iommu_tlb_sync,
4834 		.iova_to_phys		= intel_iommu_iova_to_phys,
4835 		.free			= intel_iommu_domain_free,
4836 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4837 	}
4838 };
4839 
4840 static void quirk_iommu_igfx(struct pci_dev *dev)
4841 {
4842 	if (risky_device(dev))
4843 		return;
4844 
4845 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4846 	dmar_map_gfx = 0;
4847 }
4848 
4849 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4857 
4858 /* Broadwell igfx malfunctions with dmar */
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4881 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4883 
4884 static void quirk_iommu_rwbf(struct pci_dev *dev)
4885 {
4886 	if (risky_device(dev))
4887 		return;
4888 
4889 	/*
4890 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4891 	 * but needs it. Same seems to hold for the desktop versions.
4892 	 */
4893 	pci_info(dev, "Forcing write-buffer flush capability\n");
4894 	rwbf_quirk = 1;
4895 }
4896 
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4904 
4905 #define GGC 0x52
4906 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4907 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4908 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4909 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4910 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4911 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4912 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4913 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4914 
4915 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4916 {
4917 	unsigned short ggc;
4918 
4919 	if (risky_device(dev))
4920 		return;
4921 
4922 	if (pci_read_config_word(dev, GGC, &ggc))
4923 		return;
4924 
4925 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4926 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4927 		dmar_map_gfx = 0;
4928 	} else if (dmar_map_gfx) {
4929 		/* we have to ensure the gfx device is idle before we flush */
4930 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4931 		iommu_set_dma_strict();
4932 	}
4933 }
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4938 
4939 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4940 {
4941 	unsigned short ver;
4942 
4943 	if (!IS_GFX_DEVICE(dev))
4944 		return;
4945 
4946 	ver = (dev->device >> 8) & 0xff;
4947 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4948 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4949 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4950 		return;
4951 
4952 	if (risky_device(dev))
4953 		return;
4954 
4955 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4956 	iommu_skip_te_disable = 1;
4957 }
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4959 
4960 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4961    ISOCH DMAR unit for the Azalia sound device, but not give it any
4962    TLB entries, which causes it to deadlock. Check for that.  We do
4963    this in a function called from init_dmars(), instead of in a PCI
4964    quirk, because we don't want to print the obnoxious "BIOS broken"
4965    message if VT-d is actually disabled.
4966 */
4967 static void __init check_tylersburg_isoch(void)
4968 {
4969 	struct pci_dev *pdev;
4970 	uint32_t vtisochctrl;
4971 
4972 	/* If there's no Azalia in the system anyway, forget it. */
4973 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4974 	if (!pdev)
4975 		return;
4976 
4977 	if (risky_device(pdev)) {
4978 		pci_dev_put(pdev);
4979 		return;
4980 	}
4981 
4982 	pci_dev_put(pdev);
4983 
4984 	/* System Management Registers. Might be hidden, in which case
4985 	   we can't do the sanity check. But that's OK, because the
4986 	   known-broken BIOSes _don't_ actually hide it, so far. */
4987 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4988 	if (!pdev)
4989 		return;
4990 
4991 	if (risky_device(pdev)) {
4992 		pci_dev_put(pdev);
4993 		return;
4994 	}
4995 
4996 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4997 		pci_dev_put(pdev);
4998 		return;
4999 	}
5000 
5001 	pci_dev_put(pdev);
5002 
5003 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5004 	if (vtisochctrl & 1)
5005 		return;
5006 
5007 	/* Drop all bits other than the number of TLB entries */
5008 	vtisochctrl &= 0x1c;
5009 
5010 	/* If we have the recommended number of TLB entries (16), fine. */
5011 	if (vtisochctrl == 0x10)
5012 		return;
5013 
5014 	/* Zero TLB entries? You get to ride the short bus to school. */
5015 	if (!vtisochctrl) {
5016 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5017 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5018 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5019 		     dmi_get_system_info(DMI_BIOS_VERSION),
5020 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5021 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5022 		return;
5023 	}
5024 
5025 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5026 	       vtisochctrl);
5027 }
5028 
5029 /*
5030  * Here we deal with a device TLB defect where a device may inadvertently issue
5031  * an ATS invalidation completion before posted writes that were initiated with
5032  * a translated address using translations matching the invalidation address
5033  * range, violating the invalidation completion ordering.
5034  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5035  * vulnerable to this defect. In other words, any dTLB invalidation that is not
5036  * initiated under the control of the trusted/privileged host device driver must
5037  * use this quirk.
5038  * Device TLBs are invalidated under the following six conditions:
5039  * 1. Device driver does DMA API unmap IOVA
5040  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5041  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5042  *    exit_mmap() due to crash
5043  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5044  *    the VM has to free pages that were unmapped
5045  * 5. Userspace driver unmaps a DMA buffer
5046  * 6. Cache invalidation in vSVA usage (upcoming)
5047  *
5048  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5049  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback
5050  * to invalidate the TLB the same way as a normal user unmap, which will use this
5051  * quirk. The dTLB invalidation after a PASID cache flush does not need this quirk.
5052  *
5053  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5054  */
5055 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5056 			       unsigned long address, unsigned long mask,
5057 			       u32 pasid, u16 qdep)
5058 {
5059 	u16 sid;
5060 
5061 	if (likely(!info->dtlb_extra_inval))
5062 		return;
5063 
5064 	sid = PCI_DEVID(info->bus, info->devfn);
5065 	if (pasid == IOMMU_NO_PASID) {
5066 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5067 				   qdep, address, mask);
5068 	} else {
5069 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5070 					 pasid, qdep, address, mask);
5071 	}
5072 }
5073 
5074 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5075 
5076 /*
5077  * Function to submit a command to the enhanced command interface. The
5078  * valid enhanced command descriptions are defined in Table 47 of the
5079  * VT-d spec. The VT-d hardware implementation may support some but not
5080  * all commands, which can be determined by checking the Enhanced
5081  * Command Capability Register.
5082  *
5083  * Return values:
5084  *  - 0: Command successful without any error;
5085  *  - Negative: software error value;
5086  *  - Nonzero positive: failure status code defined in Table 48.
5087  */
5088 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5089 {
5090 	unsigned long flags;
5091 	u64 res;
5092 	int ret;
5093 
5094 	if (!cap_ecmds(iommu->cap))
5095 		return -ENODEV;
5096 
5097 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5098 
5099 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5100 	if (res & DMA_ECMD_ECRSP_IP) {
5101 		ret = -EBUSY;
5102 		goto err;
5103 	}
5104 
5105 	/*
5106 	 * Unconditionally write operand B, because:
5107 	 * - There is no side effect if an ecmd doesn't require an
5108 	 *   operand B, even though we set the register to some value.
5109 	 * - This is not invoked in any critical path, so the extra MMIO
5110 	 *   write raises no performance concern.
5111 	 */
5112 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5113 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5114 
5115 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5116 		      !(res & DMA_ECMD_ECRSP_IP), res);
5117 
5118 	if (res & DMA_ECMD_ECRSP_IP) {
5119 		ret = -ETIMEDOUT;
5120 		goto err;
5121 	}
5122 
5123 	ret = ecmd_get_status_code(res);
5124 err:
5125 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5126 
5127 	return ret;
5128 }
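
/*
 * Editor's sketch (hedged): ecmd_submit_sync() is non-static so that other
 * parts of the driver (e.g. the perfmon support) can submit enhanced
 * commands. A caller would interpret the return convention like this:
 *
 *	int ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *
 *	if (ret < 0)
 *		pr_err("ecmd could not be submitted: %d\n", ret);
 *	else if (ret > 0)
 *		pr_err("ecmd failed, hardware status code %d\n", ret);
 *
 * The status codes themselves are defined in Table 48 of the VT-d spec.
 */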
5129