xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision f7875966)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/*
 * Log prefix: pr_fmt() prepends "DMAR: " to a format string, and
 * dev_fmt() reuses the same prefix.
 */
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/* ROOT_SIZE and CONTEXT_SIZE are both defined as one VT-d page. */
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
/*
 * PCI device classification helpers.  Each takes an expression that
 * evaluates to a pointer with ->class, ->vendor and ->device members.
 * The argument is parenthesised everywhere so that expressions such as
 * "p + 1" expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
/*
 * Fixed address constants: the bounds of the IOAPIC range
 * (0xfee00000-0xfeefffff) and the IOVA start address (0x1000).
 */
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
/* Width passed to __iommu_calculate_agaw() by iommu_calculate_agaw(). */
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
/*
 * Upper bound applied to address widths (see agaw_to_width()), and the
 * same bound with VTD_PAGE_SHIFT subtracted, i.e. expressed in PFN bits.
 */
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 
/*
 * Unclamped limits for a guest address width of @gaw, computed in 64 bits:
 * __DOMAIN_MAX_PFN() is 2^(gaw - VTD_PAGE_SHIFT) - 1 and
 * __DOMAIN_MAX_ADDR() is 2^gaw - 1.
 * NOTE(review): the shift in __DOMAIN_MAX_ADDR() is undefined for
 * gaw >= 64; presumably callers never pass such a width - confirm.
 */
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
/*
 * DOMAIN_MAX_PFN() clamps the PFN limit to (unsigned long)-1.
 * DOMAIN_MAX_ADDR() is the base address of the last page; note that it is
 * built from the unclamped __DOMAIN_MAX_PFN().
 */
57 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
58 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60 
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN		(1)
63 
/* Convert an address to a page frame number using the MM PAGE_SHIFT. */
64 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
65 
66 /* page table handling */
/*
 * Each page-table level indexes LEVEL_STRIDE bits of the PFN; LEVEL_MASK
 * extracts one level's index (see pfn_level_offset()).
 */
67 #define LEVEL_STRIDE		(9)
68 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69 
/* Convert an AGAW value into a page-table level count (agaw + 2). */
static inline int agaw_to_level(int agaw)
{
	const int nr_levels = 2 + agaw;

	return nr_levels;
}
74 
75 static inline int agaw_to_width(int agaw)
76 {
77 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79 
80 static inline int width_to_agaw(int width)
81 {
82 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84 
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87 	return (level - 1) * LEVEL_STRIDE;
88 }
89 
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94 
95 static inline u64 level_mask(int level)
96 {
97 	return -1ULL << level_to_offset_bits(level);
98 }
99 
100 static inline u64 level_size(int level)
101 {
102 	return 1ULL << level_to_offset_bits(level);
103 }
104 
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107 	return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109 
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114 
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123 	return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
/* Return the first VT-d PFN of the MM page described by @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn_start(mm_pfn);
}
/* Return the first VT-d PFN of the MM page containing virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
133 
/* Prototype only; the definition is outside this part of the file. */
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136 
137 /*
138  * set to 1 to panic kernel if can't successfully enable VT-d
139  * (used when kernel is launched w/ TXT)
140  */
141 static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of "intel_iommu=". */
142 static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" option of "intel_iommu=". */
143 static int no_platform_optin;
144 
/* Number of struct root_entry elements that fit in one VT-d page. */
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
171 static inline void context_set_present(struct context_entry *context)
172 {
173 	context->lo |= 1;
174 }
175 
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178 	context->lo &= (((u64)-1) << 2) | 1;
179 }
180 
181 static inline void context_set_translation_type(struct context_entry *context,
182 						unsigned long value)
183 {
184 	context->lo &= (((u64)-1) << 4) | 3;
185 	context->lo |= (value & 3) << 2;
186 }
187 
188 static inline void context_set_address_root(struct context_entry *context,
189 					    unsigned long value)
190 {
191 	context->lo &= ~VTD_PAGE_MASK;
192 	context->lo |= value & VTD_PAGE_MASK;
193 }
194 
195 static inline void context_set_address_width(struct context_entry *context,
196 					     unsigned long value)
197 {
198 	context->hi |= value & 7;
199 }
200 
201 static inline void context_set_domain_id(struct context_entry *context,
202 					 unsigned long value)
203 {
204 	context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206 
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209 	context->lo |= CONTEXT_PASIDE;
210 }
211 
212 static inline int context_domain_id(struct context_entry *c)
213 {
214 	return((c->hi >> 8) & 0xffff);
215 }
216 
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219 	context->lo = 0;
220 	context->hi = 0;
221 }
222 
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225 	if (!iommu->copied_tables)
226 		return false;
227 
228 	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230 
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234 	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236 
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240 	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242 
243 /*
244  * This domain is a static identity mapping domain.
245  *	1. This domain creates a static 1:1 mapping to all usable memory.
246  *	2. It maps to each iommu if successful.
247  *	3. Each iommu maps to this domain if successful.
248  */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251 
/*
 * One RMRR unit: a reserved address range [base_address, end_address] and
 * the device scope it applies to.  Units are linked through @list.
 */
252 struct dmar_rmrr_unit {
253 	struct list_head list;		/* list of rmrr units	*/
254 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
255 	u64	base_address;		/* reserved base address*/
256 	u64	end_address;		/* reserved end address */
257 	struct dmar_dev_scope *devices;	/* target devices */
258 	int	devices_cnt;		/* target device count */
259 };
260 
/* One ATSR unit and its device scope.  Units are linked through @list. */
261 struct dmar_atsr_unit {
262 	struct list_head list;		/* list of ATSR units */
263 	struct acpi_dmar_header *hdr;	/* ACPI header */
264 	struct dmar_dev_scope *devices;	/* target devices */
265 	int devices_cnt;		/* target device count */
266 	u8 include_all:1;		/* include all ports */
267 };
268 
/*
 * One SATC unit, its device scope and the IOMMU it belongs to.  Units are
 * linked through @list.
 */
269 struct dmar_satc_unit {
270 	struct list_head list;		/* list of SATC units */
271 	struct acpi_dmar_header *hdr;	/* ACPI header */
272 	struct dmar_dev_scope *devices;	/* target devices */
273 	struct intel_iommu *iommu;	/* the corresponding iommu */
274 	int devices_cnt;		/* target device count */
275 	u8 atc_required:1;		/* ATS is required */
276 };
277 
/* List heads for the ATSR, RMRR and SATC units. */
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281 
/* Iterate @rmrr over every entry on dmar_rmrr_units (linked by ->list). */
282 #define for_each_rmrr_units(rmrr) \
283 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284 
/* Prototypes only; the definitions are outside this part of the file. */
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287 
/*
 * Build-time defaults taken from Kconfig.  intel_iommu_setup() overrides
 * them from the "intel_iommu=" options: "on"/"off" for dmar_disabled and
 * "sm_on"/"sm_off" for intel_iommu_sm.
 */
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290 
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293 
/*
 * dmar_map_gfx is cleared by "intel_iommu=igfx_off" and
 * intel_iommu_superpage by "intel_iommu=sp_off".
 */
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298 
/* NOTE(review): presumably bit flags kept in iommu_identity_mapping - confirm. */
299 #define IDENTMAP_GFX		2
300 #define IDENTMAP_AZALIA		4
301 
/* NOTE(review): declared without an initialiser here; the initialised definition is presumably later in the file. */
302 const struct iommu_ops intel_iommu_ops;
303 
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308 
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313 
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316 	u32 gsts;
317 
318 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 	if (gsts & DMA_GSTS_TES)
320 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322 
323 static int __init intel_iommu_setup(char *str)
324 {
325 	if (!str)
326 		return -EINVAL;
327 
328 	while (*str) {
329 		if (!strncmp(str, "on", 2)) {
330 			dmar_disabled = 0;
331 			pr_info("IOMMU enabled\n");
332 		} else if (!strncmp(str, "off", 3)) {
333 			dmar_disabled = 1;
334 			no_platform_optin = 1;
335 			pr_info("IOMMU disabled\n");
336 		} else if (!strncmp(str, "igfx_off", 8)) {
337 			dmar_map_gfx = 0;
338 			pr_info("Disable GFX device mapping\n");
339 		} else if (!strncmp(str, "forcedac", 8)) {
340 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 			iommu_dma_forcedac = true;
342 		} else if (!strncmp(str, "strict", 6)) {
343 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344 			iommu_set_dma_strict();
345 		} else if (!strncmp(str, "sp_off", 6)) {
346 			pr_info("Disable supported super page\n");
347 			intel_iommu_superpage = 0;
348 		} else if (!strncmp(str, "sm_on", 5)) {
349 			pr_info("Enable scalable mode if hardware supports\n");
350 			intel_iommu_sm = 1;
351 		} else if (!strncmp(str, "sm_off", 6)) {
352 			pr_info("Scalable mode is disallowed\n");
353 			intel_iommu_sm = 0;
354 		} else if (!strncmp(str, "tboot_noforce", 13)) {
355 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356 			intel_iommu_tboot_noforce = 1;
357 		} else {
358 			pr_notice("Unknown option - '%s'\n", str);
359 		}
360 
361 		str += strcspn(str, ",");
362 		while (*str == ',')
363 			str++;
364 	}
365 
366 	return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
369 
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372 	struct page *page;
373 	void *vaddr = NULL;
374 
375 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376 	if (page)
377 		vaddr = page_address(page);
378 	return vaddr;
379 }
380 
/* Hand the page at virtual address @vaddr to free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
385 
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390 
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392 				       unsigned long pfn)
393 {
394 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395 
396 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398 
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406 	unsigned long fl_sagaw, sl_sagaw;
407 
408 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409 	sl_sagaw = cap_sagaw(iommu->cap);
410 
411 	/* Second level only. */
412 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 		return sl_sagaw;
414 
415 	/* First level only. */
416 	if (!ecap_slts(iommu->ecap))
417 		return fl_sagaw;
418 
419 	return fl_sagaw & sl_sagaw;
420 }
421 
/*
 * Return the largest supported AGAW that does not exceed the AGAW for
 * @max_gaw, or -1 when no SAGAW bit at or below it is set.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
435 
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443 
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453 
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456 	return sm_supported(iommu) ?
457 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459 
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462 	struct iommu_domain_info *info;
463 	struct dmar_drhd_unit *drhd;
464 	struct intel_iommu *iommu;
465 	bool found = false;
466 	unsigned long i;
467 
468 	domain->iommu_coherency = true;
469 	xa_for_each(&domain->iommu_array, i, info) {
470 		found = true;
471 		if (!iommu_paging_structure_coherency(info->iommu)) {
472 			domain->iommu_coherency = false;
473 			break;
474 		}
475 	}
476 	if (found)
477 		return;
478 
479 	/* No hardware attached; use lowest common denominator */
480 	rcu_read_lock();
481 	for_each_active_iommu(iommu, drhd) {
482 		if (!iommu_paging_structure_coherency(iommu)) {
483 			domain->iommu_coherency = false;
484 			break;
485 		}
486 	}
487 	rcu_read_unlock();
488 }
489 
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 					 struct intel_iommu *skip)
492 {
493 	struct dmar_drhd_unit *drhd;
494 	struct intel_iommu *iommu;
495 	int mask = 0x3;
496 
497 	if (!intel_iommu_superpage)
498 		return 0;
499 
500 	/* set iommu_superpage to the smallest common denominator */
501 	rcu_read_lock();
502 	for_each_active_iommu(iommu, drhd) {
503 		if (iommu != skip) {
504 			if (domain && domain->use_first_level) {
505 				if (!cap_fl1gp_support(iommu->cap))
506 					mask = 0x1;
507 			} else {
508 				mask &= cap_super_page_val(iommu->cap);
509 			}
510 
511 			if (!mask)
512 				break;
513 		}
514 	}
515 	rcu_read_unlock();
516 
517 	return fls(mask);
518 }
519 
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522 	struct device_domain_info *info;
523 	int nid = NUMA_NO_NODE;
524 	unsigned long flags;
525 
526 	spin_lock_irqsave(&domain->lock, flags);
527 	list_for_each_entry(info, &domain->devices, link) {
528 		/*
529 		 * There could possibly be multiple device numa nodes as devices
530 		 * within the same domain may sit behind different IOMMUs. There
531 		 * isn't perfect answer in such situation, so we select first
532 		 * come first served policy.
533 		 */
534 		nid = dev_to_node(info->dev);
535 		if (nid != NUMA_NO_NODE)
536 			break;
537 	}
538 	spin_unlock_irqrestore(&domain->lock, flags);
539 
540 	return nid;
541 }
542 
543 static void domain_update_iotlb(struct dmar_domain *domain);
544 
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548 	unsigned long bitmap = 0;
549 
550 	/*
551 	 * 1-level super page supports page size of 2MiB, 2-level super page
552 	 * supports page size of both 2MiB and 1GiB.
553 	 */
554 	if (domain->iommu_superpage == 1)
555 		bitmap |= SZ_2M;
556 	else if (domain->iommu_superpage == 2)
557 		bitmap |= SZ_2M | SZ_1G;
558 
559 	return bitmap;
560 }
561 
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565 	domain_update_iommu_coherency(domain);
566 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567 
568 	/*
569 	 * If RHSA is missing, we should default to the device numa domain
570 	 * as fall back.
571 	 */
572 	if (domain->nid == NUMA_NO_NODE)
573 		domain->nid = domain_update_device_node(domain);
574 
575 	/*
576 	 * First-level translation restricts the input-address to a
577 	 * canonical address (i.e., address bits 63:N have the same
578 	 * value as address bit [N-1], where N is 48-bits with 4-level
579 	 * paging and 57-bits with 5-level paging). Hence, skip bit
580 	 * [N-1].
581 	 */
582 	if (domain->use_first_level)
583 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 	else
585 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586 
587 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588 	domain_update_iotlb(domain);
589 }
590 
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 					 u8 devfn, int alloc)
593 {
594 	struct root_entry *root = &iommu->root_entry[bus];
595 	struct context_entry *context;
596 	u64 *entry;
597 
598 	/*
599 	 * Except that the caller requested to allocate a new entry,
600 	 * returning a copied context entry makes no sense.
601 	 */
602 	if (!alloc && context_copied(iommu, bus, devfn))
603 		return NULL;
604 
605 	entry = &root->lo;
606 	if (sm_supported(iommu)) {
607 		if (devfn >= 0x80) {
608 			devfn -= 0x80;
609 			entry = &root->hi;
610 		}
611 		devfn *= 2;
612 	}
613 	if (*entry & 1)
614 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 	else {
616 		unsigned long phy_addr;
617 		if (!alloc)
618 			return NULL;
619 
620 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621 		if (!context)
622 			return NULL;
623 
624 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 		phy_addr = virt_to_phys((void *)context);
626 		*entry = phy_addr | 1;
627 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
628 	}
629 	return &context[devfn];
630 }
631 
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *				 sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643 	struct pci_dev *pdev, *pbridge;
644 
645 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 		return false;
647 
648 	pdev = to_pci_dev(dev);
649 	pbridge = to_pci_dev(bridge);
650 
651 	if (pbridge->subordinate &&
652 	    pbridge->subordinate->number <= pdev->bus->number &&
653 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 		return true;
655 
656 	return false;
657 }
658 
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661 	struct dmar_drhd_unit *drhd;
662 	u32 vtbar;
663 	int rc;
664 
665 	/* We know that this device on this chipset has its own IOMMU.
666 	 * If we find it under a different IOMMU, then the BIOS is lying
667 	 * to us. Hope that the IOMMU for this device is actually
668 	 * disabled, and it needs no translation...
669 	 */
670 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 	if (rc) {
672 		/* "can't" happen */
673 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 		return false;
675 	}
676 	vtbar &= 0xffff0000;
677 
678 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
679 	drhd = dmar_find_matched_drhd_unit(pdev);
680 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 		return true;
684 	}
685 
686 	return false;
687 }
688 
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691 	if (!iommu || iommu->drhd->ignored)
692 		return true;
693 
694 	if (dev_is_pci(dev)) {
695 		struct pci_dev *pdev = to_pci_dev(dev);
696 
697 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 		    quirk_ioat_snb_local_iommu(pdev))
700 			return true;
701 	}
702 
703 	return false;
704 }
705 
/*
 * device_to_iommu - find the IOMMU whose DRHD device scope covers @dev
 * @dev: device to look up; NULL yields NULL
 * @bus: when non-NULL together with @devfn, receives the bus number
 * @devfn: when non-NULL together with @bus, receives the devfn
 *
 * For a PCI device the lookup uses the physical function of its real DMA
 * device and skips DRHD units in other PCI segments.  For a non-PCI device
 * with an ACPI companion, the companion device is matched instead.  A unit
 * matches when the device is listed in its scope, when the device sits
 * below a listed bridge, or (PCI only) when the unit is include_all.
 *
 * Return: the matching IOMMU, or NULL when nothing matched or when
 * iommu_is_dummy() rejects the match.
 */
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708 	struct dmar_drhd_unit *drhd = NULL;
709 	struct pci_dev *pdev = NULL;
710 	struct intel_iommu *iommu;
711 	struct device *tmp;
712 	u16 segment = 0;
713 	int i;
714 
715 	if (!dev)
716 		return NULL;
717 
718 	if (dev_is_pci(dev)) {
719 		struct pci_dev *pf_pdev;
720 
721 		pdev = pci_real_dma_dev(to_pci_dev(dev));
722 
723 		/* VFs aren't listed in scope tables; we need to look up
724 		 * the PF instead to find the IOMMU. */
725 		pf_pdev = pci_physfn(pdev);
726 		dev = &pf_pdev->dev;
727 		segment = pci_domain_nr(pdev->bus);
728 	} else if (has_acpi_companion(dev))
729 		dev = &ACPI_COMPANION(dev)->dev;
730 
731 	rcu_read_lock();
732 	for_each_iommu(iommu, drhd) {
		/* Only PCI lookups are restricted to the matching segment. */
733 		if (pdev && segment != drhd->segment)
734 			continue;
735 
736 		for_each_active_dev_scope(drhd->devices,
737 					  drhd->devices_cnt, i, tmp) {
738 			if (tmp == dev) {
739 				/* For a VF use its original BDF# not that of the PF
740 				 * which we used for the IOMMU lookup. Strictly speaking
741 				 * we could do this for all PCI devices; we only need to
742 				 * get the BDF# from the scope table for ACPI matches. */
743 				if (pdev && pdev->is_virtfn)
744 					goto got_pdev;
745 
				/* Exact scope match: report the BDF recorded in the scope table. */
746 				if (bus && devfn) {
747 					*bus = drhd->devices[i].bus;
748 					*devfn = drhd->devices[i].devfn;
749 				}
750 				goto out;
751 			}
752 
753 			if (is_downstream_to_pci_bridge(dev, tmp))
754 				goto got_pdev;
755 		}
756 
		/*
		 * got_pdev is also entered by goto from the scope loop above,
		 * bypassing the include_all test; it reports the BDF of pdev.
		 */
757 		if (pdev && drhd->include_all) {
758 got_pdev:
759 			if (bus && devfn) {
760 				*bus = pdev->bus->number;
761 				*devfn = pdev->devfn;
762 			}
763 			goto out;
764 		}
765 	}
	/* Loop ran to completion without a match. */
766 	iommu = NULL;
767 out:
768 	if (iommu_is_dummy(iommu, dev))
769 		iommu = NULL;
770 
771 	rcu_read_unlock();
772 
773 	return iommu;
774 }
775 
776 static void domain_flush_cache(struct dmar_domain *domain,
777 			       void *addr, int size)
778 {
779 	if (!domain->iommu_coherency)
780 		clflush_cache_range(addr, size);
781 }
782 
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785 	struct context_entry *context;
786 	int i;
787 
788 	if (!iommu->root_entry)
789 		return;
790 
791 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
792 		context = iommu_context_addr(iommu, i, 0, 0);
793 		if (context)
794 			free_pgtable_page(context);
795 
796 		if (!sm_supported(iommu))
797 			continue;
798 
799 		context = iommu_context_addr(iommu, i, 0x80, 0);
800 		if (context)
801 			free_pgtable_page(context);
802 	}
803 
804 	free_pgtable_page(iommu->root_entry);
805 	iommu->root_entry = NULL;
806 }
807 
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812 	struct dma_pte *pte;
813 	int offset;
814 
815 	while (1) {
816 		offset = pfn_level_offset(pfn, level);
817 		pte = &parent[offset];
818 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819 			pr_info("PTE not present at level %d\n", level);
820 			break;
821 		}
822 
823 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824 
825 		if (level == 1)
826 			break;
827 
828 		parent = phys_to_virt(dma_pte_addr(pte));
829 		level--;
830 	}
831 }
832 
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 			  unsigned long long addr, u32 pasid)
835 {
836 	struct pasid_dir_entry *dir, *pde;
837 	struct pasid_entry *entries, *pte;
838 	struct context_entry *ctx_entry;
839 	struct root_entry *rt_entry;
840 	int i, dir_index, index, level;
841 	u8 devfn = source_id & 0xff;
842 	u8 bus = source_id >> 8;
843 	struct dma_pte *pgtable;
844 
845 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846 
847 	/* root entry dump */
848 	rt_entry = &iommu->root_entry[bus];
849 	if (!rt_entry) {
850 		pr_info("root table entry is not present\n");
851 		return;
852 	}
853 
854 	if (sm_supported(iommu))
855 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 			rt_entry->hi, rt_entry->lo);
857 	else
858 		pr_info("root entry: 0x%016llx", rt_entry->lo);
859 
860 	/* context entry dump */
861 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862 	if (!ctx_entry) {
863 		pr_info("context table entry is not present\n");
864 		return;
865 	}
866 
867 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 		ctx_entry->hi, ctx_entry->lo);
869 
870 	/* legacy mode does not require PASID entries */
871 	if (!sm_supported(iommu)) {
872 		level = agaw_to_level(ctx_entry->hi & 7);
873 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 		goto pgtable_walk;
875 	}
876 
877 	/* get the pointer to pasid directory entry */
878 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 	if (!dir) {
880 		pr_info("pasid directory entry is not present\n");
881 		return;
882 	}
883 	/* For request-without-pasid, get the pasid from context entry */
884 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885 		pasid = IOMMU_NO_PASID;
886 
887 	dir_index = pasid >> PASID_PDE_SHIFT;
888 	pde = &dir[dir_index];
889 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890 
891 	/* get the pointer to the pasid table entry */
892 	entries = get_pasid_table_from_pde(pde);
893 	if (!entries) {
894 		pr_info("pasid table entry is not present\n");
895 		return;
896 	}
897 	index = pasid & PASID_PTE_MASK;
898 	pte = &entries[index];
899 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901 
902 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905 	} else {
906 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908 	}
909 
910 pgtable_walk:
911 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914 
/*
 * pfn_to_dma_pte - walk (and optionally build) the page table down to @pfn
 * @domain: domain whose page table (domain->pgd) is walked
 * @pfn: DMA page frame number to look up
 * @target_level: in/out.  Non-zero: stop at that level, allocating any
 *                missing intermediate tables on the way.  Zero: stop at
 *                the first superpage or non-present entry (or at level 1)
 *                without allocating, and store the level reached here.
 * @gfp: allocation flags for new page-table pages
 *
 * Return: the PTE where the walk stopped, or NULL when @pfn is beyond the
 * domain's address width or a page-table allocation failed.
 */
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916 				      unsigned long pfn, int *target_level,
917 				      gfp_t gfp)
918 {
919 	struct dma_pte *parent, *pte;
920 	int level = agaw_to_level(domain->agaw);
921 	int offset;
922 
923 	if (!domain_pfn_supported(domain, pfn))
924 		/* Address beyond IOMMU's addressing capabilities. */
925 		return NULL;
926 
927 	parent = domain->pgd;
928 
929 	while (1) {
930 		void *tmp_page;
931 
932 		offset = pfn_level_offset(pfn, level);
933 		pte = &parent[offset];
		/* Lookup-only mode: stop at a leaf or a hole, before any allocation. */
934 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935 			break;
936 		if (level == *target_level)
937 			break;
938 
939 		if (!dma_pte_present(pte)) {
940 			uint64_t pteval;
941 
942 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
943 
944 			if (!tmp_page)
945 				return NULL;
946 
			/* Flush the new table before a PTE is made to point at it. */
947 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949 			if (domain->use_first_level)
950 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951 
			/* Install only if the slot is still zero; otherwise drop our page. */
952 			if (cmpxchg64(&pte->val, 0ULL, pteval))
953 				/* Someone else set it while we were thinking; use theirs. */
954 				free_pgtable_page(tmp_page);
955 			else
956 				domain_flush_cache(domain, pte, sizeof(*pte));
957 		}
958 		if (level == 1)
959 			break;
960 
		/* Descend into the table that this PTE points to. */
961 		parent = phys_to_virt(dma_pte_addr(pte));
962 		level--;
963 	}
964 
965 	if (!*target_level)
966 		*target_level = level;
967 
968 	return pte;
969 }
970 
971 /* return address's pte at specific level */
/*
 * Walks from the domain's top level down towards @level without
 * allocating anything.
 *
 * Return: the PTE at @level; or a superpage PTE met above @level, with
 * *@large_page set to its level; or NULL when a non-present entry is met
 * above @level (with *@large_page set to that entry's level) or when
 * @level is above the domain's top level.  *@large_page is left untouched
 * when the PTE at @level itself is returned.
 */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973 					 unsigned long pfn,
974 					 int level, int *large_page)
975 {
976 	struct dma_pte *parent, *pte;
977 	int total = agaw_to_level(domain->agaw);
978 	int offset;
979 
980 	parent = domain->pgd;
	/* 'total' is the level currently being examined; it counts down. */
981 	while (level <= total) {
982 		offset = pfn_level_offset(pfn, total);
983 		pte = &parent[offset];
984 		if (level == total)
985 			return pte;
986 
987 		if (!dma_pte_present(pte)) {
988 			*large_page = total;
989 			break;
990 		}
991 
992 		if (dma_pte_superpage(pte)) {
993 			*large_page = total;
994 			return pte;
995 		}
996 
997 		parent = phys_to_virt(dma_pte_addr(pte));
998 		total--;
999 	}
1000 	return NULL;
1001 }
1002 
1003 /* clear last level pte, a tlb flush should be followed */
/*
 * Clears the leaf PTEs that map [@start_pfn, @last_pfn].  Warns and
 * returns without doing anything when @last_pfn is not supported by the
 * domain or when @start_pfn is greater than @last_pfn.
 */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005 				unsigned long start_pfn,
1006 				unsigned long last_pfn)
1007 {
	/* NOTE(review): declared unsigned int but passed below as the int * argument of dma_pfn_level_pte(). */
1008 	unsigned int large_page;
1009 	struct dma_pte *first_pte, *pte;
1010 
1011 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012 	    WARN_ON(start_pfn > last_pfn))
1013 		return;
1014 
1015 	/* we don't need lock here; nobody else touches the iova range */
1016 	do {
1017 		large_page = 1;
1018 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019 		if (!pte) {
			/* Nothing mapped here: skip to the next boundary one level above the hole. */
1020 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021 			continue;
1022 		}
		/* Clear consecutive PTEs until the range ends or the table page ends. */
1023 		do {
1024 			dma_clear_pte(pte);
1025 			start_pfn += lvl_to_nr_pages(large_page);
1026 			pte++;
1027 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028 
		/* One flush covers the whole run of PTEs just cleared. */
1029 		domain_flush_cache(domain, first_pte,
1030 				   (void *)pte - (void *)first_pte);
1031 
	/* The start_pfn test also ends the loop if start_pfn wrapped to 0. */
1032 	} while (start_pfn && start_pfn <= last_pfn);
1033 }
1034 
/*
 * dma_pte_free_level - free page-table pages below @level within a range
 * @domain: domain that owns the page table
 * @level: level of the table that @pte points into
 * @retain_level: tables are only freed through entries at levels below this
 * @pte: first entry of the table at @level
 * @pfn: PFN that corresponds to the start of that table
 * @start_pfn: first PFN of the range being torn down
 * @last_pfn: last PFN of the range being torn down
 *
 * Recurses into every present, non-superpage entry.  After recursing, the
 * child table is freed and the entry cleared only when @level is below
 * @retain_level and [@start_pfn, @last_pfn] covers the entry's whole span.
 */
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036 			       int retain_level, struct dma_pte *pte,
1037 			       unsigned long pfn, unsigned long start_pfn,
1038 			       unsigned long last_pfn)
1039 {
	/* Start at the entry that contains the first PFN of interest. */
1040 	pfn = max(start_pfn, pfn);
1041 	pte = &pte[pfn_level_offset(pfn, level)];
1042 
1043 	do {
1044 		unsigned long level_pfn;
1045 		struct dma_pte *level_pte;
1046 
1047 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048 			goto next;
1049 
		/* First PFN covered by this entry, and the child table it points to. */
1050 		level_pfn = pfn & level_mask(level);
1051 		level_pte = phys_to_virt(dma_pte_addr(pte));
1052 
1053 		if (level > 2) {
1054 			dma_pte_free_level(domain, level - 1, retain_level,
1055 					   level_pte, level_pfn, start_pfn,
1056 					   last_pfn);
1057 		}
1058 
1059 		/*
1060 		 * Free the page table if we're below the level we want to
1061 		 * retain and the range covers the entire table.
1062 		 */
1063 		if (level < retain_level && !(start_pfn > level_pfn ||
1064 		      last_pfn < level_pfn + level_size(level) - 1)) {
1065 			dma_clear_pte(pte);
1066 			domain_flush_cache(domain, pte, sizeof(*pte));
1067 			free_pgtable_page(level_pte);
1068 		}
1069 next:
1070 		pfn += level_size(level);
	/* Stop at the end of this table page or once past the range. */
1071 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073 
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079 				   unsigned long start_pfn,
1080 				   unsigned long last_pfn,
1081 				   int retain_level)
1082 {
1083 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 
1085 	/* We don't need lock here; nobody else touches the iova range */
1086 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 			   domain->pgd, 0, start_pfn, last_pfn);
1088 
1089 	/* free pgd */
1090 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 		free_pgtable_page(domain->pgd);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103 				    int level, struct dma_pte *pte,
1104 				    struct list_head *freelist)
1105 {
1106 	struct page *pg;
1107 
1108 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109 	list_add_tail(&pg->lru, freelist);
1110 
1111 	if (level == 1)
1112 		return;
1113 
1114 	pte = page_address(pg);
1115 	do {
1116 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118 		pte++;
1119 	} while (!first_pte_in_page(pte));
1120 }
1121 
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123 				struct dma_pte *pte, unsigned long pfn,
1124 				unsigned long start_pfn, unsigned long last_pfn,
1125 				struct list_head *freelist)
1126 {
1127 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128 
1129 	pfn = max(start_pfn, pfn);
1130 	pte = &pte[pfn_level_offset(pfn, level)];
1131 
1132 	do {
1133 		unsigned long level_pfn = pfn & level_mask(level);
1134 
1135 		if (!dma_pte_present(pte))
1136 			goto next;
1137 
1138 		/* If range covers entire pagetable, free it */
1139 		if (start_pfn <= level_pfn &&
1140 		    last_pfn >= level_pfn + level_size(level) - 1) {
1141 			/* These suborbinate page tables are going away entirely. Don't
1142 			   bother to clear them; we're just going to *free* them. */
1143 			if (level > 1 && !dma_pte_superpage(pte))
1144 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145 
1146 			dma_clear_pte(pte);
1147 			if (!first_pte)
1148 				first_pte = pte;
1149 			last_pte = pte;
1150 		} else if (level > 1) {
1151 			/* Recurse down into a level that isn't *entirely* obsolete */
1152 			dma_pte_clear_level(domain, level - 1,
1153 					    phys_to_virt(dma_pte_addr(pte)),
1154 					    level_pfn, start_pfn, last_pfn,
1155 					    freelist);
1156 		}
1157 next:
1158 		pfn = level_pfn + level_size(level);
1159 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160 
1161 	if (first_pte)
1162 		domain_flush_cache(domain, first_pte,
1163 				   (void *)++last_pte - (void *)first_pte);
1164 }
1165 
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170 			 unsigned long last_pfn, struct list_head *freelist)
1171 {
1172 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173 	    WARN_ON(start_pfn > last_pfn))
1174 		return;
1175 
1176 	/* we don't need lock here; nobody else touches the iova range */
1177 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1179 
1180 	/* free pgd */
1181 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 		struct page *pgd_page = virt_to_page(domain->pgd);
1183 		list_add_tail(&pgd_page->lru, freelist);
1184 		domain->pgd = NULL;
1185 	}
1186 }
1187 
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191 	struct root_entry *root;
1192 
1193 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194 	if (!root) {
1195 		pr_err("Allocating root entry for %s failed\n",
1196 			iommu->name);
1197 		return -ENOMEM;
1198 	}
1199 
1200 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1201 	iommu->root_entry = root;
1202 
1203 	return 0;
1204 }
1205 
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208 	u64 addr;
1209 	u32 sts;
1210 	unsigned long flag;
1211 
1212 	addr = virt_to_phys(iommu->root_entry);
1213 	if (sm_supported(iommu))
1214 		addr |= DMA_RTADDR_SMT;
1215 
1216 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218 
1219 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220 
1221 	/* Make sure hardware complete it */
1222 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223 		      readl, (sts & DMA_GSTS_RTPS), sts);
1224 
1225 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226 
1227 	/*
1228 	 * Hardware invalidates all DMA remapping hardware translation
1229 	 * caches as part of SRTP flow.
1230 	 */
1231 	if (cap_esrtps(iommu->cap))
1232 		return;
1233 
1234 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235 	if (sm_supported(iommu))
1236 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239 
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242 	u32 val;
1243 	unsigned long flag;
1244 
1245 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246 		return;
1247 
1248 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250 
1251 	/* Make sure hardware complete it */
1252 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1254 
1255 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257 
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260 				  u16 did, u16 source_id, u8 function_mask,
1261 				  u64 type)
1262 {
1263 	u64 val = 0;
1264 	unsigned long flag;
1265 
1266 	switch (type) {
1267 	case DMA_CCMD_GLOBAL_INVL:
1268 		val = DMA_CCMD_GLOBAL_INVL;
1269 		break;
1270 	case DMA_CCMD_DOMAIN_INVL:
1271 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272 		break;
1273 	case DMA_CCMD_DEVICE_INVL:
1274 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276 		break;
1277 	default:
1278 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279 			iommu->name, type);
1280 		return;
1281 	}
1282 	val |= DMA_CCMD_ICC;
1283 
1284 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286 
1287 	/* Make sure hardware complete it */
1288 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290 
1291 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293 
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 				u64 addr, unsigned int size_order, u64 type)
1297 {
1298 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 	u64 val = 0, val_iva = 0;
1300 	unsigned long flag;
1301 
1302 	switch (type) {
1303 	case DMA_TLB_GLOBAL_FLUSH:
1304 		/* global flush doesn't need set IVA_REG */
1305 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306 		break;
1307 	case DMA_TLB_DSI_FLUSH:
1308 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 		break;
1310 	case DMA_TLB_PSI_FLUSH:
1311 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312 		/* IH bit is passed in as part of address */
1313 		val_iva = size_order | addr;
1314 		break;
1315 	default:
1316 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317 			iommu->name, type);
1318 		return;
1319 	}
1320 
1321 	if (cap_write_drain(iommu->cap))
1322 		val |= DMA_TLB_WRITE_DRAIN;
1323 
1324 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 	/* Note: Only uses first TLB reg currently */
1326 	if (val_iva)
1327 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329 
1330 	/* Make sure hardware complete it */
1331 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333 
1334 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335 
1336 	/* check IOTLB invalidation granularity */
1337 	if (DMA_TLB_IAIG(val) == 0)
1338 		pr_err("Flush IOTLB failed\n");
1339 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 			(unsigned long long)DMA_TLB_IIRG(type),
1342 			(unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344 
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349 	struct device_domain_info *info;
1350 	unsigned long flags;
1351 
1352 	spin_lock_irqsave(&domain->lock, flags);
1353 	list_for_each_entry(info, &domain->devices, link) {
1354 		if (info->iommu == iommu && info->bus == bus &&
1355 		    info->devfn == devfn) {
1356 			spin_unlock_irqrestore(&domain->lock, flags);
1357 			return info;
1358 		}
1359 	}
1360 	spin_unlock_irqrestore(&domain->lock, flags);
1361 
1362 	return NULL;
1363 }
1364 
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 	struct dev_pasid_info *dev_pasid;
1368 	struct device_domain_info *info;
1369 	bool has_iotlb_device = false;
1370 	unsigned long flags;
1371 
1372 	spin_lock_irqsave(&domain->lock, flags);
1373 	list_for_each_entry(info, &domain->devices, link) {
1374 		if (info->ats_enabled) {
1375 			has_iotlb_device = true;
1376 			break;
1377 		}
1378 	}
1379 
1380 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 		info = dev_iommu_priv_get(dev_pasid->dev);
1382 		if (info->ats_enabled) {
1383 			has_iotlb_device = true;
1384 			break;
1385 		}
1386 	}
1387 	domain->has_iotlb_device = has_iotlb_device;
1388 	spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390 
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401 		return false;
1402 
1403 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404 		return false;
1405 
1406 	return true;
1407 }
1408 
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411 	struct pci_dev *pdev;
1412 
1413 	if (!dev_is_pci(info->dev))
1414 		return;
1415 
1416 	pdev = to_pci_dev(info->dev);
1417 
1418 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1419 	   the device if you enable PASID support after ATS support is
1420 	   undefined. So always enable PASID support on devices which
1421 	   have it, even if we can't yet know if we're ever going to
1422 	   use it. */
1423 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 		info->pasid_enabled = 1;
1425 
1426 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428 		info->ats_enabled = 1;
1429 		domain_update_iotlb(info->domain);
1430 	}
1431 }
1432 
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435 	struct pci_dev *pdev;
1436 
1437 	if (!dev_is_pci(info->dev))
1438 		return;
1439 
1440 	pdev = to_pci_dev(info->dev);
1441 
1442 	if (info->ats_enabled) {
1443 		pci_disable_ats(pdev);
1444 		info->ats_enabled = 0;
1445 		domain_update_iotlb(info->domain);
1446 	}
1447 
1448 	if (info->pasid_enabled) {
1449 		pci_disable_pasid(pdev);
1450 		info->pasid_enabled = 0;
1451 	}
1452 }
1453 
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455 				    u64 addr, unsigned int mask)
1456 {
1457 	u16 sid, qdep;
1458 
1459 	if (!info || !info->ats_enabled)
1460 		return;
1461 
1462 	sid = info->bus << 8 | info->devfn;
1463 	qdep = info->ats_qdep;
1464 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 			   qdep, addr, mask);
1466 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468 
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 				  u64 addr, unsigned mask)
1471 {
1472 	struct dev_pasid_info *dev_pasid;
1473 	struct device_domain_info *info;
1474 	unsigned long flags;
1475 
1476 	if (!domain->has_iotlb_device)
1477 		return;
1478 
1479 	spin_lock_irqsave(&domain->lock, flags);
1480 	list_for_each_entry(info, &domain->devices, link)
1481 		__iommu_flush_dev_iotlb(info, addr, mask);
1482 
1483 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484 		info = dev_iommu_priv_get(dev_pasid->dev);
1485 
1486 		if (!info->ats_enabled)
1487 			continue;
1488 
1489 		qi_flush_dev_iotlb_pasid(info->iommu,
1490 					 PCI_DEVID(info->bus, info->devfn),
1491 					 info->pfsid, dev_pasid->pasid,
1492 					 info->ats_qdep, addr,
1493 					 mask);
1494 	}
1495 	spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497 
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499 				     struct dmar_domain *domain, u64 addr,
1500 				     unsigned long npages, bool ih)
1501 {
1502 	u16 did = domain_id_iommu(domain, iommu);
1503 	struct dev_pasid_info *dev_pasid;
1504 	unsigned long flags;
1505 
1506 	spin_lock_irqsave(&domain->lock, flags);
1507 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509 
1510 	if (!list_empty(&domain->devices))
1511 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512 	spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514 
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516 				  struct dmar_domain *domain,
1517 				  unsigned long pfn, unsigned int pages,
1518 				  int ih, int map)
1519 {
1520 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521 	unsigned int mask = ilog2(aligned_pages);
1522 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523 	u16 did = domain_id_iommu(domain, iommu);
1524 
1525 	if (WARN_ON(!pages))
1526 		return;
1527 
1528 	if (ih)
1529 		ih = 1 << 6;
1530 
1531 	if (domain->use_first_level) {
1532 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533 	} else {
1534 		unsigned long bitmask = aligned_pages - 1;
1535 
1536 		/*
1537 		 * PSI masks the low order bits of the base address. If the
1538 		 * address isn't aligned to the mask, then compute a mask value
1539 		 * needed to ensure the target range is flushed.
1540 		 */
1541 		if (unlikely(bitmask & pfn)) {
1542 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543 
1544 			/*
1545 			 * Since end_pfn <= pfn + bitmask, the only way bits
1546 			 * higher than bitmask can differ in pfn and end_pfn is
1547 			 * by carrying. This means after masking out bitmask,
1548 			 * high bits starting with the first set bit in
1549 			 * shared_bits are all equal in both pfn and end_pfn.
1550 			 */
1551 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553 		}
1554 
1555 		/*
1556 		 * Fallback to domain selective flush if no PSI support or
1557 		 * the size is too big.
1558 		 */
1559 		if (!cap_pgsel_inv(iommu->cap) ||
1560 		    mask > cap_max_amask_val(iommu->cap))
1561 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 							DMA_TLB_DSI_FLUSH);
1563 		else
1564 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 							DMA_TLB_PSI_FLUSH);
1566 	}
1567 
1568 	/*
1569 	 * In caching mode, changes of pages from non-present to present require
1570 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1571 	 */
1572 	if (!cap_caching_mode(iommu->cap) || !map)
1573 		iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575 
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 					struct dmar_domain *domain,
1579 					unsigned long pfn, unsigned int pages)
1580 {
1581 	/*
1582 	 * It's a non-present to present mapping. Only flush if caching mode
1583 	 * and second level.
1584 	 */
1585 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 	else
1588 		iommu_flush_write_buffer(iommu);
1589 }
1590 
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594 	struct iommu_domain_info *info;
1595 	unsigned long idx;
1596 
1597 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598 		struct intel_iommu *iommu = info->iommu;
1599 		u16 did = domain_id_iommu(dmar_domain, iommu);
1600 
1601 		if (dmar_domain->use_first_level)
1602 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603 		else
1604 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605 						 DMA_TLB_DSI_FLUSH);
1606 
1607 		if (!cap_caching_mode(iommu->cap))
1608 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609 	}
1610 }
1611 
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614 	u32 pmen;
1615 	unsigned long flags;
1616 
1617 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618 		return;
1619 
1620 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622 	pmen &= ~DMA_PMEN_EPM;
1623 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624 
1625 	/* wait for the protected region status bit to clear */
1626 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631 
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634 	u32 sts;
1635 	unsigned long flags;
1636 
1637 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 	iommu->gcmd |= DMA_GCMD_TE;
1639 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640 
1641 	/* Make sure hardware complete it */
1642 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643 		      readl, (sts & DMA_GSTS_TES), sts);
1644 
1645 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647 
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650 	u32 sts;
1651 	unsigned long flag;
1652 
1653 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655 		return;
1656 
1657 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658 	iommu->gcmd &= ~DMA_GCMD_TE;
1659 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660 
1661 	/* Make sure hardware complete it */
1662 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1664 
1665 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667 
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670 	u32 ndomains;
1671 
1672 	ndomains = cap_ndoms(iommu->cap);
1673 	pr_debug("%s: Number of Domains supported <%d>\n",
1674 		 iommu->name, ndomains);
1675 
1676 	spin_lock_init(&iommu->lock);
1677 
1678 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679 	if (!iommu->domain_ids)
1680 		return -ENOMEM;
1681 
1682 	/*
1683 	 * If Caching mode is set, then invalid translations are tagged
1684 	 * with domain-id 0, hence we need to pre-allocate it. We also
1685 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1686 	 * make sure it is not used for a real domain.
1687 	 */
1688 	set_bit(0, iommu->domain_ids);
1689 
1690 	/*
1691 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692 	 * entry for first-level or pass-through translation modes should
1693 	 * be programmed with a domain id different from those used for
1694 	 * second-level or nested translation. We reserve a domain id for
1695 	 * this purpose.
1696 	 */
1697 	if (sm_supported(iommu))
1698 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699 
1700 	return 0;
1701 }
1702 
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705 	if (!iommu->domain_ids)
1706 		return;
1707 
1708 	/*
1709 	 * All iommu domains must have been detached from the devices,
1710 	 * hence there should be no domain IDs in use.
1711 	 */
1712 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713 		    > NUM_RESERVED_DID))
1714 		return;
1715 
1716 	if (iommu->gcmd & DMA_GCMD_TE)
1717 		iommu_disable_translation(iommu);
1718 }
1719 
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722 	if (iommu->domain_ids) {
1723 		bitmap_free(iommu->domain_ids);
1724 		iommu->domain_ids = NULL;
1725 	}
1726 
1727 	if (iommu->copied_tables) {
1728 		bitmap_free(iommu->copied_tables);
1729 		iommu->copied_tables = NULL;
1730 	}
1731 
1732 	/* free context mapping */
1733 	free_context_table(iommu);
1734 
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736 	if (pasid_supported(iommu)) {
1737 		if (ecap_prs(iommu->ecap))
1738 			intel_svm_finish_prq(iommu);
1739 	}
1740 #endif
1741 }
1742 
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749 	/* Only SL is available in legacy mode */
1750 	if (!scalable_mode_support())
1751 		return false;
1752 
1753 	/* Only level (either FL or SL) is available, just use it */
1754 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 		return intel_cap_flts_sanity();
1756 
1757 	/* Both levels are available, decide it based on domain type */
1758 	return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760 
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763 	struct dmar_domain *domain;
1764 
1765 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766 	if (!domain)
1767 		return NULL;
1768 
1769 	domain->nid = NUMA_NO_NODE;
1770 	if (first_level_by_default(type))
1771 		domain->use_first_level = true;
1772 	domain->has_iotlb_device = false;
1773 	INIT_LIST_HEAD(&domain->devices);
1774 	INIT_LIST_HEAD(&domain->dev_pasids);
1775 	spin_lock_init(&domain->lock);
1776 	xa_init(&domain->iommu_array);
1777 
1778 	return domain;
1779 }
1780 
1781 static int domain_attach_iommu(struct dmar_domain *domain,
1782 			       struct intel_iommu *iommu)
1783 {
1784 	struct iommu_domain_info *info, *curr;
1785 	unsigned long ndomains;
1786 	int num, ret = -ENOSPC;
1787 
1788 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1789 	if (!info)
1790 		return -ENOMEM;
1791 
1792 	spin_lock(&iommu->lock);
1793 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794 	if (curr) {
1795 		curr->refcnt++;
1796 		spin_unlock(&iommu->lock);
1797 		kfree(info);
1798 		return 0;
1799 	}
1800 
1801 	ndomains = cap_ndoms(iommu->cap);
1802 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803 	if (num >= ndomains) {
1804 		pr_err("%s: No free domain ids\n", iommu->name);
1805 		goto err_unlock;
1806 	}
1807 
1808 	set_bit(num, iommu->domain_ids);
1809 	info->refcnt	= 1;
1810 	info->did	= num;
1811 	info->iommu	= iommu;
1812 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813 			  NULL, info, GFP_ATOMIC);
1814 	if (curr) {
1815 		ret = xa_err(curr) ? : -EBUSY;
1816 		goto err_clear;
1817 	}
1818 	domain_update_iommu_cap(domain);
1819 
1820 	spin_unlock(&iommu->lock);
1821 	return 0;
1822 
1823 err_clear:
1824 	clear_bit(info->did, iommu->domain_ids);
1825 err_unlock:
1826 	spin_unlock(&iommu->lock);
1827 	kfree(info);
1828 	return ret;
1829 }
1830 
1831 static void domain_detach_iommu(struct dmar_domain *domain,
1832 				struct intel_iommu *iommu)
1833 {
1834 	struct iommu_domain_info *info;
1835 
1836 	spin_lock(&iommu->lock);
1837 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1838 	if (--info->refcnt == 0) {
1839 		clear_bit(info->did, iommu->domain_ids);
1840 		xa_erase(&domain->iommu_array, iommu->seq_id);
1841 		domain->nid = NUMA_NO_NODE;
1842 		domain_update_iommu_cap(domain);
1843 		kfree(info);
1844 	}
1845 	spin_unlock(&iommu->lock);
1846 }
1847 
/*
 * Round the guest address width @gaw up to the next width of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = rem ? gaw + 9 - rem : gaw;

	return width > 64 ? 64 : width;
}
1861 
1862 static void domain_exit(struct dmar_domain *domain)
1863 {
1864 	if (domain->pgd) {
1865 		LIST_HEAD(freelist);
1866 
1867 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868 		put_pages_list(&freelist);
1869 	}
1870 
1871 	if (WARN_ON(!list_empty(&domain->devices)))
1872 		return;
1873 
1874 	kfree(domain);
1875 }
1876 
1877 /*
1878  * Get the PASID directory size for scalable mode context entry.
1879  * Value of X in the PDTS field of a scalable mode context entry
1880  * indicates PASID directory with 2^(X + 7) entries.
1881  */
1882 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883 {
1884 	unsigned long pds, max_pde;
1885 
1886 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888 	if (pds < 7)
1889 		return 0;
1890 
1891 	return pds - 7;
1892 }
1893 
1894 /*
1895  * Set the RID_PASID field of a scalable mode context entry. The
1896  * IOMMU hardware will use the PASID value set in this field for
1897  * DMA translations of DMA requests without PASID.
1898  */
1899 static inline void
1900 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901 {
1902 	context->hi |= pasid & ((1 << 20) - 1);
1903 }
1904 
1905 /*
1906  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907  * entry.
1908  */
1909 static inline void context_set_sm_dte(struct context_entry *context)
1910 {
1911 	context->lo |= BIT_ULL(2);
1912 }
1913 
1914 /*
1915  * Set the PRE(Page Request Enable) field of a scalable mode context
1916  * entry.
1917  */
1918 static inline void context_set_sm_pre(struct context_entry *context)
1919 {
1920 	context->lo |= BIT_ULL(4);
1921 }
1922 
/*
 * Encode a PASID directory size value into the context entry's PDTS
 * field: the low three bits of @pds, placed at bit 9.
 */
#define context_pdts(pds)	(((pds) & 7) << 9)
1925 
1926 static int domain_context_mapping_one(struct dmar_domain *domain,
1927 				      struct intel_iommu *iommu,
1928 				      struct pasid_table *table,
1929 				      u8 bus, u8 devfn)
1930 {
1931 	struct device_domain_info *info =
1932 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1933 	u16 did = domain_id_iommu(domain, iommu);
1934 	int translation = CONTEXT_TT_MULTI_LEVEL;
1935 	struct context_entry *context;
1936 	int ret;
1937 
1938 	if (hw_pass_through && domain_type_is_si(domain))
1939 		translation = CONTEXT_TT_PASS_THROUGH;
1940 
1941 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1942 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1943 
1944 	spin_lock(&iommu->lock);
1945 	ret = -ENOMEM;
1946 	context = iommu_context_addr(iommu, bus, devfn, 1);
1947 	if (!context)
1948 		goto out_unlock;
1949 
1950 	ret = 0;
1951 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1952 		goto out_unlock;
1953 
1954 	/*
1955 	 * For kdump cases, old valid entries may be cached due to the
1956 	 * in-flight DMA and copied pgtable, but there is no unmapping
1957 	 * behaviour for them, thus we need an explicit cache flush for
1958 	 * the newly-mapped device. For kdump, at this point, the device
1959 	 * is supposed to finish reset at its driver probe stage, so no
1960 	 * in-flight DMA will exist, and we don't need to worry anymore
1961 	 * hereafter.
1962 	 */
1963 	if (context_copied(iommu, bus, devfn)) {
1964 		u16 did_old = context_domain_id(context);
1965 
1966 		if (did_old < cap_ndoms(iommu->cap)) {
1967 			iommu->flush.flush_context(iommu, did_old,
1968 						   (((u16)bus) << 8) | devfn,
1969 						   DMA_CCMD_MASK_NOBIT,
1970 						   DMA_CCMD_DEVICE_INVL);
1971 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972 						 DMA_TLB_DSI_FLUSH);
1973 		}
1974 
1975 		clear_context_copied(iommu, bus, devfn);
1976 	}
1977 
1978 	context_clear_entry(context);
1979 
1980 	if (sm_supported(iommu)) {
1981 		unsigned long pds;
1982 
1983 		/* Setup the PASID DIR pointer: */
1984 		pds = context_get_sm_pds(table);
1985 		context->lo = (u64)virt_to_phys(table->table) |
1986 				context_pdts(pds);
1987 
1988 		/* Setup the RID_PASID field: */
1989 		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990 
1991 		/*
1992 		 * Setup the Device-TLB enable bit and Page request
1993 		 * Enable bit:
1994 		 */
1995 		if (info && info->ats_supported)
1996 			context_set_sm_dte(context);
1997 		if (info && info->pri_supported)
1998 			context_set_sm_pre(context);
1999 		if (info && info->pasid_supported)
2000 			context_set_pasid(context);
2001 	} else {
2002 		struct dma_pte *pgd = domain->pgd;
2003 		int agaw;
2004 
2005 		context_set_domain_id(context, did);
2006 
2007 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2008 			/*
2009 			 * Skip top levels of page tables for iommu which has
2010 			 * less agaw than default. Unnecessary for PT mode.
2011 			 */
2012 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013 				ret = -ENOMEM;
2014 				pgd = phys_to_virt(dma_pte_addr(pgd));
2015 				if (!dma_pte_present(pgd))
2016 					goto out_unlock;
2017 			}
2018 
2019 			if (info && info->ats_supported)
2020 				translation = CONTEXT_TT_DEV_IOTLB;
2021 			else
2022 				translation = CONTEXT_TT_MULTI_LEVEL;
2023 
2024 			context_set_address_root(context, virt_to_phys(pgd));
2025 			context_set_address_width(context, agaw);
2026 		} else {
2027 			/*
2028 			 * In pass through mode, AW must be programmed to
2029 			 * indicate the largest AGAW value supported by
2030 			 * hardware. And ASR is ignored by hardware.
2031 			 */
2032 			context_set_address_width(context, iommu->msagaw);
2033 		}
2034 
2035 		context_set_translation_type(context, translation);
2036 	}
2037 
2038 	context_set_fault_enable(context);
2039 	context_set_present(context);
2040 	if (!ecap_coherent(iommu->ecap))
2041 		clflush_cache_range(context, sizeof(*context));
2042 
2043 	/*
2044 	 * It's a non-present to present mapping. If hardware doesn't cache
2045 	 * non-present entry we only need to flush the write-buffer. If the
2046 	 * _does_ cache non-present entries, then it does so in the special
2047 	 * domain #0, which we have to flush:
2048 	 */
2049 	if (cap_caching_mode(iommu->cap)) {
2050 		iommu->flush.flush_context(iommu, 0,
2051 					   (((u16)bus) << 8) | devfn,
2052 					   DMA_CCMD_MASK_NOBIT,
2053 					   DMA_CCMD_DEVICE_INVL);
2054 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2055 	} else {
2056 		iommu_flush_write_buffer(iommu);
2057 	}
2058 
2059 	ret = 0;
2060 
2061 out_unlock:
2062 	spin_unlock(&iommu->lock);
2063 
2064 	return ret;
2065 }
2066 
/*
 * Arguments handed through the opaque pointer of the DMA-alias walk to
 * domain_context_mapping_cb().
 */
struct domain_context_mapping_data {
	struct dmar_domain *domain;	/* domain to map the aliases into */
	struct intel_iommu *iommu;	/* IOMMU whose context table is set up */
	struct pasid_table *table;	/* PASID table passed on to the mapping */
};
2072 
2073 static int domain_context_mapping_cb(struct pci_dev *pdev,
2074 				     u16 alias, void *opaque)
2075 {
2076 	struct domain_context_mapping_data *data = opaque;
2077 
2078 	return domain_context_mapping_one(data->domain, data->iommu,
2079 					  data->table, PCI_BUS_NUM(alias),
2080 					  alias & 0xff);
2081 }
2082 
2083 static int
2084 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085 {
2086 	struct domain_context_mapping_data data;
2087 	struct pasid_table *table;
2088 	struct intel_iommu *iommu;
2089 	u8 bus, devfn;
2090 
2091 	iommu = device_to_iommu(dev, &bus, &devfn);
2092 	if (!iommu)
2093 		return -ENODEV;
2094 
2095 	table = intel_pasid_get_table(dev);
2096 
2097 	if (!dev_is_pci(dev))
2098 		return domain_context_mapping_one(domain, iommu, table,
2099 						  bus, devfn);
2100 
2101 	data.domain = domain;
2102 	data.iommu = iommu;
2103 	data.table = table;
2104 
2105 	return pci_for_each_dma_alias(to_pci_dev(dev),
2106 				      &domain_context_mapping_cb, &data);
2107 }
2108 
2109 /* Returns a number of VTD pages, but aligned to MM page size */
2110 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111 					    size_t size)
2112 {
2113 	host_addr &= ~PAGE_MASK;
2114 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115 }
2116 
2117 /* Return largest possible superpage level for a given mapping */
2118 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119 					  unsigned long iov_pfn,
2120 					  unsigned long phy_pfn,
2121 					  unsigned long pages)
2122 {
2123 	int support, level = 1;
2124 	unsigned long pfnmerge;
2125 
2126 	support = domain->iommu_superpage;
2127 
2128 	/* To use a large page, the virtual *and* physical addresses
2129 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130 	   of them will mean we have to use smaller pages. So just
2131 	   merge them and check both at once. */
2132 	pfnmerge = iov_pfn | phy_pfn;
2133 
2134 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135 		pages >>= VTD_STRIDE_SHIFT;
2136 		if (!pages)
2137 			break;
2138 		pfnmerge >>= VTD_STRIDE_SHIFT;
2139 		level++;
2140 		support--;
2141 	}
2142 	return level;
2143 }
2144 
2145 /*
2146  * Ensure that old small page tables are removed to make room for superpage(s).
2147  * We're going to add new large pages, so make sure we don't remove their parent
2148  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149  */
/*
 * @domain:    domain whose page table is modified
 * @start_pfn: first pfn of the range; advanced by lvl_to_nr_pages(@level)
 *             per iteration
 * @end_pfn:   last pfn of the range (the loop runs while start_pfn <= end_pfn)
 * @level:     page-table level of the superpages about to be installed
 */
2150 static void switch_to_super_page(struct dmar_domain *domain,
2151 				 unsigned long start_pfn,
2152 				 unsigned long end_pfn, int level)
2153 {
2154 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2155 	struct iommu_domain_info *info;
2156 	struct dma_pte *pte = NULL;
2157 	unsigned long i;
2158 
2159 	while (start_pfn <= end_pfn) {
		/*
		 * Look the entry up only when no cached pointer is held:
		 * on the first pass and after stepping into a new
		 * page-table page (see the bottom of the loop).
		 *
		 * NOTE(review): the result is used without a NULL check,
		 * unlike in __domain_mapping() - confirm the lookup cannot
		 * fail on this path.
		 */
2160 		if (!pte)
2161 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162 					     GFP_ATOMIC);
2163 
		/*
		 * Something is already installed at this level: free the
		 * tables for this entry's span (passing level + 1) and then
		 * flush that span on every IOMMU in domain->iommu_array.
		 */
2164 		if (dma_pte_present(pte)) {
2165 			dma_pte_free_pagetable(domain, start_pfn,
2166 					       start_pfn + lvl_pages - 1,
2167 					       level + 1);
2168 
2169 			xa_for_each(&domain->iommu_array, i, info)
2170 				iommu_flush_iotlb_psi(info->iommu, domain,
2171 						      start_pfn, lvl_pages,
2172 						      0, 0);
2173 		}
2174 
		/* Advance to the next entry; drop the cached pointer when it
		 * would be the first entry of another page-table page. */
2175 		pte++;
2176 		start_pfn += lvl_pages;
2177 		if (first_pte_in_page(pte))
2178 			pte = NULL;
2179 	}
2180 }
2181 
/*
 * Install page-table entries mapping @nr_pages VT-d pages, starting at
 * @iov_pfn, onto the physical range starting at @phys_pfn.
 *
 * @prot: DMA_PTE_READ / DMA_PTE_WRITE / DMA_PTE_SNP bits; at least one of
 *        READ and WRITE must be set
 * @gfp:  allocation flags passed to pfn_to_dma_pte()
 *
 * The level used for each stretch is chosen by hardware_largepage_caps(),
 * so a single call may mix superpages and ordinary pages.
 *
 * Returns 0 on success, -EINVAL if the last pfn is not supported by the
 * domain or @prot grants neither read nor write, and -ENOMEM if
 * pfn_to_dma_pte() returns NULL.
 */
2182 static int
2183 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185 		 gfp_t gfp)
2186 {
2187 	struct dma_pte *first_pte = NULL, *pte = NULL;
2188 	unsigned int largepage_lvl = 0;
2189 	unsigned long lvl_pages = 0;
2190 	phys_addr_t pteval;
2191 	u64 attr;
2192 
2193 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194 		return -EINVAL;
2195 
2196 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197 		return -EINVAL;
2198 
	/* Only the READ/WRITE/SNP bits of prot are carried into the entry;
	 * DMA_FL_PTE_PRESENT is always set. */
2199 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2200 	attr |= DMA_FL_PTE_PRESENT;
	/* First-level tables additionally get XD, US and ACCESS, plus DIRTY
	 * for writable mappings. */
2201 	if (domain->use_first_level) {
2202 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203 		if (prot & DMA_PTE_WRITE)
2204 			attr |= DMA_FL_PTE_DIRTY;
2205 	}
2206 
2207 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2208 
2209 	while (nr_pages > 0) {
2210 		uint64_t tmp;
2211 
		/* No cached entry pointer: (re)pick the level for what is
		 * left of the request and look up the matching entry. */
2212 		if (!pte) {
2213 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2214 					phys_pfn, nr_pages);
2215 
2216 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2217 					     gfp);
2218 			if (!pte)
2219 				return -ENOMEM;
			/* Start of the run of entries flushed further down. */
2220 			first_pte = pte;
2221 
2222 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2223 
2224 			/* It is large page*/
2225 			if (largepage_lvl > 1) {
2226 				unsigned long end_pfn;
2227 				unsigned long pages_to_remove;
2228 
2229 				pteval |= DMA_PTE_LARGE_PAGE;
				/* Clean up no further than the rest of the
				 * request or the entries left in this
				 * page-table page, whichever is smaller. */
2230 				pages_to_remove = min_t(unsigned long, nr_pages,
2231 							nr_pte_to_next_page(pte) * lvl_pages);
2232 				end_pfn = iov_pfn + pages_to_remove - 1;
2233 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2234 			} else {
2235 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2236 			}
2237 
2238 		}
2239 		/* We don't need lock here, nobody else
2240 		 * touches the iova range
2241 		 */
		/* The entry is written only if it currently holds 0; a
		 * non-zero old value is reported below, after which the loop
		 * still advances to the next entry. */
2242 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2243 		if (tmp) {
			/* Dump the DMA mappings for the first five hits only. */
2244 			static int dumps = 5;
2245 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2246 				iov_pfn, tmp, (unsigned long long)pteval);
2247 			if (dumps) {
2248 				dumps--;
2249 				debug_dma_dump_mappings(NULL);
2250 			}
2251 			WARN_ON(1);
2252 		}
2253 
		/* Step everything by the span covered by one entry at the
		 * current level. */
2254 		nr_pages -= lvl_pages;
2255 		iov_pfn += lvl_pages;
2256 		phys_pfn += lvl_pages;
2257 		pteval += lvl_pages * VTD_PAGE_SIZE;
2258 
2259 		/* If the next PTE would be the first in a new page, then we
2260 		 * need to flush the cache on the entries we've just written.
2261 		 * And then we'll need to recalculate 'pte', so clear it and
2262 		 * let it get set again in the if (!pte) block above.
2263 		 *
2264 		 * If we're done (!nr_pages) we need to flush the cache too.
2265 		 *
2266 		 * Also if we've been setting superpages, we may need to
2267 		 * recalculate 'pte' and switch back to smaller pages for the
2268 		 * end of the mapping, if the trailing size is not enough to
2269 		 * use another superpage (i.e. nr_pages < lvl_pages).
2270 		 */
2271 		pte++;
2272 		if (!nr_pages || first_pte_in_page(pte) ||
2273 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2274 			domain_flush_cache(domain, first_pte,
2275 					   (void *)pte - (void *)first_pte);
2276 			pte = NULL;
2277 		}
2278 	}
2279 
2280 	return 0;
2281 }
2282 
/*
 * Clear the context entry for @bus/@devfn on info->iommu and invalidate
 * what may have been cached for it.
 *
 * Does nothing when info->iommu is NULL or iommu_context_addr() finds no
 * context entry. The entry is cleared and flushed from the CPU cache under
 * iommu->lock; the hardware invalidations are issued after the lock has
 * been dropped, in this order: context cache (DMA_CCMD_DEVICE_INVL), PASID
 * cache (scalable mode only), IOTLB (DMA_TLB_DSI_FLUSH), device IOTLB.
 */
2283 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2284 {
2285 	struct intel_iommu *iommu = info->iommu;
2286 	struct context_entry *context;
2287 	u16 did_old;
2288 
2289 	if (!iommu)
2290 		return;
2291 
2292 	spin_lock(&iommu->lock);
2293 	context = iommu_context_addr(iommu, bus, devfn, 0);
2294 	if (!context) {
2295 		spin_unlock(&iommu->lock);
2296 		return;
2297 	}
2298 
	/*
	 * Pick the domain ID to invalidate with. In scalable mode it comes
	 * from the attached domain (FLPT_DEFAULT_DID for the identity domain
	 * under hardware pass-through); in legacy mode it is read from the
	 * context entry, so this must happen before the entry is cleared.
	 */
2299 	if (sm_supported(iommu)) {
2300 		if (hw_pass_through && domain_type_is_si(info->domain))
2301 			did_old = FLPT_DEFAULT_DID;
2302 		else
2303 			did_old = domain_id_iommu(info->domain, iommu);
2304 	} else {
2305 		did_old = context_domain_id(context);
2306 	}
2307 
2308 	context_clear_entry(context);
2309 	__iommu_flush_cache(iommu, context, sizeof(*context));
2310 	spin_unlock(&iommu->lock);
	/* Source-id argument: bus in the upper byte, devfn in the lower. */
2311 	iommu->flush.flush_context(iommu,
2312 				   did_old,
2313 				   (((u16)bus) << 8) | devfn,
2314 				   DMA_CCMD_MASK_NOBIT,
2315 				   DMA_CCMD_DEVICE_INVL);
2316 
2317 	if (sm_supported(iommu))
2318 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2319 
2320 	iommu->flush.flush_iotlb(iommu,
2321 				 did_old,
2322 				 0,
2323 				 0,
2324 				 DMA_TLB_DSI_FLUSH);
2325 
2326 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2327 }
2328 
2329 static int domain_setup_first_level(struct intel_iommu *iommu,
2330 				    struct dmar_domain *domain,
2331 				    struct device *dev,
2332 				    u32 pasid)
2333 {
2334 	struct dma_pte *pgd = domain->pgd;
2335 	int agaw, level;
2336 	int flags = 0;
2337 
2338 	/*
2339 	 * Skip top levels of page tables for iommu which has
2340 	 * less agaw than default. Unnecessary for PT mode.
2341 	 */
2342 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2343 		pgd = phys_to_virt(dma_pte_addr(pgd));
2344 		if (!dma_pte_present(pgd))
2345 			return -ENOMEM;
2346 	}
2347 
2348 	level = agaw_to_level(agaw);
2349 	if (level != 4 && level != 5)
2350 		return -EINVAL;
2351 
2352 	if (level == 5)
2353 		flags |= PASID_FLAG_FL5LP;
2354 
2355 	if (domain->force_snooping)
2356 		flags |= PASID_FLAG_PAGE_SNOOP;
2357 
2358 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2359 					     domain_id_iommu(domain, iommu),
2360 					     flags);
2361 }
2362 
2363 static bool dev_is_real_dma_subdevice(struct device *dev)
2364 {
2365 	return dev && dev_is_pci(dev) &&
2366 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2367 }
2368 
2369 static int iommu_domain_identity_map(struct dmar_domain *domain,
2370 				     unsigned long first_vpfn,
2371 				     unsigned long last_vpfn)
2372 {
2373 	/*
2374 	 * RMRR range might have overlap with physical memory range,
2375 	 * clear it first
2376 	 */
2377 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2378 
2379 	return __domain_mapping(domain, first_vpfn,
2380 				first_vpfn, last_vpfn - first_vpfn + 1,
2381 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2382 }
2383 
2384 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2385 
2386 static int __init si_domain_init(int hw)
2387 {
2388 	struct dmar_rmrr_unit *rmrr;
2389 	struct device *dev;
2390 	int i, nid, ret;
2391 
2392 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2393 	if (!si_domain)
2394 		return -EFAULT;
2395 
2396 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2397 		domain_exit(si_domain);
2398 		si_domain = NULL;
2399 		return -EFAULT;
2400 	}
2401 
2402 	if (hw)
2403 		return 0;
2404 
2405 	for_each_online_node(nid) {
2406 		unsigned long start_pfn, end_pfn;
2407 		int i;
2408 
2409 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2410 			ret = iommu_domain_identity_map(si_domain,
2411 					mm_to_dma_pfn_start(start_pfn),
2412 					mm_to_dma_pfn_end(end_pfn));
2413 			if (ret)
2414 				return ret;
2415 		}
2416 	}
2417 
2418 	/*
2419 	 * Identity map the RMRRs so that devices with RMRRs could also use
2420 	 * the si_domain.
2421 	 */
2422 	for_each_rmrr_units(rmrr) {
2423 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2424 					  i, dev) {
2425 			unsigned long long start = rmrr->base_address;
2426 			unsigned long long end = rmrr->end_address;
2427 
2428 			if (WARN_ON(end < start ||
2429 				    end >> agaw_to_width(si_domain->agaw)))
2430 				continue;
2431 
2432 			ret = iommu_domain_identity_map(si_domain,
2433 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2434 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2435 			if (ret)
2436 				return ret;
2437 		}
2438 	}
2439 
2440 	return 0;
2441 }
2442 
/*
 * Attach @dev to @domain.
 *
 * Steps, in order: attach the domain to the device's IOMMU, record the
 * domain in the device's device_domain_info and add it to domain->devices,
 * set up the PASID entry for IOMMU_NO_PASID (scalable mode only, and not
 * for real-DMA sub-devices), set up the context mapping, and finally call
 * iommu_enable_pci_caps().
 *
 * Returns 0 on success, -ENODEV if no IOMMU is found for @dev, or the error
 * of the failing step. Failures after the device has been added to the
 * domain go through device_block_translation().
 */
2443 static int dmar_domain_attach_device(struct dmar_domain *domain,
2444 				     struct device *dev)
2445 {
2446 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2447 	struct intel_iommu *iommu;
2448 	unsigned long flags;
2449 	u8 bus, devfn;
2450 	int ret;
2451 
	/* bus and devfn are filled in but not used further down. */
2452 	iommu = device_to_iommu(dev, &bus, &devfn);
2453 	if (!iommu)
2454 		return -ENODEV;
2455 
2456 	ret = domain_attach_iommu(domain, iommu);
2457 	if (ret)
2458 		return ret;
2459 	info->domain = domain;
	/* domain->devices is modified under domain->lock. */
2460 	spin_lock_irqsave(&domain->lock, flags);
2461 	list_add(&info->link, &domain->devices);
2462 	spin_unlock_irqrestore(&domain->lock, flags);
2463 
2464 	/* PASID table is mandatory for a PCI device in scalable mode. */
2465 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2466 		/* Setup the PASID entry for requests without PASID: */
		/* Pass-through for the identity domain under hardware
		 * pass-through, else first- or second-level translation
		 * depending on domain->use_first_level. */
2467 		if (hw_pass_through && domain_type_is_si(domain))
2468 			ret = intel_pasid_setup_pass_through(iommu, domain,
2469 					dev, IOMMU_NO_PASID);
2470 		else if (domain->use_first_level)
2471 			ret = domain_setup_first_level(iommu, domain, dev,
2472 					IOMMU_NO_PASID);
2473 		else
2474 			ret = intel_pasid_setup_second_level(iommu, domain,
2475 					dev, IOMMU_NO_PASID);
2476 		if (ret) {
2477 			dev_err(dev, "Setup RID2PASID failed\n");
2478 			device_block_translation(dev);
2479 			return ret;
2480 		}
2481 	}
2482 
2483 	ret = domain_context_mapping(domain, dev);
2484 	if (ret) {
2485 		dev_err(dev, "Domain context map failed\n");
2486 		device_block_translation(dev);
2487 		return ret;
2488 	}
2489 
2490 	iommu_enable_pci_caps(info);
2491 
2492 	return 0;
2493 }
2494 
2495 /**
2496  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2497  * is relaxable (ie. is allowed to be not enforced under some conditions)
2498  * @dev: device handle
2499  *
2500  * We assume that PCI USB devices with RMRRs have them largely
2501  * for historical reasons and that the RMRR space is not actively used post
2502  * boot.  This exclusion may change if vendors begin to abuse it.
2503  *
2504  * The same exception is made for graphics devices, with the requirement that
2505  * any use of the RMRR regions will be torn down before assigning the device
2506  * to a guest.
2507  *
2508  * Return: true if the RMRR is relaxable, false otherwise
2509  */
2510 static bool device_rmrr_is_relaxable(struct device *dev)
2511 {
2512 	struct pci_dev *pdev;
2513 
2514 	if (!dev_is_pci(dev))
2515 		return false;
2516 
2517 	pdev = to_pci_dev(dev);
2518 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2519 		return true;
2520 	else
2521 		return false;
2522 }
2523 
2524 /*
2525  * Return the required default domain type for a specific device.
2526  *
2527  * @dev: the device in query
2528  * @startup: true if this is during early boot
2529  *
2530  * Returns:
2531  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2532  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2533  *  - 0: both identity and dynamic domains work for this device
2534  */
2535 static int device_def_domain_type(struct device *dev)
2536 {
2537 	if (dev_is_pci(dev)) {
2538 		struct pci_dev *pdev = to_pci_dev(dev);
2539 
2540 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2541 			return IOMMU_DOMAIN_IDENTITY;
2542 
2543 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2544 			return IOMMU_DOMAIN_IDENTITY;
2545 	}
2546 
2547 	return 0;
2548 }
2549 
2550 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2551 {
2552 	/*
2553 	 * Start from the sane iommu hardware state.
2554 	 * If the queued invalidation is already initialized by us
2555 	 * (for example, while enabling interrupt-remapping) then
2556 	 * we got the things already rolling from a sane state.
2557 	 */
2558 	if (!iommu->qi) {
2559 		/*
2560 		 * Clear any previous faults.
2561 		 */
2562 		dmar_fault(-1, iommu);
2563 		/*
2564 		 * Disable queued invalidation if supported and already enabled
2565 		 * before OS handover.
2566 		 */
2567 		dmar_disable_qi(iommu);
2568 	}
2569 
2570 	if (dmar_enable_qi(iommu)) {
2571 		/*
2572 		 * Queued Invalidate not enabled, use Register Based Invalidate
2573 		 */
2574 		iommu->flush.flush_context = __iommu_flush_context;
2575 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2576 		pr_info("%s: Using Register based invalidation\n",
2577 			iommu->name);
2578 	} else {
2579 		iommu->flush.flush_context = qi_flush_context;
2580 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2581 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2582 	}
2583 }
2584 
/*
 * Copy the context table(s) referenced by one old root entry.
 *
 * @iommu:  IOMMU the tables belong to
 * @old_re: root entry of @bus in the old root table (copied locally first)
 * @tbl:    array receiving pointers to the newly allocated tables; slot
 *          @bus when !@ext, slots starting at @bus * 2 when @ext
 * @bus:    bus number of the root entry
 * @ext:    when true, entries are two slots wide, so 128 devfns fill one
 *          table and the second half comes from the root entry's upper
 *          context-table pointer
 *
 * For every present old entry the domain ID is reserved in
 * iommu->domain_ids (when within cap_ndoms()), the bus/devfn is recorded
 * with set_context_copied() and the entry is copied verbatim.
 *
 * Returns 0 on success or when the old root entry has no context table to
 * copy, -ENOMEM if mapping the old table or allocating the new one fails.
 */
2585 static int copy_context_table(struct intel_iommu *iommu,
2586 			      struct root_entry *old_re,
2587 			      struct context_entry **tbl,
2588 			      int bus, bool ext)
2589 {
2590 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2591 	struct context_entry *new_ce = NULL, ce;
2592 	struct context_entry *old_ce = NULL;
2593 	struct root_entry re;
2594 	phys_addr_t old_ce_phys;
2595 
2596 	tbl_idx = ext ? bus * 2 : bus;
2597 	memcpy(&re, old_re, sizeof(re));
2598 
2599 	for (devfn = 0; devfn < 256; devfn++) {
2600 		/* First calculate the correct index */
2601 		idx = (ext ? devfn * 2 : devfn) % 256;
2602 
		/*
		 * idx wraps to 0 at the start of each old table: at devfn 0,
		 * and additionally at devfn 0x80 when ext is set.
		 */
2603 		if (idx == 0) {
2604 			/* First save what we may have and clean up */
2605 			if (new_ce) {
2606 				tbl[tbl_idx] = new_ce;
2607 				__iommu_flush_cache(iommu, new_ce,
2608 						    VTD_PAGE_SIZE);
				/* The next table goes into the second slot. */
2609 				pos = 1;
2610 			}
2611 
2612 			if (old_ce)
2613 				memunmap(old_ce);
2614 
2615 			ret = 0;
2616 			if (devfn < 0x80)
2617 				old_ce_phys = root_entry_lctp(&re);
2618 			else
2619 				old_ce_phys = root_entry_uctp(&re);
2620 
2621 			if (!old_ce_phys) {
2622 				if (ext && devfn == 0) {
2623 					/* No LCTP, try UCTP */
					/* The loop increment turns 0x7f into
					 * 0x80, where idx is 0 again. */
2624 					devfn = 0x7f;
2625 					continue;
2626 				} else {
2627 					goto out;
2628 				}
2629 			}
2630 
2631 			ret = -ENOMEM;
2632 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2633 					MEMREMAP_WB);
2634 			if (!old_ce)
2635 				goto out;
2636 
2637 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2638 			if (!new_ce)
2639 				goto out_unmap;
2640 
2641 			ret = 0;
2642 		}
2643 
2644 		/* Now copy the context entry */
2645 		memcpy(&ce, old_ce + idx, sizeof(ce));
2646 
2647 		if (!context_present(&ce))
2648 			continue;
2649 
		/* Reserve the old domain ID so it is not handed out again. */
2650 		did = context_domain_id(&ce);
2651 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2652 			set_bit(did, iommu->domain_ids);
2653 
2654 		set_context_copied(iommu, bus, devfn);
2655 		new_ce[idx] = ce;
2656 	}
2657 
	/* Store the table filled last; pos is 1 only if an earlier table
	 * was already stored at tbl[tbl_idx] above. */
2658 	tbl[tbl_idx + pos] = new_ce;
2659 
2660 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2661 
2662 out_unmap:
2663 	memunmap(old_ce);
2664 
2665 out:
2666 	return ret;
2667 }
2668 
/*
 * Copy the context tables that the root table currently programmed into
 * @iommu points to, and hook the copies into iommu->root_entry.
 *
 * The old root table address and mode are read from DMAR_RTADDR_REG.
 * Nothing is copied when the old mode (DMA_RTADDR_SMT) differs from what
 * sm_supported() reports for this kernel.
 *
 * Returns 0 on success, -EINVAL on a mode mismatch or a zero root table
 * address, -ENOMEM if an allocation or memremap() fails. A failure to copy
 * the table of an individual bus is only logged.
 */
2669 static int copy_translation_tables(struct intel_iommu *iommu)
2670 {
2671 	struct context_entry **ctxt_tbls;
2672 	struct root_entry *old_rt;
2673 	phys_addr_t old_rt_phys;
2674 	int ctxt_table_entries;
2675 	u64 rtaddr_reg;
2676 	int bus, ret;
2677 	bool new_ext, ext;
2678 
2679 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2680 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2681 	new_ext    = !!sm_supported(iommu);
2682 
2683 	/*
2684 	 * The RTT bit can only be changed when translation is disabled,
2685 	 * but disabling translation means to open a window for data
2686 	 * corruption. So bail out and don't copy anything if we would
2687 	 * have to change the bit.
2688 	 */
2689 	if (new_ext != ext)
2690 		return -EINVAL;
2691 
	/*
	 * One bit per 16-bit bus/devfn value.
	 *
	 * NOTE(review): the error returns further down leave this bitmap
	 * allocated - confirm it is released elsewhere.
	 */
2692 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2693 	if (!iommu->copied_tables)
2694 		return -ENOMEM;
2695 
2696 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2697 	if (!old_rt_phys)
2698 		return -EINVAL;
2699 
2700 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2701 	if (!old_rt)
2702 		return -ENOMEM;
2703 
2704 	/* This is too big for the stack - allocate it from slab */
	/* Two context-table slots per bus when ext is set, one otherwise. */
2705 	ctxt_table_entries = ext ? 512 : 256;
2706 	ret = -ENOMEM;
2707 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2708 	if (!ctxt_tbls)
2709 		goto out_unmap;
2710 
2711 	for (bus = 0; bus < 256; bus++) {
2712 		ret = copy_context_table(iommu, &old_rt[bus],
2713 					 ctxt_tbls, bus, ext);
		/* Best effort: a bus that fails is reported and skipped. */
2714 		if (ret) {
2715 			pr_err("%s: Failed to copy context table for bus %d\n",
2716 				iommu->name, bus);
2717 			continue;
2718 		}
2719 	}
2720 
2721 	spin_lock(&iommu->lock);
2722 
2723 	/* Context tables are copied, now write them to the root_entry table */
2724 	for (bus = 0; bus < 256; bus++) {
2725 		int idx = ext ? bus * 2 : bus;
2726 		u64 val;
2727 
		/* Bit 0 is set together with the table's physical address. */
2728 		if (ctxt_tbls[idx]) {
2729 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2730 			iommu->root_entry[bus].lo = val;
2731 		}
2732 
		/* The second slot exists only when ext is set. */
2733 		if (!ext || !ctxt_tbls[idx + 1])
2734 			continue;
2735 
2736 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2737 		iommu->root_entry[bus].hi = val;
2738 	}
2739 
2740 	spin_unlock(&iommu->lock);
2741 
	/* Only the pointer array is freed; the tables stay referenced from
	 * iommu->root_entry. */
2742 	kfree(ctxt_tbls);
2743 
2744 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2745 
2746 	ret = 0;
2747 
2748 out_unmap:
2749 	memunmap(old_rt);
2750 
2751 	return ret;
2752 }
2753 
/*
 * Boot-time initialisation of all DMAR units.
 *
 * First pass over every IOMMU: ignored units only get translation disabled;
 * for the others the invalidation interface is chosen, domain bookkeeping
 * and the root entry are allocated, and tables left programmed by a
 * previous kernel are copied when translation was found enabled in a kdump
 * kernel. Then the root entries are set, si_domain is created, and a second
 * pass sets up the page-request queue (CONFIG_INTEL_IOMMU_SVM) and the
 * fault interrupt.
 *
 * Returns 0 on success or a negative errno. On failure every active IOMMU
 * is disabled and freed and si_domain is released.
 */
2754 static int __init init_dmars(void)
2755 {
2756 	struct dmar_drhd_unit *drhd;
2757 	struct intel_iommu *iommu;
2758 	int ret;
2759 
2760 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2761 	if (ret)
2762 		goto free_iommu;
2763 
2764 	for_each_iommu(iommu, drhd) {
2765 		if (drhd->ignored) {
2766 			iommu_disable_translation(iommu);
2767 			continue;
2768 		}
2769 
2770 		/*
2771 		 * Find the max pasid size of all IOMMU's in the system.
2772 		 * We need to ensure the system pasid table is no bigger
2773 		 * than the smallest supported.
2774 		 */
2775 		if (pasid_supported(iommu)) {
2776 			u32 temp = 2 << ecap_pss(iommu->ecap);
2777 
2778 			intel_pasid_max_id = min_t(u32, temp,
2779 						   intel_pasid_max_id);
2780 		}
2781 
2782 		intel_iommu_init_qi(iommu);
2783 
2784 		ret = iommu_init_domains(iommu);
2785 		if (ret)
2786 			goto free_iommu;
2787 
2788 		init_translation_status(iommu);
2789 
		/* Translation left enabled is only kept (and its tables
		 * copied below) in a kdump kernel. */
2790 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2791 			iommu_disable_translation(iommu);
2792 			clear_translation_pre_enabled(iommu);
2793 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2794 				iommu->name);
2795 		}
2796 
2797 		/*
2798 		 * TBD:
2799 		 * we could share the same root & context tables
2800 		 * among all IOMMU's. Need to Split it later.
2801 		 */
2802 		ret = iommu_alloc_root_entry(iommu);
2803 		if (ret)
2804 			goto free_iommu;
2805 
2806 		if (translation_pre_enabled(iommu)) {
2807 			pr_info("Translation already enabled - trying to copy translation structures\n");
2808 
2809 			ret = copy_translation_tables(iommu);
2810 			if (ret) {
2811 				/*
2812 				 * We found the IOMMU with translation
2813 				 * enabled - but failed to copy over the
2814 				 * old root-entry table. Try to proceed
2815 				 * by disabling translation now and
2816 				 * allocating a clean root-entry table.
2817 				 * This might cause DMAR faults, but
2818 				 * probably the dump will still succeed.
2819 				 */
2820 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2821 				       iommu->name);
2822 				iommu_disable_translation(iommu);
2823 				clear_translation_pre_enabled(iommu);
2824 			} else {
2825 				pr_info("Copied translation tables from previous kernel for %s\n",
2826 					iommu->name);
2827 			}
2828 		}
2829 
		/* A single unit without pass-through support turns
		 * hw_pass_through off globally. */
2830 		if (!ecap_pass_through(iommu->ecap))
2831 			hw_pass_through = 0;
2832 		intel_svm_check(iommu);
2833 	}
2834 
2835 	/*
2836 	 * Now that qi is enabled on all iommus, set the root entry and flush
2837 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2838 	 * flush_context function will loop forever and the boot hangs.
2839 	 */
2840 	for_each_active_iommu(iommu, drhd) {
2841 		iommu_flush_write_buffer(iommu);
2842 		iommu_set_root_entry(iommu);
2843 	}
2844 
2845 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2846 	dmar_map_gfx = 0;
2847 #endif
2848 
	/* With graphics mapping turned off, graphics devices are put into
	 * the identity domain instead (see device_def_domain_type()). */
2849 	if (!dmar_map_gfx)
2850 		iommu_identity_mapping |= IDENTMAP_GFX;
2851 
2852 	check_tylersburg_isoch();
2853 
2854 	ret = si_domain_init(hw_pass_through);
2855 	if (ret)
2856 		goto free_iommu;
2857 
2858 	/*
2859 	 * for each drhd
2860 	 *   enable fault log
2861 	 *   global invalidate context cache
2862 	 *   global invalidate iotlb
2863 	 *   enable translation
2864 	 */
	/*
	 * NOTE(review): the loop below does not itself issue global
	 * invalidations or call iommu_enable_translation(); the list above
	 * may be stale - confirm where those steps happen.
	 */
2865 	for_each_iommu(iommu, drhd) {
2866 		if (drhd->ignored) {
2867 			/*
2868 			 * we always have to disable PMRs or DMA may fail on
2869 			 * this device
2870 			 */
2871 			if (force_on)
2872 				iommu_disable_protect_mem_regions(iommu);
2873 			continue;
2874 		}
2875 
2876 		iommu_flush_write_buffer(iommu);
2877 
2878 #ifdef CONFIG_INTEL_IOMMU_SVM
2879 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2880 			/*
2881 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2882 			 * could cause possible lock race condition.
2883 			 */
			/* The lock is therefore dropped around the call and
			 * re-taken before the result is checked. */
2884 			up_write(&dmar_global_lock);
2885 			ret = intel_svm_enable_prq(iommu);
2886 			down_write(&dmar_global_lock);
2887 			if (ret)
2888 				goto free_iommu;
2889 		}
2890 #endif
2891 		ret = dmar_set_interrupt(iommu);
2892 		if (ret)
2893 			goto free_iommu;
2894 	}
2895 
2896 	return 0;
2897 
	/* Common error exit: undo the setup of every active IOMMU. */
2898 free_iommu:
2899 	for_each_active_iommu(iommu, drhd) {
2900 		disable_dmar_iommu(iommu);
2901 		free_dmar_iommu(iommu);
2902 	}
2903 	if (si_domain) {
2904 		domain_exit(si_domain);
2905 		si_domain = NULL;
2906 	}
2907 
2908 	return ret;
2909 }
2910 
/*
 * Mark the DMAR units that will not be used for remapping.
 *
 * Pass 1: a unit that is not include_all and has no active device in its
 * scope is marked ignored.
 * Pass 2: a remaining unit that is not include_all and whose active devices
 * are all PCI graphics devices is marked gfx_dedicated, and additionally
 * ignored when dmar_map_gfx is off.
 */
2911 static void __init init_no_remapping_devices(void)
2912 {
2913 	struct dmar_drhd_unit *drhd;
2914 	struct device *dev;
2915 	int i;
2916 
2917 	for_each_drhd_unit(drhd) {
2918 		if (!drhd->include_all) {
			/* Stop at the first active device; i reaches
			 * devices_cnt only when there is none. */
2919 			for_each_active_dev_scope(drhd->devices,
2920 						  drhd->devices_cnt, i, dev)
2921 				break;
2922 			/* ignore DMAR unit if no devices exist */
2923 			if (i == drhd->devices_cnt)
2924 				drhd->ignored = 1;
2925 		}
2926 	}
2927 
2928 	for_each_active_drhd_unit(drhd) {
2929 		if (drhd->include_all)
2930 			continue;
2931 
		/* Stop at the first active device that is not a PCI graphics
		 * device; i < devices_cnt afterwards means one was found. */
2932 		for_each_active_dev_scope(drhd->devices,
2933 					  drhd->devices_cnt, i, dev)
2934 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2935 				break;
2936 		if (i < drhd->devices_cnt)
2937 			continue;
2938 
2939 		/* This IOMMU has *only* gfx devices. Either bypass it or
2940 		   set the gfx_mapped flag, as appropriate */
2941 		drhd->gfx_dedicated = 1;
2942 		if (!dmar_map_gfx)
2943 			drhd->ignored = 1;
2944 	}
2945 }
2946 
2947 #ifdef CONFIG_SUSPEND
2948 static int init_iommu_hw(void)
2949 {
2950 	struct dmar_drhd_unit *drhd;
2951 	struct intel_iommu *iommu = NULL;
2952 	int ret;
2953 
2954 	for_each_active_iommu(iommu, drhd) {
2955 		if (iommu->qi) {
2956 			ret = dmar_reenable_qi(iommu);
2957 			if (ret)
2958 				return ret;
2959 		}
2960 	}
2961 
2962 	for_each_iommu(iommu, drhd) {
2963 		if (drhd->ignored) {
2964 			/*
2965 			 * we always have to disable PMRs or DMA may fail on
2966 			 * this device
2967 			 */
2968 			if (force_on)
2969 				iommu_disable_protect_mem_regions(iommu);
2970 			continue;
2971 		}
2972 
2973 		iommu_flush_write_buffer(iommu);
2974 		iommu_set_root_entry(iommu);
2975 		iommu_enable_translation(iommu);
2976 		iommu_disable_protect_mem_regions(iommu);
2977 	}
2978 
2979 	return 0;
2980 }
2981 
/*
 * Issue a global context-cache invalidation (DMA_CCMD_GLOBAL_INVL) followed
 * by a global IOTLB flush (DMA_TLB_GLOBAL_FLUSH) on every active IOMMU,
 * through the flush callbacks stored in iommu->flush.
 */
2982 static void iommu_flush_all(void)
2983 {
2984 	struct dmar_drhd_unit *drhd;
2985 	struct intel_iommu *iommu;
2986 
2987 	for_each_active_iommu(iommu, drhd) {
2988 		iommu->flush.flush_context(iommu, 0, 0, 0,
2989 					   DMA_CCMD_GLOBAL_INVL);
2990 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2991 					 DMA_TLB_GLOBAL_FLUSH);
2992 	}
2993 }
2994 
2995 static int iommu_suspend(void)
2996 {
2997 	struct dmar_drhd_unit *drhd;
2998 	struct intel_iommu *iommu = NULL;
2999 	unsigned long flag;
3000 
3001 	for_each_active_iommu(iommu, drhd) {
3002 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3003 					     GFP_KERNEL);
3004 		if (!iommu->iommu_state)
3005 			goto nomem;
3006 	}
3007 
3008 	iommu_flush_all();
3009 
3010 	for_each_active_iommu(iommu, drhd) {
3011 		iommu_disable_translation(iommu);
3012 
3013 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3014 
3015 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3016 			readl(iommu->reg + DMAR_FECTL_REG);
3017 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3018 			readl(iommu->reg + DMAR_FEDATA_REG);
3019 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3020 			readl(iommu->reg + DMAR_FEADDR_REG);
3021 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3022 			readl(iommu->reg + DMAR_FEUADDR_REG);
3023 
3024 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3025 	}
3026 	return 0;
3027 
3028 nomem:
3029 	for_each_active_iommu(iommu, drhd)
3030 		kfree(iommu->iommu_state);
3031 
3032 	return -ENOMEM;
3033 }
3034 
3035 static void iommu_resume(void)
3036 {
3037 	struct dmar_drhd_unit *drhd;
3038 	struct intel_iommu *iommu = NULL;
3039 	unsigned long flag;
3040 
3041 	if (init_iommu_hw()) {
3042 		if (force_on)
3043 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3044 		else
3045 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3046 		return;
3047 	}
3048 
3049 	for_each_active_iommu(iommu, drhd) {
3050 
3051 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3052 
3053 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3054 			iommu->reg + DMAR_FECTL_REG);
3055 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3056 			iommu->reg + DMAR_FEDATA_REG);
3057 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3058 			iommu->reg + DMAR_FEADDR_REG);
3059 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3060 			iommu->reg + DMAR_FEUADDR_REG);
3061 
3062 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3063 	}
3064 
3065 	for_each_active_iommu(iommu, drhd)
3066 		kfree(iommu->iommu_state);
3067 }
3068 
/* Syscore suspend/resume callbacks; registered by init_iommu_pm_ops(). */
3069 static struct syscore_ops iommu_syscore_ops = {
3070 	.resume		= iommu_resume,
3071 	.suspend	= iommu_suspend,
3072 };
3073 
/* Register iommu_syscore_ops with the syscore framework. */
3074 static void __init init_iommu_pm_ops(void)
3075 {
3076 	register_syscore_ops(&iommu_syscore_ops);
3077 }
3078 
3079 #else
/* No-op stub used when CONFIG_SUSPEND is not set. */
3080 static inline void init_iommu_pm_ops(void) {}
3081 #endif	/* CONFIG_SUSPEND */
3082 
3083 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3084 {
3085 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3086 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3087 	    rmrr->end_address <= rmrr->base_address ||
3088 	    arch_rmrr_sanity_check(rmrr))
3089 		return -EINVAL;
3090 
3091 	return 0;
3092 }
3093 
3094 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3095 {
3096 	struct acpi_dmar_reserved_memory *rmrr;
3097 	struct dmar_rmrr_unit *rmrru;
3098 
3099 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3100 	if (rmrr_sanity_check(rmrr)) {
3101 		pr_warn(FW_BUG
3102 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3103 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3104 			   rmrr->base_address, rmrr->end_address,
3105 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3106 			   dmi_get_system_info(DMI_BIOS_VERSION),
3107 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3108 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3109 	}
3110 
3111 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3112 	if (!rmrru)
3113 		goto out;
3114 
3115 	rmrru->hdr = header;
3116 
3117 	rmrru->base_address = rmrr->base_address;
3118 	rmrru->end_address = rmrr->end_address;
3119 
3120 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3121 				((void *)rmrr) + rmrr->header.length,
3122 				&rmrru->devices_cnt);
3123 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3124 		goto free_rmrru;
3125 
3126 	list_add(&rmrru->list, &dmar_rmrr_units);
3127 
3128 	return 0;
3129 free_rmrru:
3130 	kfree(rmrru);
3131 out:
3132 	return -ENOMEM;
3133 }
3134 
3135 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3136 {
3137 	struct dmar_atsr_unit *atsru;
3138 	struct acpi_dmar_atsr *tmp;
3139 
3140 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3141 				dmar_rcu_check()) {
3142 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3143 		if (atsr->segment != tmp->segment)
3144 			continue;
3145 		if (atsr->header.length != tmp->header.length)
3146 			continue;
3147 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3148 			return atsru;
3149 	}
3150 
3151 	return NULL;
3152 }
3153 
3154 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3155 {
3156 	struct acpi_dmar_atsr *atsr;
3157 	struct dmar_atsr_unit *atsru;
3158 
3159 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3160 		return 0;
3161 
3162 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3163 	atsru = dmar_find_atsr(atsr);
3164 	if (atsru)
3165 		return 0;
3166 
3167 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3168 	if (!atsru)
3169 		return -ENOMEM;
3170 
3171 	/*
3172 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3173 	 * copy the memory content because the memory buffer will be freed
3174 	 * on return.
3175 	 */
3176 	atsru->hdr = (void *)(atsru + 1);
3177 	memcpy(atsru->hdr, hdr, hdr->length);
3178 	atsru->include_all = atsr->flags & 0x1;
3179 	if (!atsru->include_all) {
3180 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3181 				(void *)atsr + atsr->header.length,
3182 				&atsru->devices_cnt);
3183 		if (atsru->devices_cnt && atsru->devices == NULL) {
3184 			kfree(atsru);
3185 			return -ENOMEM;
3186 		}
3187 	}
3188 
3189 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3190 
3191 	return 0;
3192 }
3193 
3194 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3195 {
3196 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3197 	kfree(atsru);
3198 }
3199 
3200 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3201 {
3202 	struct acpi_dmar_atsr *atsr;
3203 	struct dmar_atsr_unit *atsru;
3204 
3205 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3206 	atsru = dmar_find_atsr(atsr);
3207 	if (atsru) {
3208 		list_del_rcu(&atsru->list);
3209 		synchronize_rcu();
3210 		intel_iommu_free_atsr(atsru);
3211 	}
3212 
3213 	return 0;
3214 }
3215 
3216 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3217 {
3218 	int i;
3219 	struct device *dev;
3220 	struct acpi_dmar_atsr *atsr;
3221 	struct dmar_atsr_unit *atsru;
3222 
3223 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3224 	atsru = dmar_find_atsr(atsr);
3225 	if (!atsru)
3226 		return 0;
3227 
3228 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3229 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3230 					  i, dev)
3231 			return -EBUSY;
3232 	}
3233 
3234 	return 0;
3235 }
3236 
3237 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3238 {
3239 	struct dmar_satc_unit *satcu;
3240 	struct acpi_dmar_satc *tmp;
3241 
3242 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3243 				dmar_rcu_check()) {
3244 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3245 		if (satc->segment != tmp->segment)
3246 			continue;
3247 		if (satc->header.length != tmp->header.length)
3248 			continue;
3249 		if (memcmp(satc, tmp, satc->header.length) == 0)
3250 			return satcu;
3251 	}
3252 
3253 	return NULL;
3254 }
3255 
3256 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3257 {
3258 	struct acpi_dmar_satc *satc;
3259 	struct dmar_satc_unit *satcu;
3260 
3261 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3262 		return 0;
3263 
3264 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3265 	satcu = dmar_find_satc(satc);
3266 	if (satcu)
3267 		return 0;
3268 
3269 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3270 	if (!satcu)
3271 		return -ENOMEM;
3272 
3273 	satcu->hdr = (void *)(satcu + 1);
3274 	memcpy(satcu->hdr, hdr, hdr->length);
3275 	satcu->atc_required = satc->flags & 0x1;
3276 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3277 					      (void *)satc + satc->header.length,
3278 					      &satcu->devices_cnt);
3279 	if (satcu->devices_cnt && !satcu->devices) {
3280 		kfree(satcu);
3281 		return -ENOMEM;
3282 	}
3283 	list_add_rcu(&satcu->list, &dmar_satc_units);
3284 
3285 	return 0;
3286 }
3287 
3288 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3289 {
3290 	int sp, ret;
3291 	struct intel_iommu *iommu = dmaru->iommu;
3292 
3293 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3294 	if (ret)
3295 		goto out;
3296 
3297 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3298 		pr_warn("%s: Doesn't support hardware pass through.\n",
3299 			iommu->name);
3300 		return -ENXIO;
3301 	}
3302 
3303 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3304 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3305 		pr_warn("%s: Doesn't support large page.\n",
3306 			iommu->name);
3307 		return -ENXIO;
3308 	}
3309 
3310 	/*
3311 	 * Disable translation if already enabled prior to OS handover.
3312 	 */
3313 	if (iommu->gcmd & DMA_GCMD_TE)
3314 		iommu_disable_translation(iommu);
3315 
3316 	ret = iommu_init_domains(iommu);
3317 	if (ret == 0)
3318 		ret = iommu_alloc_root_entry(iommu);
3319 	if (ret)
3320 		goto out;
3321 
3322 	intel_svm_check(iommu);
3323 
3324 	if (dmaru->ignored) {
3325 		/*
3326 		 * we always have to disable PMRs or DMA may fail on this device
3327 		 */
3328 		if (force_on)
3329 			iommu_disable_protect_mem_regions(iommu);
3330 		return 0;
3331 	}
3332 
3333 	intel_iommu_init_qi(iommu);
3334 	iommu_flush_write_buffer(iommu);
3335 
3336 #ifdef CONFIG_INTEL_IOMMU_SVM
3337 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3338 		ret = intel_svm_enable_prq(iommu);
3339 		if (ret)
3340 			goto disable_iommu;
3341 	}
3342 #endif
3343 	ret = dmar_set_interrupt(iommu);
3344 	if (ret)
3345 		goto disable_iommu;
3346 
3347 	iommu_set_root_entry(iommu);
3348 	iommu_enable_translation(iommu);
3349 
3350 	iommu_disable_protect_mem_regions(iommu);
3351 	return 0;
3352 
3353 disable_iommu:
3354 	disable_dmar_iommu(iommu);
3355 out:
3356 	free_dmar_iommu(iommu);
3357 	return ret;
3358 }
3359 
3360 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3361 {
3362 	int ret = 0;
3363 	struct intel_iommu *iommu = dmaru->iommu;
3364 
3365 	if (!intel_iommu_enabled)
3366 		return 0;
3367 	if (iommu == NULL)
3368 		return -EINVAL;
3369 
3370 	if (insert) {
3371 		ret = intel_iommu_add(dmaru);
3372 	} else {
3373 		disable_dmar_iommu(iommu);
3374 		free_dmar_iommu(iommu);
3375 	}
3376 
3377 	return ret;
3378 }
3379 
3380 static void intel_iommu_free_dmars(void)
3381 {
3382 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3383 	struct dmar_atsr_unit *atsru, *atsr_n;
3384 	struct dmar_satc_unit *satcu, *satc_n;
3385 
3386 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3387 		list_del(&rmrru->list);
3388 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3389 		kfree(rmrru);
3390 	}
3391 
3392 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3393 		list_del(&atsru->list);
3394 		intel_iommu_free_atsr(atsru);
3395 	}
3396 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3397 		list_del(&satcu->list);
3398 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3399 		kfree(satcu);
3400 	}
3401 }
3402 
3403 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3404 {
3405 	struct dmar_satc_unit *satcu;
3406 	struct acpi_dmar_satc *satc;
3407 	struct device *tmp;
3408 	int i;
3409 
3410 	dev = pci_physfn(dev);
3411 	rcu_read_lock();
3412 
3413 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3414 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3415 		if (satc->segment != pci_domain_nr(dev->bus))
3416 			continue;
3417 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3418 			if (to_pci_dev(tmp) == dev)
3419 				goto out;
3420 	}
3421 	satcu = NULL;
3422 out:
3423 	rcu_read_unlock();
3424 	return satcu;
3425 }
3426 
3427 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3428 {
3429 	int i, ret = 1;
3430 	struct pci_bus *bus;
3431 	struct pci_dev *bridge = NULL;
3432 	struct device *tmp;
3433 	struct acpi_dmar_atsr *atsr;
3434 	struct dmar_atsr_unit *atsru;
3435 	struct dmar_satc_unit *satcu;
3436 
3437 	dev = pci_physfn(dev);
3438 	satcu = dmar_find_matched_satc_unit(dev);
3439 	if (satcu)
3440 		/*
3441 		 * This device supports ATS as it is in SATC table.
3442 		 * When IOMMU is in legacy mode, enabling ATS is done
3443 		 * automatically by HW for the device that requires
3444 		 * ATS, hence OS should not enable this device ATS
3445 		 * to avoid duplicated TLB invalidation.
3446 		 */
3447 		return !(satcu->atc_required && !sm_supported(iommu));
3448 
3449 	for (bus = dev->bus; bus; bus = bus->parent) {
3450 		bridge = bus->self;
3451 		/* If it's an integrated device, allow ATS */
3452 		if (!bridge)
3453 			return 1;
3454 		/* Connected via non-PCIe: no ATS */
3455 		if (!pci_is_pcie(bridge) ||
3456 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3457 			return 0;
3458 		/* If we found the root port, look it up in the ATSR */
3459 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3460 			break;
3461 	}
3462 
3463 	rcu_read_lock();
3464 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3465 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3466 		if (atsr->segment != pci_domain_nr(dev->bus))
3467 			continue;
3468 
3469 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3470 			if (tmp == &bridge->dev)
3471 				goto out;
3472 
3473 		if (atsru->include_all)
3474 			goto out;
3475 	}
3476 	ret = 0;
3477 out:
3478 	rcu_read_unlock();
3479 
3480 	return ret;
3481 }
3482 
3483 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3484 {
3485 	int ret;
3486 	struct dmar_rmrr_unit *rmrru;
3487 	struct dmar_atsr_unit *atsru;
3488 	struct dmar_satc_unit *satcu;
3489 	struct acpi_dmar_atsr *atsr;
3490 	struct acpi_dmar_reserved_memory *rmrr;
3491 	struct acpi_dmar_satc *satc;
3492 
3493 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3494 		return 0;
3495 
3496 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3497 		rmrr = container_of(rmrru->hdr,
3498 				    struct acpi_dmar_reserved_memory, header);
3499 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3500 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3501 				((void *)rmrr) + rmrr->header.length,
3502 				rmrr->segment, rmrru->devices,
3503 				rmrru->devices_cnt);
3504 			if (ret < 0)
3505 				return ret;
3506 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3507 			dmar_remove_dev_scope(info, rmrr->segment,
3508 				rmrru->devices, rmrru->devices_cnt);
3509 		}
3510 	}
3511 
3512 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3513 		if (atsru->include_all)
3514 			continue;
3515 
3516 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3517 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3518 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3519 					(void *)atsr + atsr->header.length,
3520 					atsr->segment, atsru->devices,
3521 					atsru->devices_cnt);
3522 			if (ret > 0)
3523 				break;
3524 			else if (ret < 0)
3525 				return ret;
3526 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3527 			if (dmar_remove_dev_scope(info, atsr->segment,
3528 					atsru->devices, atsru->devices_cnt))
3529 				break;
3530 		}
3531 	}
3532 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3533 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3535 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3536 					(void *)satc + satc->header.length,
3537 					satc->segment, satcu->devices,
3538 					satcu->devices_cnt);
3539 			if (ret > 0)
3540 				break;
3541 			else if (ret < 0)
3542 				return ret;
3543 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3544 			if (dmar_remove_dev_scope(info, satc->segment,
3545 					satcu->devices, satcu->devices_cnt))
3546 				break;
3547 		}
3548 	}
3549 
3550 	return 0;
3551 }
3552 
3553 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3554 				       unsigned long val, void *v)
3555 {
3556 	struct memory_notify *mhp = v;
3557 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3558 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3559 			mhp->nr_pages - 1);
3560 
3561 	switch (val) {
3562 	case MEM_GOING_ONLINE:
3563 		if (iommu_domain_identity_map(si_domain,
3564 					      start_vpfn, last_vpfn)) {
3565 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3566 				start_vpfn, last_vpfn);
3567 			return NOTIFY_BAD;
3568 		}
3569 		break;
3570 
3571 	case MEM_OFFLINE:
3572 	case MEM_CANCEL_ONLINE:
3573 		{
3574 			struct dmar_drhd_unit *drhd;
3575 			struct intel_iommu *iommu;
3576 			LIST_HEAD(freelist);
3577 
3578 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3579 
3580 			rcu_read_lock();
3581 			for_each_active_iommu(iommu, drhd)
3582 				iommu_flush_iotlb_psi(iommu, si_domain,
3583 					start_vpfn, mhp->nr_pages,
3584 					list_empty(&freelist), 0);
3585 			rcu_read_unlock();
3586 			put_pages_list(&freelist);
3587 		}
3588 		break;
3589 	}
3590 
3591 	return NOTIFY_OK;
3592 }
3593 
3594 static struct notifier_block intel_iommu_memory_nb = {
3595 	.notifier_call = intel_iommu_memory_notifier,
3596 	.priority = 0
3597 };
3598 
3599 static void intel_disable_iommus(void)
3600 {
3601 	struct intel_iommu *iommu = NULL;
3602 	struct dmar_drhd_unit *drhd;
3603 
3604 	for_each_iommu(iommu, drhd)
3605 		iommu_disable_translation(iommu);
3606 }
3607 
3608 void intel_iommu_shutdown(void)
3609 {
3610 	struct dmar_drhd_unit *drhd;
3611 	struct intel_iommu *iommu = NULL;
3612 
3613 	if (no_iommu || dmar_disabled)
3614 		return;
3615 
3616 	down_write(&dmar_global_lock);
3617 
3618 	/* Disable PMRs explicitly here. */
3619 	for_each_iommu(iommu, drhd)
3620 		iommu_disable_protect_mem_regions(iommu);
3621 
3622 	/* Make sure the IOMMUs are switched off */
3623 	intel_disable_iommus();
3624 
3625 	up_write(&dmar_global_lock);
3626 }
3627 
3628 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3629 {
3630 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3631 
3632 	return container_of(iommu_dev, struct intel_iommu, iommu);
3633 }
3634 
3635 static ssize_t version_show(struct device *dev,
3636 			    struct device_attribute *attr, char *buf)
3637 {
3638 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3639 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3640 	return sysfs_emit(buf, "%d:%d\n",
3641 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3642 }
3643 static DEVICE_ATTR_RO(version);
3644 
3645 static ssize_t address_show(struct device *dev,
3646 			    struct device_attribute *attr, char *buf)
3647 {
3648 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3649 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3650 }
3651 static DEVICE_ATTR_RO(address);
3652 
3653 static ssize_t cap_show(struct device *dev,
3654 			struct device_attribute *attr, char *buf)
3655 {
3656 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3657 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3658 }
3659 static DEVICE_ATTR_RO(cap);
3660 
3661 static ssize_t ecap_show(struct device *dev,
3662 			 struct device_attribute *attr, char *buf)
3663 {
3664 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3665 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3666 }
3667 static DEVICE_ATTR_RO(ecap);
3668 
3669 static ssize_t domains_supported_show(struct device *dev,
3670 				      struct device_attribute *attr, char *buf)
3671 {
3672 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3673 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3674 }
3675 static DEVICE_ATTR_RO(domains_supported);
3676 
3677 static ssize_t domains_used_show(struct device *dev,
3678 				 struct device_attribute *attr, char *buf)
3679 {
3680 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3681 	return sysfs_emit(buf, "%d\n",
3682 			  bitmap_weight(iommu->domain_ids,
3683 					cap_ndoms(iommu->cap)));
3684 }
3685 static DEVICE_ATTR_RO(domains_used);
3686 
3687 static struct attribute *intel_iommu_attrs[] = {
3688 	&dev_attr_version.attr,
3689 	&dev_attr_address.attr,
3690 	&dev_attr_cap.attr,
3691 	&dev_attr_ecap.attr,
3692 	&dev_attr_domains_supported.attr,
3693 	&dev_attr_domains_used.attr,
3694 	NULL,
3695 };
3696 
3697 static struct attribute_group intel_iommu_group = {
3698 	.name = "intel-iommu",
3699 	.attrs = intel_iommu_attrs,
3700 };
3701 
3702 const struct attribute_group *intel_iommu_groups[] = {
3703 	&intel_iommu_group,
3704 	NULL,
3705 };
3706 
3707 static inline bool has_external_pci(void)
3708 {
3709 	struct pci_dev *pdev = NULL;
3710 
3711 	for_each_pci_dev(pdev)
3712 		if (pdev->external_facing) {
3713 			pci_dev_put(pdev);
3714 			return true;
3715 		}
3716 
3717 	return false;
3718 }
3719 
3720 static int __init platform_optin_force_iommu(void)
3721 {
3722 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3723 		return 0;
3724 
3725 	if (no_iommu || dmar_disabled)
3726 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3727 
3728 	/*
3729 	 * If Intel-IOMMU is disabled by default, we will apply identity
3730 	 * map for all devices except those marked as being untrusted.
3731 	 */
3732 	if (dmar_disabled)
3733 		iommu_set_default_passthrough(false);
3734 
3735 	dmar_disabled = 0;
3736 	no_iommu = 0;
3737 
3738 	return 1;
3739 }
3740 
3741 static int __init probe_acpi_namespace_devices(void)
3742 {
3743 	struct dmar_drhd_unit *drhd;
3744 	/* To avoid a -Wunused-but-set-variable warning. */
3745 	struct intel_iommu *iommu __maybe_unused;
3746 	struct device *dev;
3747 	int i, ret = 0;
3748 
3749 	for_each_active_iommu(iommu, drhd) {
3750 		for_each_active_dev_scope(drhd->devices,
3751 					  drhd->devices_cnt, i, dev) {
3752 			struct acpi_device_physical_node *pn;
3753 			struct acpi_device *adev;
3754 
3755 			if (dev->bus != &acpi_bus_type)
3756 				continue;
3757 
3758 			adev = to_acpi_device(dev);
3759 			mutex_lock(&adev->physical_node_lock);
3760 			list_for_each_entry(pn,
3761 					    &adev->physical_node_list, node) {
3762 				ret = iommu_probe_device(pn->dev);
3763 				if (ret)
3764 					break;
3765 			}
3766 			mutex_unlock(&adev->physical_node_lock);
3767 
3768 			if (ret)
3769 				return ret;
3770 		}
3771 	}
3772 
3773 	return 0;
3774 }
3775 
3776 static __init int tboot_force_iommu(void)
3777 {
3778 	if (!tboot_enabled())
3779 		return 0;
3780 
3781 	if (no_iommu || dmar_disabled)
3782 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3783 
3784 	dmar_disabled = 0;
3785 	no_iommu = 0;
3786 
3787 	return 1;
3788 }
3789 
3790 int __init intel_iommu_init(void)
3791 {
3792 	int ret = -ENODEV;
3793 	struct dmar_drhd_unit *drhd;
3794 	struct intel_iommu *iommu;
3795 
3796 	/*
3797 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3798 	 * opt in, so enforce that.
3799 	 */
3800 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3801 		    platform_optin_force_iommu();
3802 
3803 	down_write(&dmar_global_lock);
3804 	if (dmar_table_init()) {
3805 		if (force_on)
3806 			panic("tboot: Failed to initialize DMAR table\n");
3807 		goto out_free_dmar;
3808 	}
3809 
3810 	if (dmar_dev_scope_init() < 0) {
3811 		if (force_on)
3812 			panic("tboot: Failed to initialize DMAR device scope\n");
3813 		goto out_free_dmar;
3814 	}
3815 
3816 	up_write(&dmar_global_lock);
3817 
3818 	/*
3819 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3820 	 * complain later when we register it under the lock.
3821 	 */
3822 	dmar_register_bus_notifier();
3823 
3824 	down_write(&dmar_global_lock);
3825 
3826 	if (!no_iommu)
3827 		intel_iommu_debugfs_init();
3828 
3829 	if (no_iommu || dmar_disabled) {
3830 		/*
3831 		 * We exit the function here to ensure IOMMU's remapping and
3832 		 * mempool aren't setup, which means that the IOMMU's PMRs
3833 		 * won't be disabled via the call to init_dmars(). So disable
3834 		 * it explicitly here. The PMRs were setup by tboot prior to
3835 		 * calling SENTER, but the kernel is expected to reset/tear
3836 		 * down the PMRs.
3837 		 */
3838 		if (intel_iommu_tboot_noforce) {
3839 			for_each_iommu(iommu, drhd)
3840 				iommu_disable_protect_mem_regions(iommu);
3841 		}
3842 
3843 		/*
3844 		 * Make sure the IOMMUs are switched off, even when we
3845 		 * boot into a kexec kernel and the previous kernel left
3846 		 * them enabled
3847 		 */
3848 		intel_disable_iommus();
3849 		goto out_free_dmar;
3850 	}
3851 
3852 	if (list_empty(&dmar_rmrr_units))
3853 		pr_info("No RMRR found\n");
3854 
3855 	if (list_empty(&dmar_atsr_units))
3856 		pr_info("No ATSR found\n");
3857 
3858 	if (list_empty(&dmar_satc_units))
3859 		pr_info("No SATC found\n");
3860 
3861 	init_no_remapping_devices();
3862 
3863 	ret = init_dmars();
3864 	if (ret) {
3865 		if (force_on)
3866 			panic("tboot: Failed to initialize DMARs\n");
3867 		pr_err("Initialization failed\n");
3868 		goto out_free_dmar;
3869 	}
3870 	up_write(&dmar_global_lock);
3871 
3872 	init_iommu_pm_ops();
3873 
3874 	down_read(&dmar_global_lock);
3875 	for_each_active_iommu(iommu, drhd) {
3876 		/*
3877 		 * The flush queue implementation does not perform
3878 		 * page-selective invalidations that are required for efficient
3879 		 * TLB flushes in virtual environments.  The benefit of batching
3880 		 * is likely to be much lower than the overhead of synchronizing
3881 		 * the virtual and physical IOMMU page-tables.
3882 		 */
3883 		if (cap_caching_mode(iommu->cap) &&
3884 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3885 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3886 			iommu_set_dma_strict();
3887 		}
3888 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3889 				       intel_iommu_groups,
3890 				       "%s", iommu->name);
3891 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3892 
3893 		iommu_pmu_register(iommu);
3894 	}
3895 	up_read(&dmar_global_lock);
3896 
3897 	if (si_domain && !hw_pass_through)
3898 		register_memory_notifier(&intel_iommu_memory_nb);
3899 
3900 	down_read(&dmar_global_lock);
3901 	if (probe_acpi_namespace_devices())
3902 		pr_warn("ACPI name space devices didn't probe correctly\n");
3903 
3904 	/* Finally, we enable the DMA remapping hardware. */
3905 	for_each_iommu(iommu, drhd) {
3906 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3907 			iommu_enable_translation(iommu);
3908 
3909 		iommu_disable_protect_mem_regions(iommu);
3910 	}
3911 	up_read(&dmar_global_lock);
3912 
3913 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3914 
3915 	intel_iommu_enabled = 1;
3916 
3917 	return 0;
3918 
3919 out_free_dmar:
3920 	intel_iommu_free_dmars();
3921 	up_write(&dmar_global_lock);
3922 	return ret;
3923 }
3924 
3925 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3926 {
3927 	struct device_domain_info *info = opaque;
3928 
3929 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3930 	return 0;
3931 }
3932 
3933 /*
3934  * NB - intel-iommu lacks any sort of reference counting for the users of
3935  * dependent devices.  If multiple endpoints have intersecting dependent
3936  * devices, unbinding the driver from any one of them will possibly leave
3937  * the others unable to operate.
3938  */
3939 static void domain_context_clear(struct device_domain_info *info)
3940 {
3941 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3942 		return;
3943 
3944 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3945 			       &domain_context_clear_one_cb, info);
3946 }
3947 
3948 static void dmar_remove_one_dev_info(struct device *dev)
3949 {
3950 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3951 	struct dmar_domain *domain = info->domain;
3952 	struct intel_iommu *iommu = info->iommu;
3953 	unsigned long flags;
3954 
3955 	if (!dev_is_real_dma_subdevice(info->dev)) {
3956 		if (dev_is_pci(info->dev) && sm_supported(iommu))
3957 			intel_pasid_tear_down_entry(iommu, info->dev,
3958 					IOMMU_NO_PASID, false);
3959 
3960 		iommu_disable_pci_caps(info);
3961 		domain_context_clear(info);
3962 	}
3963 
3964 	spin_lock_irqsave(&domain->lock, flags);
3965 	list_del(&info->link);
3966 	spin_unlock_irqrestore(&domain->lock, flags);
3967 
3968 	domain_detach_iommu(domain, iommu);
3969 	info->domain = NULL;
3970 }
3971 
3972 /*
3973  * Clear the page table pointer in context or pasid table entries so that
3974  * all DMA requests without PASID from the device are blocked. If the page
3975  * table has been set, clean up the data structures.
3976  */
3977 static void device_block_translation(struct device *dev)
3978 {
3979 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3980 	struct intel_iommu *iommu = info->iommu;
3981 	unsigned long flags;
3982 
3983 	iommu_disable_pci_caps(info);
3984 	if (!dev_is_real_dma_subdevice(dev)) {
3985 		if (sm_supported(iommu))
3986 			intel_pasid_tear_down_entry(iommu, dev,
3987 						    IOMMU_NO_PASID, false);
3988 		else
3989 			domain_context_clear(info);
3990 	}
3991 
3992 	if (!info->domain)
3993 		return;
3994 
3995 	spin_lock_irqsave(&info->domain->lock, flags);
3996 	list_del(&info->link);
3997 	spin_unlock_irqrestore(&info->domain->lock, flags);
3998 
3999 	domain_detach_iommu(info->domain, iommu);
4000 	info->domain = NULL;
4001 }
4002 
4003 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4004 {
4005 	int adjust_width;
4006 
4007 	/* calculate AGAW */
4008 	domain->gaw = guest_width;
4009 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4010 	domain->agaw = width_to_agaw(adjust_width);
4011 
4012 	domain->iommu_coherency = false;
4013 	domain->iommu_superpage = 0;
4014 	domain->max_addr = 0;
4015 
4016 	/* always allocate the top pgd */
4017 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4018 	if (!domain->pgd)
4019 		return -ENOMEM;
4020 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4021 	return 0;
4022 }
4023 
/*
 * attach_dev callback of the blocking domain: block translation for @dev.
 * Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
4030 
4031 static struct iommu_domain blocking_domain = {
4032 	.ops = &(const struct iommu_domain_ops) {
4033 		.attach_dev	= blocking_domain_attach_dev,
4034 		.free		= intel_iommu_domain_free
4035 	}
4036 };
4037 
4038 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4039 {
4040 	struct dmar_domain *dmar_domain;
4041 	struct iommu_domain *domain;
4042 
4043 	switch (type) {
4044 	case IOMMU_DOMAIN_BLOCKED:
4045 		return &blocking_domain;
4046 	case IOMMU_DOMAIN_DMA:
4047 	case IOMMU_DOMAIN_UNMANAGED:
4048 		dmar_domain = alloc_domain(type);
4049 		if (!dmar_domain) {
4050 			pr_err("Can't allocate dmar_domain\n");
4051 			return NULL;
4052 		}
4053 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4054 			pr_err("Domain initialization failed\n");
4055 			domain_exit(dmar_domain);
4056 			return NULL;
4057 		}
4058 
4059 		domain = &dmar_domain->domain;
4060 		domain->geometry.aperture_start = 0;
4061 		domain->geometry.aperture_end   =
4062 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4063 		domain->geometry.force_aperture = true;
4064 
4065 		return domain;
4066 	case IOMMU_DOMAIN_IDENTITY:
4067 		return &si_domain->domain;
4068 	case IOMMU_DOMAIN_SVA:
4069 		return intel_svm_domain_alloc();
4070 	default:
4071 		return NULL;
4072 	}
4073 
4074 	return NULL;
4075 }
4076 
4077 static void intel_iommu_domain_free(struct iommu_domain *domain)
4078 {
4079 	if (domain != &si_domain->domain && domain != &blocking_domain)
4080 		domain_exit(to_dmar_domain(domain));
4081 }
4082 
4083 static int prepare_domain_attach_device(struct iommu_domain *domain,
4084 					struct device *dev)
4085 {
4086 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4087 	struct intel_iommu *iommu;
4088 	int addr_width;
4089 
4090 	iommu = device_to_iommu(dev, NULL, NULL);
4091 	if (!iommu)
4092 		return -ENODEV;
4093 
4094 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4095 		return -EINVAL;
4096 
4097 	/* check if this iommu agaw is sufficient for max mapped address */
4098 	addr_width = agaw_to_width(iommu->agaw);
4099 	if (addr_width > cap_mgaw(iommu->cap))
4100 		addr_width = cap_mgaw(iommu->cap);
4101 
4102 	if (dmar_domain->max_addr > (1LL << addr_width))
4103 		return -EINVAL;
4104 	dmar_domain->gaw = addr_width;
4105 
4106 	/*
4107 	 * Knock out extra levels of page tables if necessary
4108 	 */
4109 	while (iommu->agaw < dmar_domain->agaw) {
4110 		struct dma_pte *pte;
4111 
4112 		pte = dmar_domain->pgd;
4113 		if (dma_pte_present(pte)) {
4114 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4115 			free_pgtable_page(pte);
4116 		}
4117 		dmar_domain->agaw--;
4118 	}
4119 
4120 	return 0;
4121 }
4122 
4123 static int intel_iommu_attach_device(struct iommu_domain *domain,
4124 				     struct device *dev)
4125 {
4126 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4127 	int ret;
4128 
4129 	if (info->domain)
4130 		device_block_translation(dev);
4131 
4132 	ret = prepare_domain_attach_device(domain, dev);
4133 	if (ret)
4134 		return ret;
4135 
4136 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4137 }
4138 
4139 static int intel_iommu_map(struct iommu_domain *domain,
4140 			   unsigned long iova, phys_addr_t hpa,
4141 			   size_t size, int iommu_prot, gfp_t gfp)
4142 {
4143 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4144 	u64 max_addr;
4145 	int prot = 0;
4146 
4147 	if (iommu_prot & IOMMU_READ)
4148 		prot |= DMA_PTE_READ;
4149 	if (iommu_prot & IOMMU_WRITE)
4150 		prot |= DMA_PTE_WRITE;
4151 	if (dmar_domain->set_pte_snp)
4152 		prot |= DMA_PTE_SNP;
4153 
4154 	max_addr = iova + size;
4155 	if (dmar_domain->max_addr < max_addr) {
4156 		u64 end;
4157 
4158 		/* check if minimum agaw is sufficient for mapped address */
4159 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4160 		if (end < max_addr) {
4161 			pr_err("%s: iommu width (%d) is not "
4162 			       "sufficient for the mapped address (%llx)\n",
4163 			       __func__, dmar_domain->gaw, max_addr);
4164 			return -EFAULT;
4165 		}
4166 		dmar_domain->max_addr = max_addr;
4167 	}
4168 	/* Round up size to next multiple of PAGE_SIZE, if it and
4169 	   the low bits of hpa would take us onto the next page */
4170 	size = aligned_nrpages(hpa, size);
4171 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4172 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4173 }
4174 
4175 static int intel_iommu_map_pages(struct iommu_domain *domain,
4176 				 unsigned long iova, phys_addr_t paddr,
4177 				 size_t pgsize, size_t pgcount,
4178 				 int prot, gfp_t gfp, size_t *mapped)
4179 {
4180 	unsigned long pgshift = __ffs(pgsize);
4181 	size_t size = pgcount << pgshift;
4182 	int ret;
4183 
4184 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4185 		return -EINVAL;
4186 
4187 	if (!IS_ALIGNED(iova | paddr, pgsize))
4188 		return -EINVAL;
4189 
4190 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4191 	if (!ret && mapped)
4192 		*mapped = size;
4193 
4194 	return ret;
4195 }
4196 
4197 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4198 				unsigned long iova, size_t size,
4199 				struct iommu_iotlb_gather *gather)
4200 {
4201 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4202 	unsigned long start_pfn, last_pfn;
4203 	int level = 0;
4204 
4205 	/* Cope with horrid API which requires us to unmap more than the
4206 	   size argument if it happens to be a large-page mapping. */
4207 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4208 				     &level, GFP_ATOMIC)))
4209 		return 0;
4210 
4211 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4212 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4213 
4214 	start_pfn = iova >> VTD_PAGE_SHIFT;
4215 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4216 
4217 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4218 
4219 	if (dmar_domain->max_addr == iova + size)
4220 		dmar_domain->max_addr = iova;
4221 
4222 	/*
4223 	 * We do not use page-selective IOTLB invalidation in flush queue,
4224 	 * so there is no need to track page and sync iotlb.
4225 	 */
4226 	if (!iommu_iotlb_gather_queued(gather))
4227 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4228 
4229 	return size;
4230 }
4231 
4232 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4233 				      unsigned long iova,
4234 				      size_t pgsize, size_t pgcount,
4235 				      struct iommu_iotlb_gather *gather)
4236 {
4237 	unsigned long pgshift = __ffs(pgsize);
4238 	size_t size = pgcount << pgshift;
4239 
4240 	return intel_iommu_unmap(domain, iova, size, gather);
4241 }
4242 
4243 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4244 				 struct iommu_iotlb_gather *gather)
4245 {
4246 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4247 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4248 	size_t size = gather->end - gather->start;
4249 	struct iommu_domain_info *info;
4250 	unsigned long start_pfn;
4251 	unsigned long nrpages;
4252 	unsigned long i;
4253 
4254 	nrpages = aligned_nrpages(gather->start, size);
4255 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4256 
4257 	xa_for_each(&dmar_domain->iommu_array, i, info)
4258 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4259 				      start_pfn, nrpages,
4260 				      list_empty(&gather->freelist), 0);
4261 
4262 	put_pages_list(&gather->freelist);
4263 }
4264 
4265 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4266 					    dma_addr_t iova)
4267 {
4268 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4269 	struct dma_pte *pte;
4270 	int level = 0;
4271 	u64 phys = 0;
4272 
4273 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4274 			     GFP_ATOMIC);
4275 	if (pte && dma_pte_present(pte))
4276 		phys = dma_pte_addr(pte) +
4277 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4278 						VTD_PAGE_SHIFT) - 1));
4279 
4280 	return phys;
4281 }
4282 
4283 static bool domain_support_force_snooping(struct dmar_domain *domain)
4284 {
4285 	struct device_domain_info *info;
4286 	bool support = true;
4287 
4288 	assert_spin_locked(&domain->lock);
4289 	list_for_each_entry(info, &domain->devices, link) {
4290 		if (!ecap_sc_support(info->iommu->ecap)) {
4291 			support = false;
4292 			break;
4293 		}
4294 	}
4295 
4296 	return support;
4297 }
4298 
4299 static void domain_set_force_snooping(struct dmar_domain *domain)
4300 {
4301 	struct device_domain_info *info;
4302 
4303 	assert_spin_locked(&domain->lock);
4304 	/*
4305 	 * Second level page table supports per-PTE snoop control. The
4306 	 * iommu_map() interface will handle this by setting SNP bit.
4307 	 */
4308 	if (!domain->use_first_level) {
4309 		domain->set_pte_snp = true;
4310 		return;
4311 	}
4312 
4313 	list_for_each_entry(info, &domain->devices, link)
4314 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4315 						     IOMMU_NO_PASID);
4316 }
4317 
4318 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4319 {
4320 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4321 	unsigned long flags;
4322 
4323 	if (dmar_domain->force_snooping)
4324 		return true;
4325 
4326 	spin_lock_irqsave(&dmar_domain->lock, flags);
4327 	if (!domain_support_force_snooping(dmar_domain)) {
4328 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4329 		return false;
4330 	}
4331 
4332 	domain_set_force_snooping(dmar_domain);
4333 	dmar_domain->force_snooping = true;
4334 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4335 
4336 	return true;
4337 }
4338 
4339 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4340 {
4341 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4342 
4343 	switch (cap) {
4344 	case IOMMU_CAP_CACHE_COHERENCY:
4345 	case IOMMU_CAP_DEFERRED_FLUSH:
4346 		return true;
4347 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4348 		return dmar_platform_optin();
4349 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4350 		return ecap_sc_support(info->iommu->ecap);
4351 	default:
4352 		return false;
4353 	}
4354 }
4355 
4356 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4357 {
4358 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4359 	struct device_domain_info *info;
4360 	struct intel_iommu *iommu;
4361 	u8 bus, devfn;
4362 	int ret;
4363 
4364 	iommu = device_to_iommu(dev, &bus, &devfn);
4365 	if (!iommu || !iommu->iommu.ops)
4366 		return ERR_PTR(-ENODEV);
4367 
4368 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4369 	if (!info)
4370 		return ERR_PTR(-ENOMEM);
4371 
4372 	if (dev_is_real_dma_subdevice(dev)) {
4373 		info->bus = pdev->bus->number;
4374 		info->devfn = pdev->devfn;
4375 		info->segment = pci_domain_nr(pdev->bus);
4376 	} else {
4377 		info->bus = bus;
4378 		info->devfn = devfn;
4379 		info->segment = iommu->segment;
4380 	}
4381 
4382 	info->dev = dev;
4383 	info->iommu = iommu;
4384 	if (dev_is_pci(dev)) {
4385 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4386 		    pci_ats_supported(pdev) &&
4387 		    dmar_ats_supported(pdev, iommu)) {
4388 			info->ats_supported = 1;
4389 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4390 
4391 			/*
4392 			 * For IOMMU that supports device IOTLB throttling
4393 			 * (DIT), we assign PFSID to the invalidation desc
4394 			 * of a VF such that IOMMU HW can gauge queue depth
4395 			 * at PF level. If DIT is not set, PFSID will be
4396 			 * treated as reserved, which should be set to 0.
4397 			 */
4398 			if (ecap_dit(iommu->ecap))
4399 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4400 			info->ats_qdep = pci_ats_queue_depth(pdev);
4401 		}
4402 		if (sm_supported(iommu)) {
4403 			if (pasid_supported(iommu)) {
4404 				int features = pci_pasid_features(pdev);
4405 
4406 				if (features >= 0)
4407 					info->pasid_supported = features | 1;
4408 			}
4409 
4410 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4411 			    pci_pri_supported(pdev))
4412 				info->pri_supported = 1;
4413 		}
4414 	}
4415 
4416 	dev_iommu_priv_set(dev, info);
4417 
4418 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4419 		ret = intel_pasid_alloc_table(dev);
4420 		if (ret) {
4421 			dev_err(dev, "PASID table allocation failed\n");
4422 			dev_iommu_priv_set(dev, NULL);
4423 			kfree(info);
4424 			return ERR_PTR(ret);
4425 		}
4426 	}
4427 
4428 	return &iommu->iommu;
4429 }
4430 
4431 static void intel_iommu_release_device(struct device *dev)
4432 {
4433 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4434 
4435 	dmar_remove_one_dev_info(dev);
4436 	intel_pasid_free_table(dev);
4437 	dev_iommu_priv_set(dev, NULL);
4438 	kfree(info);
4439 	set_dma_ops(dev, NULL);
4440 }
4441 
4442 static void intel_iommu_probe_finalize(struct device *dev)
4443 {
4444 	set_dma_ops(dev, NULL);
4445 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4446 }
4447 
4448 static void intel_iommu_get_resv_regions(struct device *device,
4449 					 struct list_head *head)
4450 {
4451 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4452 	struct iommu_resv_region *reg;
4453 	struct dmar_rmrr_unit *rmrr;
4454 	struct device *i_dev;
4455 	int i;
4456 
4457 	rcu_read_lock();
4458 	for_each_rmrr_units(rmrr) {
4459 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4460 					  i, i_dev) {
4461 			struct iommu_resv_region *resv;
4462 			enum iommu_resv_type type;
4463 			size_t length;
4464 
4465 			if (i_dev != device &&
4466 			    !is_downstream_to_pci_bridge(device, i_dev))
4467 				continue;
4468 
4469 			length = rmrr->end_address - rmrr->base_address + 1;
4470 
4471 			type = device_rmrr_is_relaxable(device) ?
4472 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4473 
4474 			resv = iommu_alloc_resv_region(rmrr->base_address,
4475 						       length, prot, type,
4476 						       GFP_ATOMIC);
4477 			if (!resv)
4478 				break;
4479 
4480 			list_add_tail(&resv->list, head);
4481 		}
4482 	}
4483 	rcu_read_unlock();
4484 
4485 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4486 	if (dev_is_pci(device)) {
4487 		struct pci_dev *pdev = to_pci_dev(device);
4488 
4489 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4490 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4491 					IOMMU_RESV_DIRECT_RELAXABLE,
4492 					GFP_KERNEL);
4493 			if (reg)
4494 				list_add_tail(&reg->list, head);
4495 		}
4496 	}
4497 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4498 
4499 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4500 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4501 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4502 	if (!reg)
4503 		return;
4504 	list_add_tail(&reg->list, head);
4505 }
4506 
/*
 * device_group callback: PCI devices are grouped by pci_device_group(),
 * everything else by generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4513 
4514 static int intel_iommu_enable_sva(struct device *dev)
4515 {
4516 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4517 	struct intel_iommu *iommu;
4518 
4519 	if (!info || dmar_disabled)
4520 		return -EINVAL;
4521 
4522 	iommu = info->iommu;
4523 	if (!iommu)
4524 		return -EINVAL;
4525 
4526 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4527 		return -ENODEV;
4528 
4529 	if (!info->pasid_enabled || !info->ats_enabled)
4530 		return -EINVAL;
4531 
4532 	/*
4533 	 * Devices having device-specific I/O fault handling should not
4534 	 * support PCI/PRI. The IOMMU side has no means to check the
4535 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4536 	 * default that if the device driver enables SVA on a non-PRI
4537 	 * device, it will handle IOPF in its own way.
4538 	 */
4539 	if (!info->pri_supported)
4540 		return 0;
4541 
4542 	/* Devices supporting PRI should have it enabled. */
4543 	if (!info->pri_enabled)
4544 		return -EINVAL;
4545 
4546 	return 0;
4547 }
4548 
4549 static int intel_iommu_enable_iopf(struct device *dev)
4550 {
4551 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4552 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4553 	struct intel_iommu *iommu;
4554 	int ret;
4555 
4556 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4557 		return -ENODEV;
4558 
4559 	if (info->pri_enabled)
4560 		return -EBUSY;
4561 
4562 	iommu = info->iommu;
4563 	if (!iommu)
4564 		return -EINVAL;
4565 
4566 	/* PASID is required in PRG Response Message. */
4567 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4568 		return -EINVAL;
4569 
4570 	ret = pci_reset_pri(pdev);
4571 	if (ret)
4572 		return ret;
4573 
4574 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4575 	if (ret)
4576 		return ret;
4577 
4578 	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4579 	if (ret)
4580 		goto iopf_remove_device;
4581 
4582 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4583 	if (ret)
4584 		goto iopf_unregister_handler;
4585 	info->pri_enabled = 1;
4586 
4587 	return 0;
4588 
4589 iopf_unregister_handler:
4590 	iommu_unregister_device_fault_handler(dev);
4591 iopf_remove_device:
4592 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4593 
4594 	return ret;
4595 }
4596 
4597 static int intel_iommu_disable_iopf(struct device *dev)
4598 {
4599 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4600 	struct intel_iommu *iommu = info->iommu;
4601 
4602 	if (!info->pri_enabled)
4603 		return -EINVAL;
4604 
4605 	/*
4606 	 * PCIe spec states that by clearing PRI enable bit, the Page
4607 	 * Request Interface will not issue new page requests, but has
4608 	 * outstanding page requests that have been transmitted or are
4609 	 * queued for transmission. This is supposed to be called after
4610 	 * the device driver has stopped DMA, all PASIDs have been
4611 	 * unbound and the outstanding PRQs have been drained.
4612 	 */
4613 	pci_disable_pri(to_pci_dev(dev));
4614 	info->pri_enabled = 0;
4615 
4616 	/*
4617 	 * With PRI disabled and outstanding PRQs drained, unregistering
4618 	 * fault handler and removing device from iopf queue should never
4619 	 * fail.
4620 	 */
4621 	WARN_ON(iommu_unregister_device_fault_handler(dev));
4622 	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4623 
4624 	return 0;
4625 }
4626 
4627 static int
4628 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4629 {
4630 	switch (feat) {
4631 	case IOMMU_DEV_FEAT_IOPF:
4632 		return intel_iommu_enable_iopf(dev);
4633 
4634 	case IOMMU_DEV_FEAT_SVA:
4635 		return intel_iommu_enable_sva(dev);
4636 
4637 	default:
4638 		return -ENODEV;
4639 	}
4640 }
4641 
4642 static int
4643 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4644 {
4645 	switch (feat) {
4646 	case IOMMU_DEV_FEAT_IOPF:
4647 		return intel_iommu_disable_iopf(dev);
4648 
4649 	case IOMMU_DEV_FEAT_SVA:
4650 		return 0;
4651 
4652 	default:
4653 		return -ENODEV;
4654 	}
4655 }
4656 
4657 static bool intel_iommu_is_attach_deferred(struct device *dev)
4658 {
4659 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4660 
4661 	return translation_pre_enabled(info->iommu) && !info->domain;
4662 }
4663 
4664 /*
4665  * Check that the device does not live on an external facing PCI port that is
4666  * marked as untrusted. Such devices should not be able to apply quirks and
4667  * thus not be able to bypass the IOMMU restrictions.
4668  */
4669 static bool risky_device(struct pci_dev *pdev)
4670 {
4671 	if (pdev->untrusted) {
4672 		pci_info(pdev,
4673 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4674 			 pdev->vendor, pdev->device);
4675 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4676 		return true;
4677 	}
4678 	return false;
4679 }
4680 
4681 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4682 				       unsigned long iova, size_t size)
4683 {
4684 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4685 	unsigned long pages = aligned_nrpages(iova, size);
4686 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4687 	struct iommu_domain_info *info;
4688 	unsigned long i;
4689 
4690 	xa_for_each(&dmar_domain->iommu_array, i, info)
4691 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4692 }
4693 
4694 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4695 {
4696 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4697 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4698 	struct dmar_domain *dmar_domain;
4699 	struct iommu_domain *domain;
4700 	unsigned long flags;
4701 
4702 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4703 	if (WARN_ON_ONCE(!domain))
4704 		goto out_tear_down;
4705 
4706 	/*
4707 	 * The SVA implementation needs to handle its own stuffs like the mm
4708 	 * notification. Before consolidating that code into iommu core, let
4709 	 * the intel sva code handle it.
4710 	 */
4711 	if (domain->type == IOMMU_DOMAIN_SVA) {
4712 		intel_svm_remove_dev_pasid(dev, pasid);
4713 		goto out_tear_down;
4714 	}
4715 
4716 	dmar_domain = to_dmar_domain(domain);
4717 	spin_lock_irqsave(&dmar_domain->lock, flags);
4718 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4719 		if (curr->dev == dev && curr->pasid == pasid) {
4720 			list_del(&curr->link_domain);
4721 			dev_pasid = curr;
4722 			break;
4723 		}
4724 	}
4725 	WARN_ON_ONCE(!dev_pasid);
4726 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4727 
4728 	domain_detach_iommu(dmar_domain, iommu);
4729 	kfree(dev_pasid);
4730 out_tear_down:
4731 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4732 	intel_drain_pasid_prq(dev, pasid);
4733 }
4734 
4735 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4736 				     struct device *dev, ioasid_t pasid)
4737 {
4738 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4739 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4740 	struct intel_iommu *iommu = info->iommu;
4741 	struct dev_pasid_info *dev_pasid;
4742 	unsigned long flags;
4743 	int ret;
4744 
4745 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4746 		return -EOPNOTSUPP;
4747 
4748 	if (context_copied(iommu, info->bus, info->devfn))
4749 		return -EBUSY;
4750 
4751 	ret = prepare_domain_attach_device(domain, dev);
4752 	if (ret)
4753 		return ret;
4754 
4755 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4756 	if (!dev_pasid)
4757 		return -ENOMEM;
4758 
4759 	ret = domain_attach_iommu(dmar_domain, iommu);
4760 	if (ret)
4761 		goto out_free;
4762 
4763 	if (domain_type_is_si(dmar_domain))
4764 		ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4765 						     dev, pasid);
4766 	else if (dmar_domain->use_first_level)
4767 		ret = domain_setup_first_level(iommu, dmar_domain,
4768 					       dev, pasid);
4769 	else
4770 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4771 						     dev, pasid);
4772 	if (ret)
4773 		goto out_detach_iommu;
4774 
4775 	dev_pasid->dev = dev;
4776 	dev_pasid->pasid = pasid;
4777 	spin_lock_irqsave(&dmar_domain->lock, flags);
4778 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4779 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4780 
4781 	return 0;
4782 out_detach_iommu:
4783 	domain_detach_iommu(dmar_domain, iommu);
4784 out_free:
4785 	kfree(dev_pasid);
4786 	return ret;
4787 }
4788 
4789 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4790 {
4791 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4792 	struct intel_iommu *iommu = info->iommu;
4793 	struct iommu_hw_info_vtd *vtd;
4794 
4795 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4796 	if (!vtd)
4797 		return ERR_PTR(-ENOMEM);
4798 
4799 	vtd->cap_reg = iommu->cap;
4800 	vtd->ecap_reg = iommu->ecap;
4801 	*length = sizeof(*vtd);
4802 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4803 	return vtd;
4804 }
4805 
4806 const struct iommu_ops intel_iommu_ops = {
4807 	.capable		= intel_iommu_capable,
4808 	.hw_info		= intel_iommu_hw_info,
4809 	.domain_alloc		= intel_iommu_domain_alloc,
4810 	.probe_device		= intel_iommu_probe_device,
4811 	.probe_finalize		= intel_iommu_probe_finalize,
4812 	.release_device		= intel_iommu_release_device,
4813 	.get_resv_regions	= intel_iommu_get_resv_regions,
4814 	.device_group		= intel_iommu_device_group,
4815 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4816 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4817 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4818 	.def_domain_type	= device_def_domain_type,
4819 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4820 	.pgsize_bitmap		= SZ_4K,
4821 #ifdef CONFIG_INTEL_IOMMU_SVM
4822 	.page_response		= intel_svm_page_response,
4823 #endif
4824 	.default_domain_ops = &(const struct iommu_domain_ops) {
4825 		.attach_dev		= intel_iommu_attach_device,
4826 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4827 		.map_pages		= intel_iommu_map_pages,
4828 		.unmap_pages		= intel_iommu_unmap_pages,
4829 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4830 		.flush_iotlb_all        = intel_flush_iotlb_all,
4831 		.iotlb_sync		= intel_iommu_tlb_sync,
4832 		.iova_to_phys		= intel_iommu_iova_to_phys,
4833 		.free			= intel_iommu_domain_free,
4834 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4835 	}
4836 };
4837 
4838 static void quirk_iommu_igfx(struct pci_dev *dev)
4839 {
4840 	if (risky_device(dev))
4841 		return;
4842 
4843 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4844 	dmar_map_gfx = 0;
4845 }
4846 
4847 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4855 
4856 /* Broadwell igfx malfunctions with dmar */
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4881 
4882 static void quirk_iommu_rwbf(struct pci_dev *dev)
4883 {
4884 	if (risky_device(dev))
4885 		return;
4886 
4887 	/*
4888 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4889 	 * but needs it. Same seems to hold for the desktop versions.
4890 	 */
4891 	pci_info(dev, "Forcing write-buffer flush capability\n");
4892 	rwbf_quirk = 1;
4893 }
4894 
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4902 
4903 #define GGC 0x52
4904 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4905 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4906 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4907 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4908 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4909 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4910 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4911 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4912 
4913 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4914 {
4915 	unsigned short ggc;
4916 
4917 	if (risky_device(dev))
4918 		return;
4919 
4920 	if (pci_read_config_word(dev, GGC, &ggc))
4921 		return;
4922 
4923 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4924 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4925 		dmar_map_gfx = 0;
4926 	} else if (dmar_map_gfx) {
4927 		/* we have to ensure the gfx device is idle before we flush */
4928 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4929 		iommu_set_dma_strict();
4930 	}
4931 }
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4936 
4937 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4938 {
4939 	unsigned short ver;
4940 
4941 	if (!IS_GFX_DEVICE(dev))
4942 		return;
4943 
4944 	ver = (dev->device >> 8) & 0xff;
4945 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4946 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4947 	    ver != 0x9a && ver != 0xa7)
4948 		return;
4949 
4950 	if (risky_device(dev))
4951 		return;
4952 
4953 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4954 	iommu_skip_te_disable = 1;
4955 }
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4957 
4958 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4959    ISOCH DMAR unit for the Azalia sound device, but not give it any
4960    TLB entries, which causes it to deadlock. Check for that.  We do
4961    this in a function called from init_dmars(), instead of in a PCI
4962    quirk, because we don't want to print the obnoxious "BIOS broken"
4963    message if VT-d is actually disabled.
4964 */
4965 static void __init check_tylersburg_isoch(void)
4966 {
4967 	struct pci_dev *pdev;
4968 	uint32_t vtisochctrl;
4969 
4970 	/* If there's no Azalia in the system anyway, forget it. */
4971 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4972 	if (!pdev)
4973 		return;
4974 
4975 	if (risky_device(pdev)) {
4976 		pci_dev_put(pdev);
4977 		return;
4978 	}
4979 
4980 	pci_dev_put(pdev);
4981 
4982 	/* System Management Registers. Might be hidden, in which case
4983 	   we can't do the sanity check. But that's OK, because the
4984 	   known-broken BIOSes _don't_ actually hide it, so far. */
4985 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4986 	if (!pdev)
4987 		return;
4988 
4989 	if (risky_device(pdev)) {
4990 		pci_dev_put(pdev);
4991 		return;
4992 	}
4993 
4994 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4995 		pci_dev_put(pdev);
4996 		return;
4997 	}
4998 
4999 	pci_dev_put(pdev);
5000 
5001 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5002 	if (vtisochctrl & 1)
5003 		return;
5004 
5005 	/* Drop all bits other than the number of TLB entries */
5006 	vtisochctrl &= 0x1c;
5007 
5008 	/* If we have the recommended number of TLB entries (16), fine. */
5009 	if (vtisochctrl == 0x10)
5010 		return;
5011 
5012 	/* Zero TLB entries? You get to ride the short bus to school. */
5013 	if (!vtisochctrl) {
5014 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5015 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5016 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5017 		     dmi_get_system_info(DMI_BIOS_VERSION),
5018 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5019 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5020 		return;
5021 	}
5022 
5023 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5024 	       vtisochctrl);
5025 }
5026 
5027 /*
5028  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5029  * invalidation completion before posted writes initiated with translated address
5030  * that utilized translations matching the invalidation address range, violating
5031  * the invalidation completion ordering.
5032  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5033  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5034  * under the control of the trusted/privileged host device driver must use this
5035  * quirk.
5036  * Device TLBs are invalidated under the following six conditions:
5037  * 1. Device driver does DMA API unmap IOVA
5038  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5039  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5040  *    exit_mmap() due to crash
5041  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5042  *    VM has to free pages that were unmapped
5043  * 5. Userspace driver unmaps a DMA buffer
5044  * 6. Cache invalidation in vSVA usage (upcoming)
5045  *
5046  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5047  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5048  * invalidate TLB the same way as normal user unmap which will use this quirk.
5049  * The dTLB invalidation after PASID cache flush does not need this quirk.
5050  *
5051  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5052  */
5053 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5054 			       unsigned long address, unsigned long mask,
5055 			       u32 pasid, u16 qdep)
5056 {
5057 	u16 sid;
5058 
5059 	if (likely(!info->dtlb_extra_inval))
5060 		return;
5061 
5062 	sid = PCI_DEVID(info->bus, info->devfn);
5063 	if (pasid == IOMMU_NO_PASID) {
5064 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5065 				   qdep, address, mask);
5066 	} else {
5067 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5068 					 pasid, qdep, address, mask);
5069 	}
5070 }
5071 
5072 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5073 
5074 /*
5075  * Function to submit a command to the enhanced command interface. The
5076  * valid enhanced command descriptions are defined in Table 47 of the
5077  * VT-d spec. The VT-d hardware implementation may support some but not
5078  * all commands, which can be determined by checking the Enhanced
5079  * Command Capability Register.
5080  *
5081  * Return values:
5082  *  - 0: Command successful without any error;
5083  *  - Negative: software error value;
5084  *  - Nonzero positive: failure status code defined in Table 48.
5085  */
5086 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5087 {
5088 	unsigned long flags;
5089 	u64 res;
5090 	int ret;
5091 
5092 	if (!cap_ecmds(iommu->cap))
5093 		return -ENODEV;
5094 
5095 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5096 
5097 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5098 	if (res & DMA_ECMD_ECRSP_IP) {
5099 		ret = -EBUSY;
5100 		goto err;
5101 	}
5102 
5103 	/*
5104 	 * Unconditionally write the operand B, because
5105 	 * - There is no side effect if an ecmd doesn't require an
5106 	 *   operand B, but we set the register to some value.
5107 	 * - It's not invoked in any critical path. The extra MMIO
5108 	 *   write doesn't bring any performance concerns.
5109 	 */
5110 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5111 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5112 
5113 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5114 		      !(res & DMA_ECMD_ECRSP_IP), res);
5115 
5116 	if (res & DMA_ECMD_ECRSP_IP) {
5117 		ret = -ETIMEDOUT;
5118 		goto err;
5119 	}
5120 
5121 	ret = ecmd_get_status_code(res);
5122 err:
5123 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5124 
5125 	return ret;
5126 }
5127