xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision f80be457)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-svm.h>
21 #include <linux/memory.h>
22 #include <linux/pci.h>
23 #include <linux/pci-ats.h>
24 #include <linux/spinlock.h>
25 #include <linux/syscore_ops.h>
26 #include <linux/tboot.h>
27 
28 #include "iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
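/*
 * AGAW (adjusted guest address width) helpers. With 4KiB VT-d pages each
 * page-table level resolves LEVEL_STRIDE = 9 address bits, so an agaw of
 * N corresponds to N + 2 paging levels and a 30 + 9 * N bit input address
 * width; e.g. agaw 2 means 4-level paging and a 48-bit width, and
 * width_to_agaw(48) returns 2 again.
 */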
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
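/*
 * One MM pfn therefore corresponds to 2^(PAGE_SHIFT - VTD_PAGE_SHIFT)
 * consecutive DMA pfns; on x86 with 4KiB pages both shifts are 12 and
 * the conversion below is a plain identity.
 */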
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148 	if (!(re->lo & 1))
149 		return 0;
150 
151 	return re->lo & VTD_PAGE_MASK;
152 }
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160 	if (!(re->hi & 1))
161 		return 0;
162 
163 	return re->hi & VTD_PAGE_MASK;
164 }
165 
166 static inline void context_clear_pasid_enable(struct context_entry *context)
167 {
168 	context->lo &= ~(1ULL << 11);
169 }
170 
171 static inline bool context_pasid_enabled(struct context_entry *context)
172 {
173 	return !!(context->lo & (1ULL << 11));
174 }
175 
176 static inline void context_set_copied(struct context_entry *context)
177 {
178 	context->hi |= (1ull << 3);
179 }
180 
181 static inline bool context_copied(struct context_entry *context)
182 {
183 	return !!(context->hi & (1ULL << 3));
184 }
185 
186 static inline bool __context_present(struct context_entry *context)
187 {
188 	return (context->lo & 1);
189 }
190 
191 bool context_present(struct context_entry *context)
192 {
193 	return context_pasid_enabled(context) ?
194 	     __context_present(context) :
195 	     __context_present(context) && !context_copied(context);
196 }
197 
198 static inline void context_set_present(struct context_entry *context)
199 {
200 	context->lo |= 1;
201 }
202 
203 static inline void context_set_fault_enable(struct context_entry *context)
204 {
205 	context->lo &= (((u64)-1) << 2) | 1;
206 }
207 
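/*
 * The translation type lives in bits 3:2 of the low context-entry qword
 * (CONTEXT_TT_MULTI_LEVEL, CONTEXT_TT_DEV_IOTLB or CONTEXT_TT_PASS_THROUGH,
 * see their use in domain_context_mapping_one() below); the mask clears
 * just those two bits before the new value is installed.
 */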
208 static inline void context_set_translation_type(struct context_entry *context,
209 						unsigned long value)
210 {
211 	context->lo &= (((u64)-1) << 4) | 3;
212 	context->lo |= (value & 3) << 2;
213 }
214 
215 static inline void context_set_address_root(struct context_entry *context,
216 					    unsigned long value)
217 {
218 	context->lo &= ~VTD_PAGE_MASK;
219 	context->lo |= value & VTD_PAGE_MASK;
220 }
221 
222 static inline void context_set_address_width(struct context_entry *context,
223 					     unsigned long value)
224 {
225 	context->hi |= value & 7;
226 }
227 
228 static inline void context_set_domain_id(struct context_entry *context,
229 					 unsigned long value)
230 {
231 	context->hi |= (value & ((1 << 16) - 1)) << 8;
232 }
233 
234 static inline int context_domain_id(struct context_entry *c)
235 {
236 	return((c->hi >> 8) & 0xffff);
237 }
238 
239 static inline void context_clear_entry(struct context_entry *context)
240 {
241 	context->lo = 0;
242 	context->hi = 0;
243 }
244 
245 /*
246  * This domain is a statically identity mapping domain.
247  *	1. This domain creates a static 1:1 mapping of all usable memory.
248  *	2. It maps to each iommu if successful.
249  *	3. Each iommu maps to this domain if successful.
250  */
251 static struct dmar_domain *si_domain;
252 static int hw_pass_through = 1;
253 
254 struct dmar_rmrr_unit {
255 	struct list_head list;		/* list of rmrr units	*/
256 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
257 	u64	base_address;		/* reserved base address*/
258 	u64	end_address;		/* reserved end address */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int	devices_cnt;		/* target device count */
261 };
262 
263 struct dmar_atsr_unit {
264 	struct list_head list;		/* list of ATSR units */
265 	struct acpi_dmar_header *hdr;	/* ACPI header */
266 	struct dmar_dev_scope *devices;	/* target devices */
267 	int devices_cnt;		/* target device count */
268 	u8 include_all:1;		/* include all ports */
269 };
270 
271 struct dmar_satc_unit {
272 	struct list_head list;		/* list of SATC units */
273 	struct acpi_dmar_header *hdr;	/* ACPI header */
274 	struct dmar_dev_scope *devices;	/* target devices */
275 	struct intel_iommu *iommu;	/* the corresponding iommu */
276 	int devices_cnt;		/* target device count */
277 	u8 atc_required:1;		/* ATS is required */
278 };
279 
280 static LIST_HEAD(dmar_atsr_units);
281 static LIST_HEAD(dmar_rmrr_units);
282 static LIST_HEAD(dmar_satc_units);
283 
284 #define for_each_rmrr_units(rmrr) \
285 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
286 
287 static void dmar_remove_one_dev_info(struct device *dev);
288 
289 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
290 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
291 
292 int intel_iommu_enabled = 0;
293 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
294 
295 static int dmar_map_gfx = 1;
296 static int intel_iommu_superpage = 1;
297 static int iommu_identity_mapping;
298 static int iommu_skip_te_disable;
299 
300 #define IDENTMAP_GFX		2
301 #define IDENTMAP_AZALIA		4
302 
303 const struct iommu_ops intel_iommu_ops;
304 
305 static bool translation_pre_enabled(struct intel_iommu *iommu)
306 {
307 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
308 }
309 
310 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
311 {
312 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
313 }
314 
315 static void init_translation_status(struct intel_iommu *iommu)
316 {
317 	u32 gsts;
318 
319 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
320 	if (gsts & DMA_GSTS_TES)
321 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
322 }
323 
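/*
 * Parse the "intel_iommu=" early parameter. Options are comma separated,
 * so e.g. "intel_iommu=on,sm_on" enables the IOMMU and requests scalable
 * mode where the hardware supports it.
 */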
324 static int __init intel_iommu_setup(char *str)
325 {
326 	if (!str)
327 		return -EINVAL;
328 
329 	while (*str) {
330 		if (!strncmp(str, "on", 2)) {
331 			dmar_disabled = 0;
332 			pr_info("IOMMU enabled\n");
333 		} else if (!strncmp(str, "off", 3)) {
334 			dmar_disabled = 1;
335 			no_platform_optin = 1;
336 			pr_info("IOMMU disabled\n");
337 		} else if (!strncmp(str, "igfx_off", 8)) {
338 			dmar_map_gfx = 0;
339 			pr_info("Disable GFX device mapping\n");
340 		} else if (!strncmp(str, "forcedac", 8)) {
341 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
342 			iommu_dma_forcedac = true;
343 		} else if (!strncmp(str, "strict", 6)) {
344 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
345 			iommu_set_dma_strict();
346 		} else if (!strncmp(str, "sp_off", 6)) {
347 			pr_info("Disable supported super page\n");
348 			intel_iommu_superpage = 0;
349 		} else if (!strncmp(str, "sm_on", 5)) {
350 			pr_info("Enable scalable mode if hardware supports\n");
351 			intel_iommu_sm = 1;
352 		} else if (!strncmp(str, "sm_off", 6)) {
353 			pr_info("Scalable mode is disallowed\n");
354 			intel_iommu_sm = 0;
355 		} else if (!strncmp(str, "tboot_noforce", 13)) {
356 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
357 			intel_iommu_tboot_noforce = 1;
358 		} else {
359 			pr_notice("Unknown option - '%s'\n", str);
360 		}
361 
362 		str += strcspn(str, ",");
363 		while (*str == ',')
364 			str++;
365 	}
366 
367 	return 1;
368 }
369 __setup("intel_iommu=", intel_iommu_setup);
370 
371 void *alloc_pgtable_page(int node)
372 {
373 	struct page *page;
374 	void *vaddr = NULL;
375 
376 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
377 	if (page)
378 		vaddr = page_address(page);
379 	return vaddr;
380 }
381 
382 void free_pgtable_page(void *vaddr)
383 {
384 	free_page((unsigned long)vaddr);
385 }
386 
387 static inline int domain_type_is_si(struct dmar_domain *domain)
388 {
389 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
390 }
391 
392 static inline bool domain_use_first_level(struct dmar_domain *domain)
393 {
394 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
395 }
396 
397 static inline int domain_pfn_supported(struct dmar_domain *domain,
398 				       unsigned long pfn)
399 {
400 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
401 
402 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
403 }
404 
405 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
406 {
407 	unsigned long sagaw;
408 	int agaw;
409 
410 	sagaw = cap_sagaw(iommu->cap);
411 	for (agaw = width_to_agaw(max_gaw);
412 	     agaw >= 0; agaw--) {
413 		if (test_bit(agaw, &sagaw))
414 			break;
415 	}
416 
417 	return agaw;
418 }
419 
420 /*
421  * Calculate max SAGAW for each iommu.
422  */
423 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
424 {
425 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
426 }
427 
428 /*
429  * Calculate agaw for each iommu.
430  * "SAGAW" may be different across iommus, so use a default agaw and
431  * fall back to a smaller supported agaw for iommus that don't support the default.
432  */
433 int iommu_calculate_agaw(struct intel_iommu *iommu)
434 {
435 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
436 }
437 
438 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
439 {
440 	return sm_supported(iommu) ?
441 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
442 }
443 
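/*
 * A domain is coherent only if every IOMMU it is attached to (or, while it
 * has no attachments yet, every active IOMMU) can snoop CPU caches during
 * page-table walks; otherwise page-table updates have to be written back
 * explicitly, see domain_flush_cache().
 */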
444 static void domain_update_iommu_coherency(struct dmar_domain *domain)
445 {
446 	struct iommu_domain_info *info;
447 	struct dmar_drhd_unit *drhd;
448 	struct intel_iommu *iommu;
449 	bool found = false;
450 	unsigned long i;
451 
452 	domain->iommu_coherency = true;
453 	xa_for_each(&domain->iommu_array, i, info) {
454 		found = true;
455 		if (!iommu_paging_structure_coherency(info->iommu)) {
456 			domain->iommu_coherency = false;
457 			break;
458 		}
459 	}
460 	if (found)
461 		return;
462 
463 	/* No hardware attached; use lowest common denominator */
464 	rcu_read_lock();
465 	for_each_active_iommu(iommu, drhd) {
466 		if (!iommu_paging_structure_coherency(iommu)) {
467 			domain->iommu_coherency = false;
468 			break;
469 		}
470 	}
471 	rcu_read_unlock();
472 }
473 
474 static int domain_update_iommu_superpage(struct dmar_domain *domain,
475 					 struct intel_iommu *skip)
476 {
477 	struct dmar_drhd_unit *drhd;
478 	struct intel_iommu *iommu;
479 	int mask = 0x3;
480 
481 	if (!intel_iommu_superpage)
482 		return 0;
483 
484 	/* set iommu_superpage to the smallest common denominator */
485 	rcu_read_lock();
486 	for_each_active_iommu(iommu, drhd) {
487 		if (iommu != skip) {
488 			if (domain && domain_use_first_level(domain)) {
489 				if (!cap_fl1gp_support(iommu->cap))
490 					mask = 0x1;
491 			} else {
492 				mask &= cap_super_page_val(iommu->cap);
493 			}
494 
495 			if (!mask)
496 				break;
497 		}
498 	}
499 	rcu_read_unlock();
500 
501 	return fls(mask);
502 }
503 
504 static int domain_update_device_node(struct dmar_domain *domain)
505 {
506 	struct device_domain_info *info;
507 	int nid = NUMA_NO_NODE;
508 
509 	spin_lock(&domain->lock);
510 	list_for_each_entry(info, &domain->devices, link) {
511 		/*
512 		 * There could possibly be multiple device numa nodes as devices
513 		 * within the same domain may sit behind different IOMMUs. There
514 		 * isn't a perfect answer in such a situation, so we select a
515 		 * first-come, first-served policy.
516 		 */
517 		nid = dev_to_node(info->dev);
518 		if (nid != NUMA_NO_NODE)
519 			break;
520 	}
521 	spin_unlock(&domain->lock);
522 
523 	return nid;
524 }
525 
526 static void domain_update_iotlb(struct dmar_domain *domain);
527 
528 /* Return the super pagesize bitmap if supported. */
529 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
530 {
531 	unsigned long bitmap = 0;
532 
533 	/*
534 	 * 1-level super page supports page size of 2MiB, 2-level super page
535 	 * supports page size of both 2MiB and 1GiB.
536 	 */
537 	if (domain->iommu_superpage == 1)
538 		bitmap |= SZ_2M;
539 	else if (domain->iommu_superpage == 2)
540 		bitmap |= SZ_2M | SZ_1G;
541 
542 	return bitmap;
543 }
544 
545 /* Some capabilities may be different across iommus */
546 static void domain_update_iommu_cap(struct dmar_domain *domain)
547 {
548 	domain_update_iommu_coherency(domain);
549 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
550 
551 	/*
552 	 * If RHSA is missing, we should default to the device numa domain
553 	 * as fall back.
554 	 */
555 	if (domain->nid == NUMA_NO_NODE)
556 		domain->nid = domain_update_device_node(domain);
557 
558 	/*
559 	 * First-level translation restricts the input-address to a
560 	 * canonical address (i.e., address bits 63:N have the same
561 	 * value as address bit [N-1], where N is 48-bits with 4-level
562 	 * paging and 57-bits with 5-level paging). Hence, skip bit
563 	 * [N-1].
564 	 */
565 	if (domain_use_first_level(domain))
566 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
567 	else
568 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
569 
570 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
571 	domain_update_iotlb(domain);
572 }
573 
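/*
 * Return the context entry for source-id (bus, devfn), optionally
 * allocating the context table. In scalable mode a context entry is 256
 * bits wide, so each half of the root entry covers only 128 devfns:
 * devfn 0x80 and above go through the upper context-table pointer and
 * the table index is doubled, as done below.
 */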
574 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
575 					 u8 devfn, int alloc)
576 {
577 	struct root_entry *root = &iommu->root_entry[bus];
578 	struct context_entry *context;
579 	u64 *entry;
580 
581 	entry = &root->lo;
582 	if (sm_supported(iommu)) {
583 		if (devfn >= 0x80) {
584 			devfn -= 0x80;
585 			entry = &root->hi;
586 		}
587 		devfn *= 2;
588 	}
589 	if (*entry & 1)
590 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
591 	else {
592 		unsigned long phy_addr;
593 		if (!alloc)
594 			return NULL;
595 
596 		context = alloc_pgtable_page(iommu->node);
597 		if (!context)
598 			return NULL;
599 
600 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
601 		phy_addr = virt_to_phys((void *)context);
602 		*entry = phy_addr | 1;
603 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
604 	}
605 	return &context[devfn];
606 }
607 
608 /**
609  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
610  *				 sub-hierarchy of a candidate PCI-PCI bridge
611  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
612  * @bridge: the candidate PCI-PCI bridge
613  *
614  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
615  */
616 static bool
617 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
618 {
619 	struct pci_dev *pdev, *pbridge;
620 
621 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
622 		return false;
623 
624 	pdev = to_pci_dev(dev);
625 	pbridge = to_pci_dev(bridge);
626 
627 	if (pbridge->subordinate &&
628 	    pbridge->subordinate->number <= pdev->bus->number &&
629 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
630 		return true;
631 
632 	return false;
633 }
634 
635 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
636 {
637 	struct dmar_drhd_unit *drhd;
638 	u32 vtbar;
639 	int rc;
640 
641 	/* We know that this device on this chipset has its own IOMMU.
642 	 * If we find it under a different IOMMU, then the BIOS is lying
643 	 * to us. Hope that the IOMMU for this device is actually
644 	 * disabled, and it needs no translation...
645 	 */
646 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
647 	if (rc) {
648 		/* "can't" happen */
649 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
650 		return false;
651 	}
652 	vtbar &= 0xffff0000;
653 
654 	/* we know that this iommu should be at offset 0xa000 from vtbar */
655 	drhd = dmar_find_matched_drhd_unit(pdev);
656 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
657 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
658 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
659 		return true;
660 	}
661 
662 	return false;
663 }
664 
665 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
666 {
667 	if (!iommu || iommu->drhd->ignored)
668 		return true;
669 
670 	if (dev_is_pci(dev)) {
671 		struct pci_dev *pdev = to_pci_dev(dev);
672 
673 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
674 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
675 		    quirk_ioat_snb_local_iommu(pdev))
676 			return true;
677 	}
678 
679 	return false;
680 }
681 
682 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
683 {
684 	struct dmar_drhd_unit *drhd = NULL;
685 	struct pci_dev *pdev = NULL;
686 	struct intel_iommu *iommu;
687 	struct device *tmp;
688 	u16 segment = 0;
689 	int i;
690 
691 	if (!dev)
692 		return NULL;
693 
694 	if (dev_is_pci(dev)) {
695 		struct pci_dev *pf_pdev;
696 
697 		pdev = pci_real_dma_dev(to_pci_dev(dev));
698 
699 		/* VFs aren't listed in scope tables; we need to look up
700 		 * the PF instead to find the IOMMU. */
701 		pf_pdev = pci_physfn(pdev);
702 		dev = &pf_pdev->dev;
703 		segment = pci_domain_nr(pdev->bus);
704 	} else if (has_acpi_companion(dev))
705 		dev = &ACPI_COMPANION(dev)->dev;
706 
707 	rcu_read_lock();
708 	for_each_iommu(iommu, drhd) {
709 		if (pdev && segment != drhd->segment)
710 			continue;
711 
712 		for_each_active_dev_scope(drhd->devices,
713 					  drhd->devices_cnt, i, tmp) {
714 			if (tmp == dev) {
715 				/* For a VF use its original BDF# not that of the PF
716 				 * which we used for the IOMMU lookup. Strictly speaking
717 				 * we could do this for all PCI devices; we only need to
718 				 * get the BDF# from the scope table for ACPI matches. */
719 				if (pdev && pdev->is_virtfn)
720 					goto got_pdev;
721 
722 				if (bus && devfn) {
723 					*bus = drhd->devices[i].bus;
724 					*devfn = drhd->devices[i].devfn;
725 				}
726 				goto out;
727 			}
728 
729 			if (is_downstream_to_pci_bridge(dev, tmp))
730 				goto got_pdev;
731 		}
732 
733 		if (pdev && drhd->include_all) {
734 got_pdev:
735 			if (bus && devfn) {
736 				*bus = pdev->bus->number;
737 				*devfn = pdev->devfn;
738 			}
739 			goto out;
740 		}
741 	}
742 	iommu = NULL;
743 out:
744 	if (iommu_is_dummy(iommu, dev))
745 		iommu = NULL;
746 
747 	rcu_read_unlock();
748 
749 	return iommu;
750 }
751 
752 static void domain_flush_cache(struct dmar_domain *domain,
753 			       void *addr, int size)
754 {
755 	if (!domain->iommu_coherency)
756 		clflush_cache_range(addr, size);
757 }
758 
759 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
760 {
761 	struct context_entry *context;
762 	int ret = 0;
763 
764 	spin_lock(&iommu->lock);
765 	context = iommu_context_addr(iommu, bus, devfn, 0);
766 	if (context)
767 		ret = context_present(context);
768 	spin_unlock(&iommu->lock);
769 	return ret;
770 }
771 
772 static void free_context_table(struct intel_iommu *iommu)
773 {
774 	struct context_entry *context;
775 	int i;
776 
777 	if (!iommu->root_entry)
778 		return;
779 
780 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
781 		context = iommu_context_addr(iommu, i, 0, 0);
782 		if (context)
783 			free_pgtable_page(context);
784 
785 		if (!sm_supported(iommu))
786 			continue;
787 
788 		context = iommu_context_addr(iommu, i, 0x80, 0);
789 		if (context)
790 			free_pgtable_page(context);
791 	}
792 
793 	free_pgtable_page(iommu->root_entry);
794 	iommu->root_entry = NULL;
795 }
796 
797 #ifdef CONFIG_DMAR_DEBUG
798 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
799 {
800 	struct device_domain_info *info;
801 	struct dma_pte *parent, *pte;
802 	struct dmar_domain *domain;
803 	struct pci_dev *pdev;
804 	int offset, level;
805 
806 	pdev = pci_get_domain_bus_and_slot(iommu->segment, bus, devfn);
807 	if (!pdev)
808 		return;
809 
810 	info = dev_iommu_priv_get(&pdev->dev);
811 	if (!info || !info->domain) {
812 		pr_info("device [%02x:%02x.%d] not probed\n",
813 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
814 		return;
815 	}
816 
817 	domain = info->domain;
818 	level = agaw_to_level(domain->agaw);
819 	parent = domain->pgd;
820 	if (!parent) {
821 		pr_info("no page table setup\n");
822 		return;
823 	}
824 
825 	while (1) {
826 		offset = pfn_level_offset(pfn, level);
827 		pte = &parent[offset];
828 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
829 			pr_info("PTE not present at level %d\n", level);
830 			break;
831 		}
832 
833 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
834 
835 		if (level == 1)
836 			break;
837 
838 		parent = phys_to_virt(dma_pte_addr(pte));
839 		level--;
840 	}
841 }
842 
843 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
844 			  unsigned long long addr, u32 pasid)
845 {
846 	struct pasid_dir_entry *dir, *pde;
847 	struct pasid_entry *entries, *pte;
848 	struct context_entry *ctx_entry;
849 	struct root_entry *rt_entry;
850 	u8 devfn = source_id & 0xff;
851 	u8 bus = source_id >> 8;
852 	int i, dir_index, index;
853 
854 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
855 
856 	/* root entry dump */
857 	rt_entry = &iommu->root_entry[bus];
858 	if (!rt_entry) {
859 		pr_info("root table entry is not present\n");
860 		return;
861 	}
862 
863 	if (sm_supported(iommu))
864 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
865 			rt_entry->hi, rt_entry->lo);
866 	else
867 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
868 
869 	/* context entry dump */
870 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
871 	if (!ctx_entry) {
872 		pr_info("context table entry is not present\n");
873 		return;
874 	}
875 
876 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
877 		ctx_entry->hi, ctx_entry->lo);
878 
879 	/* legacy mode does not require PASID entries */
880 	if (!sm_supported(iommu))
881 		goto pgtable_walk;
882 
883 	/* get the pointer to pasid directory entry */
884 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
885 	if (!dir) {
886 		pr_info("pasid directory entry is not present\n");
887 		return;
888 	}
889 	/* For request-without-pasid, get the pasid from context entry */
890 	if (intel_iommu_sm && pasid == INVALID_IOASID)
891 		pasid = PASID_RID2PASID;
892 
893 	dir_index = pasid >> PASID_PDE_SHIFT;
894 	pde = &dir[dir_index];
895 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
896 
897 	/* get the pointer to the pasid table entry */
898 	entries = get_pasid_table_from_pde(pde);
899 	if (!entries) {
900 		pr_info("pasid table entry is not present\n");
901 		return;
902 	}
903 	index = pasid & PASID_PTE_MASK;
904 	pte = &entries[index];
905 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
906 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
907 
908 pgtable_walk:
909 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
910 }
911 #endif
912 
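/*
 * Walk (and, where needed, populate) the page table down to the PTE
 * covering @pfn. A *target_level of 0 means "stop at whatever leaf
 * already exists"; a non-zero value asks for the PTE at exactly that
 * level, allocating intermediate tables on the way, which is how
 * superpage mappings are installed.
 */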
913 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
914 				      unsigned long pfn, int *target_level)
915 {
916 	struct dma_pte *parent, *pte;
917 	int level = agaw_to_level(domain->agaw);
918 	int offset;
919 
920 	BUG_ON(!domain->pgd);
921 
922 	if (!domain_pfn_supported(domain, pfn))
923 		/* Address beyond IOMMU's addressing capabilities. */
924 		return NULL;
925 
926 	parent = domain->pgd;
927 
928 	while (1) {
929 		void *tmp_page;
930 
931 		offset = pfn_level_offset(pfn, level);
932 		pte = &parent[offset];
933 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
934 			break;
935 		if (level == *target_level)
936 			break;
937 
938 		if (!dma_pte_present(pte)) {
939 			uint64_t pteval;
940 
941 			tmp_page = alloc_pgtable_page(domain->nid);
942 
943 			if (!tmp_page)
944 				return NULL;
945 
946 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
947 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
948 			if (domain_use_first_level(domain)) {
949 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
950 				if (iommu_is_dma_domain(&domain->domain))
951 					pteval |= DMA_FL_PTE_ACCESS;
952 			}
953 			if (cmpxchg64(&pte->val, 0ULL, pteval))
954 				/* Someone else set it while we were thinking; use theirs. */
955 				free_pgtable_page(tmp_page);
956 			else
957 				domain_flush_cache(domain, pte, sizeof(*pte));
958 		}
959 		if (level == 1)
960 			break;
961 
962 		parent = phys_to_virt(dma_pte_addr(pte));
963 		level--;
964 	}
965 
966 	if (!*target_level)
967 		*target_level = level;
968 
969 	return pte;
970 }
971 
972 /* return address's pte at specific level */
973 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
974 					 unsigned long pfn,
975 					 int level, int *large_page)
976 {
977 	struct dma_pte *parent, *pte;
978 	int total = agaw_to_level(domain->agaw);
979 	int offset;
980 
981 	parent = domain->pgd;
982 	while (level <= total) {
983 		offset = pfn_level_offset(pfn, total);
984 		pte = &parent[offset];
985 		if (level == total)
986 			return pte;
987 
988 		if (!dma_pte_present(pte)) {
989 			*large_page = total;
990 			break;
991 		}
992 
993 		if (dma_pte_superpage(pte)) {
994 			*large_page = total;
995 			return pte;
996 		}
997 
998 		parent = phys_to_virt(dma_pte_addr(pte));
999 		total--;
1000 	}
1001 	return NULL;
1002 }
1003 
1004 /* clear last level pte, a tlb flush should be followed */
1005 static void dma_pte_clear_range(struct dmar_domain *domain,
1006 				unsigned long start_pfn,
1007 				unsigned long last_pfn)
1008 {
1009 	unsigned int large_page;
1010 	struct dma_pte *first_pte, *pte;
1011 
1012 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1013 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1014 	BUG_ON(start_pfn > last_pfn);
1015 
1016 	/* we don't need lock here; nobody else touches the iova range */
1017 	do {
1018 		large_page = 1;
1019 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1020 		if (!pte) {
1021 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1022 			continue;
1023 		}
1024 		do {
1025 			dma_clear_pte(pte);
1026 			start_pfn += lvl_to_nr_pages(large_page);
1027 			pte++;
1028 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1029 
1030 		domain_flush_cache(domain, first_pte,
1031 				   (void *)pte - (void *)first_pte);
1032 
1033 	} while (start_pfn && start_pfn <= last_pfn);
1034 }
1035 
1036 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1037 			       int retain_level, struct dma_pte *pte,
1038 			       unsigned long pfn, unsigned long start_pfn,
1039 			       unsigned long last_pfn)
1040 {
1041 	pfn = max(start_pfn, pfn);
1042 	pte = &pte[pfn_level_offset(pfn, level)];
1043 
1044 	do {
1045 		unsigned long level_pfn;
1046 		struct dma_pte *level_pte;
1047 
1048 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1049 			goto next;
1050 
1051 		level_pfn = pfn & level_mask(level);
1052 		level_pte = phys_to_virt(dma_pte_addr(pte));
1053 
1054 		if (level > 2) {
1055 			dma_pte_free_level(domain, level - 1, retain_level,
1056 					   level_pte, level_pfn, start_pfn,
1057 					   last_pfn);
1058 		}
1059 
1060 		/*
1061 		 * Free the page table if we're below the level we want to
1062 		 * retain and the range covers the entire table.
1063 		 */
1064 		if (level < retain_level && !(start_pfn > level_pfn ||
1065 		      last_pfn < level_pfn + level_size(level) - 1)) {
1066 			dma_clear_pte(pte);
1067 			domain_flush_cache(domain, pte, sizeof(*pte));
1068 			free_pgtable_page(level_pte);
1069 		}
1070 next:
1071 		pfn += level_size(level);
1072 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1073 }
1074 
1075 /*
1076  * clear last level (leaf) ptes and free page table pages below the
1077  * level we wish to keep intact.
1078  */
1079 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1080 				   unsigned long start_pfn,
1081 				   unsigned long last_pfn,
1082 				   int retain_level)
1083 {
1084 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1085 
1086 	/* We don't need lock here; nobody else touches the iova range */
1087 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1088 			   domain->pgd, 0, start_pfn, last_pfn);
1089 
1090 	/* free pgd */
1091 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1092 		free_pgtable_page(domain->pgd);
1093 		domain->pgd = NULL;
1094 	}
1095 }
1096 
1097 /* When a page at a given level is being unlinked from its parent, we don't
1098    need to *modify* it at all. All we need to do is make a list of all the
1099    pages which can be freed just as soon as we've flushed the IOTLB and we
1100    know the hardware page-walk will no longer touch them.
1101    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1102    be freed. */
1103 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1104 				    int level, struct dma_pte *pte,
1105 				    struct list_head *freelist)
1106 {
1107 	struct page *pg;
1108 
1109 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1110 	list_add_tail(&pg->lru, freelist);
1111 
1112 	if (level == 1)
1113 		return;
1114 
1115 	pte = page_address(pg);
1116 	do {
1117 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1118 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1119 		pte++;
1120 	} while (!first_pte_in_page(pte));
1121 }
1122 
1123 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1124 				struct dma_pte *pte, unsigned long pfn,
1125 				unsigned long start_pfn, unsigned long last_pfn,
1126 				struct list_head *freelist)
1127 {
1128 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1129 
1130 	pfn = max(start_pfn, pfn);
1131 	pte = &pte[pfn_level_offset(pfn, level)];
1132 
1133 	do {
1134 		unsigned long level_pfn = pfn & level_mask(level);
1135 
1136 		if (!dma_pte_present(pte))
1137 			goto next;
1138 
1139 		/* If range covers entire pagetable, free it */
1140 		if (start_pfn <= level_pfn &&
1141 		    last_pfn >= level_pfn + level_size(level) - 1) {
1142 			/* These subordinate page tables are going away entirely. Don't
1143 			   bother to clear them; we're just going to *free* them. */
1144 			if (level > 1 && !dma_pte_superpage(pte))
1145 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1146 
1147 			dma_clear_pte(pte);
1148 			if (!first_pte)
1149 				first_pte = pte;
1150 			last_pte = pte;
1151 		} else if (level > 1) {
1152 			/* Recurse down into a level that isn't *entirely* obsolete */
1153 			dma_pte_clear_level(domain, level - 1,
1154 					    phys_to_virt(dma_pte_addr(pte)),
1155 					    level_pfn, start_pfn, last_pfn,
1156 					    freelist);
1157 		}
1158 next:
1159 		pfn = level_pfn + level_size(level);
1160 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161 
1162 	if (first_pte)
1163 		domain_flush_cache(domain, first_pte,
1164 				   (void *)++last_pte - (void *)first_pte);
1165 }
1166 
1167 /* We can't just free the pages because the IOMMU may still be walking
1168    the page tables, and may have cached the intermediate levels. The
1169    pages can only be freed after the IOTLB flush has been done. */
1170 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1171 			 unsigned long last_pfn, struct list_head *freelist)
1172 {
1173 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1174 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1175 	BUG_ON(start_pfn > last_pfn);
1176 
1177 	/* we don't need lock here; nobody else touches the iova range */
1178 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1179 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1180 
1181 	/* free pgd */
1182 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1183 		struct page *pgd_page = virt_to_page(domain->pgd);
1184 		list_add_tail(&pgd_page->lru, freelist);
1185 		domain->pgd = NULL;
1186 	}
1187 }
1188 
1189 /* iommu handling */
1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191 {
1192 	struct root_entry *root;
1193 
1194 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1195 	if (!root) {
1196 		pr_err("Allocating root entry for %s failed\n",
1197 			iommu->name);
1198 		return -ENOMEM;
1199 	}
1200 
1201 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1202 	iommu->root_entry = root;
1203 
1204 	return 0;
1205 }
1206 
1207 static void iommu_set_root_entry(struct intel_iommu *iommu)
1208 {
1209 	u64 addr;
1210 	u32 sts;
1211 	unsigned long flag;
1212 
1213 	addr = virt_to_phys(iommu->root_entry);
1214 	if (sm_supported(iommu))
1215 		addr |= DMA_RTADDR_SMT;
1216 
1217 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1218 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1219 
1220 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1221 
1222 	/* Make sure hardware complete it */
1223 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1224 		      readl, (sts & DMA_GSTS_RTPS), sts);
1225 
1226 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 
1228 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1229 	if (sm_supported(iommu))
1230 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1231 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1232 }
1233 
1234 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1235 {
1236 	u32 val;
1237 	unsigned long flag;
1238 
1239 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1240 		return;
1241 
1242 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1243 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1244 
1245 	/* Make sure hardware complete it */
1246 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1247 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1248 
1249 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1250 }
1251 
1252 /* return value determines if we need a write buffer flush */
1253 static void __iommu_flush_context(struct intel_iommu *iommu,
1254 				  u16 did, u16 source_id, u8 function_mask,
1255 				  u64 type)
1256 {
1257 	u64 val = 0;
1258 	unsigned long flag;
1259 
1260 	switch (type) {
1261 	case DMA_CCMD_GLOBAL_INVL:
1262 		val = DMA_CCMD_GLOBAL_INVL;
1263 		break;
1264 	case DMA_CCMD_DOMAIN_INVL:
1265 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1266 		break;
1267 	case DMA_CCMD_DEVICE_INVL:
1268 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1269 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1270 		break;
1271 	default:
1272 		BUG();
1273 	}
1274 	val |= DMA_CCMD_ICC;
1275 
1276 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1277 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1278 
1279 	/* Make sure hardware complete it */
1280 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1281 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1282 
1283 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1284 }
1285 
1286 /* return value determines if we need a write buffer flush */
1287 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1288 				u64 addr, unsigned int size_order, u64 type)
1289 {
1290 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1291 	u64 val = 0, val_iva = 0;
1292 	unsigned long flag;
1293 
1294 	switch (type) {
1295 	case DMA_TLB_GLOBAL_FLUSH:
1296 		/* global flush doesn't need to set IVA_REG */
1297 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1298 		break;
1299 	case DMA_TLB_DSI_FLUSH:
1300 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1301 		break;
1302 	case DMA_TLB_PSI_FLUSH:
1303 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1304 		/* IH bit is passed in as part of address */
1305 		val_iva = size_order | addr;
1306 		break;
1307 	default:
1308 		BUG();
1309 	}
1310 	/* Note: set drain read/write */
1311 #if 0
1312 	/*
1313 	 * This is probably only there to be extra safe. Looks like we can
1314 	 * ignore it without any impact.
1315 	 */
1316 	if (cap_read_drain(iommu->cap))
1317 		val |= DMA_TLB_READ_DRAIN;
1318 #endif
1319 	if (cap_write_drain(iommu->cap))
1320 		val |= DMA_TLB_WRITE_DRAIN;
1321 
1322 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1323 	/* Note: Only uses first TLB reg currently */
1324 	if (val_iva)
1325 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1326 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1327 
1328 	/* Make sure hardware complete it */
1329 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1330 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1331 
1332 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1333 
1334 	/* check IOTLB invalidation granularity */
1335 	if (DMA_TLB_IAIG(val) == 0)
1336 		pr_err("Flush IOTLB failed\n");
1337 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1338 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1339 			(unsigned long long)DMA_TLB_IIRG(type),
1340 			(unsigned long long)DMA_TLB_IAIG(val));
1341 }
1342 
1343 static struct device_domain_info *
1344 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1345 			u8 bus, u8 devfn)
1346 {
1347 	struct device_domain_info *info;
1348 
1349 	if (!iommu->qi)
1350 		return NULL;
1351 
1352 	spin_lock(&domain->lock);
1353 	list_for_each_entry(info, &domain->devices, link) {
1354 		if (info->iommu == iommu && info->bus == bus &&
1355 		    info->devfn == devfn) {
1356 			spin_unlock(&domain->lock);
1357 			return info->ats_supported ? info : NULL;
1358 		}
1359 	}
1360 	spin_unlock(&domain->lock);
1361 
1362 	return NULL;
1363 }
1364 
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367 	struct device_domain_info *info;
1368 	bool has_iotlb_device = false;
1369 
1370 	spin_lock(&domain->lock);
1371 	list_for_each_entry(info, &domain->devices, link) {
1372 		if (info->ats_enabled) {
1373 			has_iotlb_device = true;
1374 			break;
1375 		}
1376 	}
1377 	domain->has_iotlb_device = has_iotlb_device;
1378 	spin_unlock(&domain->lock);
1379 }
1380 
1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1382 {
1383 	struct pci_dev *pdev;
1384 
1385 	if (!info || !dev_is_pci(info->dev))
1386 		return;
1387 
1388 	pdev = to_pci_dev(info->dev);
1389 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1390 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1391 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1392 	 * reserved, which should be set to 0.
1393 	 */
1394 	if (!ecap_dit(info->iommu->ecap))
1395 		info->pfsid = 0;
1396 	else {
1397 		struct pci_dev *pf_pdev;
1398 
1399 		/* pdev will be returned if device is not a VF */
1400 		pf_pdev = pci_physfn(pdev);
1401 		info->pfsid = pci_dev_id(pf_pdev);
1402 	}
1403 
1404 #ifdef CONFIG_INTEL_IOMMU_SVM
1405 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1406 	   the device if you enable PASID support after ATS support is
1407 	   undefined. So always enable PASID support on devices which
1408 	   have it, even if we can't yet know if we're ever going to
1409 	   use it. */
1410 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1411 		info->pasid_enabled = 1;
1412 
1413 	if (info->pri_supported &&
1414 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1415 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1416 		info->pri_enabled = 1;
1417 #endif
1418 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1419 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1420 		info->ats_enabled = 1;
1421 		domain_update_iotlb(info->domain);
1422 		info->ats_qdep = pci_ats_queue_depth(pdev);
1423 	}
1424 }
1425 
1426 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1427 {
1428 	struct pci_dev *pdev;
1429 
1430 	if (!dev_is_pci(info->dev))
1431 		return;
1432 
1433 	pdev = to_pci_dev(info->dev);
1434 
1435 	if (info->ats_enabled) {
1436 		pci_disable_ats(pdev);
1437 		info->ats_enabled = 0;
1438 		domain_update_iotlb(info->domain);
1439 	}
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441 	if (info->pri_enabled) {
1442 		pci_disable_pri(pdev);
1443 		info->pri_enabled = 0;
1444 	}
1445 	if (info->pasid_enabled) {
1446 		pci_disable_pasid(pdev);
1447 		info->pasid_enabled = 0;
1448 	}
1449 #endif
1450 }
1451 
1452 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1453 				    u64 addr, unsigned int mask)
1454 {
1455 	u16 sid, qdep;
1456 
1457 	if (!info || !info->ats_enabled)
1458 		return;
1459 
1460 	sid = info->bus << 8 | info->devfn;
1461 	qdep = info->ats_qdep;
1462 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1463 			   qdep, addr, mask);
1464 }
1465 
1466 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1467 				  u64 addr, unsigned mask)
1468 {
1469 	struct device_domain_info *info;
1470 
1471 	if (!domain->has_iotlb_device)
1472 		return;
1473 
1474 	spin_lock(&domain->lock);
1475 	list_for_each_entry(info, &domain->devices, link)
1476 		__iommu_flush_dev_iotlb(info, addr, mask);
1477 	spin_unlock(&domain->lock);
1478 }
1479 
1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1481 				  struct dmar_domain *domain,
1482 				  unsigned long pfn, unsigned int pages,
1483 				  int ih, int map)
1484 {
1485 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1486 	unsigned int mask = ilog2(aligned_pages);
1487 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1488 	u16 did = domain_id_iommu(domain, iommu);
1489 
1490 	BUG_ON(pages == 0);
1491 
1492 	if (ih)
1493 		ih = 1 << 6;
1494 
1495 	if (domain_use_first_level(domain)) {
1496 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1497 	} else {
1498 		unsigned long bitmask = aligned_pages - 1;
1499 
1500 		/*
1501 		 * PSI masks the low order bits of the base address. If the
1502 		 * address isn't aligned to the mask, then compute a mask value
1503 		 * needed to ensure the target range is flushed.
1504 		 */
1505 		if (unlikely(bitmask & pfn)) {
1506 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1507 
1508 			/*
1509 			 * Since end_pfn <= pfn + bitmask, the only way bits
1510 			 * higher than bitmask can differ in pfn and end_pfn is
1511 			 * by carrying. This means after masking out bitmask,
1512 			 * high bits starting with the first set bit in
1513 			 * shared_bits are all equal in both pfn and end_pfn.
1514 			 */
1515 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1516 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1517 		}
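		/*
		 * For example, pfn = 3 and pages = 2 give bitmask = 1 and
		 * end_pfn = 4; the lowest bit that is equal in pfn and
		 * end_pfn above the bitmask is bit 3, so mask becomes 3 and
		 * the flush covers pfns 0-7, which contains the target range.
		 */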
1518 
1519 		/*
1520 		 * Fallback to domain selective flush if no PSI support or
1521 		 * the size is too big.
1522 		 */
1523 		if (!cap_pgsel_inv(iommu->cap) ||
1524 		    mask > cap_max_amask_val(iommu->cap))
1525 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1526 							DMA_TLB_DSI_FLUSH);
1527 		else
1528 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1529 							DMA_TLB_PSI_FLUSH);
1530 	}
1531 
1532 	/*
1533 	 * In caching mode, changes of pages from non-present to present require
1534 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1535 	 */
1536 	if (!cap_caching_mode(iommu->cap) || !map)
1537 		iommu_flush_dev_iotlb(domain, addr, mask);
1538 }
1539 
1540 /* Notification for newly created mappings */
1541 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1542 					struct dmar_domain *domain,
1543 					unsigned long pfn, unsigned int pages)
1544 {
1545 	/*
1546 	 * It's a non-present to present mapping. Only flush if caching mode
1547 	 * and second level.
1548 	 */
1549 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1550 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1551 	else
1552 		iommu_flush_write_buffer(iommu);
1553 }
1554 
1555 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1556 {
1557 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1558 	struct iommu_domain_info *info;
1559 	unsigned long idx;
1560 
1561 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1562 		struct intel_iommu *iommu = info->iommu;
1563 		u16 did = domain_id_iommu(dmar_domain, iommu);
1564 
1565 		if (domain_use_first_level(dmar_domain))
1566 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1567 		else
1568 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1569 						 DMA_TLB_DSI_FLUSH);
1570 
1571 		if (!cap_caching_mode(iommu->cap))
1572 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1573 	}
1574 }
1575 
1576 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1577 {
1578 	u32 pmen;
1579 	unsigned long flags;
1580 
1581 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1582 		return;
1583 
1584 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1585 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1586 	pmen &= ~DMA_PMEN_EPM;
1587 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1588 
1589 	/* wait for the protected region status bit to clear */
1590 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1591 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1592 
1593 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1594 }
1595 
1596 static void iommu_enable_translation(struct intel_iommu *iommu)
1597 {
1598 	u32 sts;
1599 	unsigned long flags;
1600 
1601 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1602 	iommu->gcmd |= DMA_GCMD_TE;
1603 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1604 
1605 	/* Make sure hardware complete it */
1606 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1607 		      readl, (sts & DMA_GSTS_TES), sts);
1608 
1609 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1610 }
1611 
1612 static void iommu_disable_translation(struct intel_iommu *iommu)
1613 {
1614 	u32 sts;
1615 	unsigned long flag;
1616 
1617 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1618 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1619 		return;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1622 	iommu->gcmd &= ~DMA_GCMD_TE;
1623 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624 
1625 	/* Make sure hardware complete it */
1626 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1628 
1629 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1630 }
1631 
1632 static int iommu_init_domains(struct intel_iommu *iommu)
1633 {
1634 	u32 ndomains;
1635 
1636 	ndomains = cap_ndoms(iommu->cap);
1637 	pr_debug("%s: Number of Domains supported <%d>\n",
1638 		 iommu->name, ndomains);
1639 
1640 	spin_lock_init(&iommu->lock);
1641 
1642 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1643 	if (!iommu->domain_ids)
1644 		return -ENOMEM;
1645 
1646 	/*
1647 	 * If Caching mode is set, then invalid translations are tagged
1648 	 * with domain-id 0, hence we need to pre-allocate it. We also
1649 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1650 	 * make sure it is not used for a real domain.
1651 	 */
1652 	set_bit(0, iommu->domain_ids);
1653 
1654 	/*
1655 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1656 	 * entry for first-level or pass-through translation modes should
1657 	 * be programmed with a domain id different from those used for
1658 	 * second-level or nested translation. We reserve a domain id for
1659 	 * this purpose.
1660 	 */
1661 	if (sm_supported(iommu))
1662 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1663 
1664 	return 0;
1665 }
1666 
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1668 {
1669 	if (!iommu->domain_ids)
1670 		return;
1671 
1672 	/*
1673 	 * All iommu domains must have been detached from the devices,
1674 	 * hence there should be no domain IDs in use.
1675 	 */
1676 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1677 		    > NUM_RESERVED_DID))
1678 		return;
1679 
1680 	if (iommu->gcmd & DMA_GCMD_TE)
1681 		iommu_disable_translation(iommu);
1682 }
1683 
1684 static void free_dmar_iommu(struct intel_iommu *iommu)
1685 {
1686 	if (iommu->domain_ids) {
1687 		bitmap_free(iommu->domain_ids);
1688 		iommu->domain_ids = NULL;
1689 	}
1690 
1691 	/* free context mapping */
1692 	free_context_table(iommu);
1693 
1694 #ifdef CONFIG_INTEL_IOMMU_SVM
1695 	if (pasid_supported(iommu)) {
1696 		if (ecap_prs(iommu->ecap))
1697 			intel_svm_finish_prq(iommu);
1698 	}
1699 	if (vccap_pasid(iommu->vccap))
1700 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1701 
1702 #endif
1703 }
1704 
1705 /*
1706  * Check and return whether first level is used by default for
1707  * DMA translation.
1708  */
1709 static bool first_level_by_default(unsigned int type)
1710 {
1711 	/* Only SL is available in legacy mode */
1712 	if (!scalable_mode_support())
1713 		return false;
1714 
1715 	/* Only one level (either FL or SL) is available, just use it */
1716 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1717 		return intel_cap_flts_sanity();
1718 
1719 	/* Both levels are available, decide it based on domain type */
1720 	return type != IOMMU_DOMAIN_UNMANAGED;
1721 }
1722 
1723 static struct dmar_domain *alloc_domain(unsigned int type)
1724 {
1725 	struct dmar_domain *domain;
1726 
1727 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1728 	if (!domain)
1729 		return NULL;
1730 
1731 	domain->nid = NUMA_NO_NODE;
1732 	if (first_level_by_default(type))
1733 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1734 	domain->has_iotlb_device = false;
1735 	INIT_LIST_HEAD(&domain->devices);
1736 	spin_lock_init(&domain->lock);
1737 	xa_init(&domain->iommu_array);
1738 
1739 	return domain;
1740 }
1741 
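/*
 * Attach @domain to @iommu. Per-(domain, iommu) state lives in
 * domain->iommu_array, keyed by the iommu's sequence id: the first
 * attachment allocates a domain id on that iommu, later attachments
 * only take an additional reference.
 */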
1742 static int domain_attach_iommu(struct dmar_domain *domain,
1743 			       struct intel_iommu *iommu)
1744 {
1745 	struct iommu_domain_info *info, *curr;
1746 	unsigned long ndomains;
1747 	int num, ret = -ENOSPC;
1748 
1749 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1750 	if (!info)
1751 		return -ENOMEM;
1752 
1753 	spin_lock(&iommu->lock);
1754 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1755 	if (curr) {
1756 		curr->refcnt++;
1757 		spin_unlock(&iommu->lock);
1758 		kfree(info);
1759 		return 0;
1760 	}
1761 
1762 	ndomains = cap_ndoms(iommu->cap);
1763 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1764 	if (num >= ndomains) {
1765 		pr_err("%s: No free domain ids\n", iommu->name);
1766 		goto err_unlock;
1767 	}
1768 
1769 	set_bit(num, iommu->domain_ids);
1770 	info->refcnt	= 1;
1771 	info->did	= num;
1772 	info->iommu	= iommu;
1773 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1774 			  NULL, info, GFP_ATOMIC);
1775 	if (curr) {
1776 		ret = xa_err(curr) ? : -EBUSY;
1777 		goto err_clear;
1778 	}
1779 	domain_update_iommu_cap(domain);
1780 
1781 	spin_unlock(&iommu->lock);
1782 	return 0;
1783 
1784 err_clear:
1785 	clear_bit(info->did, iommu->domain_ids);
1786 err_unlock:
1787 	spin_unlock(&iommu->lock);
1788 	kfree(info);
1789 	return ret;
1790 }
1791 
1792 static void domain_detach_iommu(struct dmar_domain *domain,
1793 				struct intel_iommu *iommu)
1794 {
1795 	struct iommu_domain_info *info;
1796 
1797 	spin_lock(&iommu->lock);
1798 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1799 	if (--info->refcnt == 0) {
1800 		clear_bit(info->did, iommu->domain_ids);
1801 		xa_erase(&domain->iommu_array, iommu->seq_id);
1802 		domain->nid = NUMA_NO_NODE;
1803 		domain_update_iommu_cap(domain);
1804 		kfree(info);
1805 	}
1806 	spin_unlock(&iommu->lock);
1807 }
1808 
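/*
 * Round a guest address width up to the next width the page-table format
 * can express, i.e. 12 + 9 * n bits, capped at 64. For example, a 40-bit
 * gaw is adjusted to 48 bits.
 */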
1809 static inline int guestwidth_to_adjustwidth(int gaw)
1810 {
1811 	int agaw;
1812 	int r = (gaw - 12) % 9;
1813 
1814 	if (r == 0)
1815 		agaw = gaw;
1816 	else
1817 		agaw = gaw + 9 - r;
1818 	if (agaw > 64)
1819 		agaw = 64;
1820 	return agaw;
1821 }
1822 
1823 static void domain_exit(struct dmar_domain *domain)
1824 {
1825 	if (domain->pgd) {
1826 		LIST_HEAD(freelist);
1827 
1828 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1829 		put_pages_list(&freelist);
1830 	}
1831 
1832 	if (WARN_ON(!list_empty(&domain->devices)))
1833 		return;
1834 
1835 	kfree(domain);
1836 }
1837 
1838 /*
1839  * Get the PASID directory size for scalable mode context entry.
1840  * Value of X in the PDTS field of a scalable mode context entry
1841  * indicates PASID directory with 2^(X + 7) entries.
1842  */
1843 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1844 {
1845 	unsigned long pds, max_pde;
1846 
1847 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1848 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1849 	if (pds < 7)
1850 		return 0;
1851 
1852 	return pds - 7;
1853 }
1854 
1855 /*
1856  * Set the RID_PASID field of a scalable mode context entry. The
1857  * IOMMU hardware will use the PASID value set in this field for
1858  * DMA translations of DMA requests without PASID.
1859  */
1860 static inline void
1861 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1862 {
1863 	context->hi |= pasid & ((1 << 20) - 1);
1864 }
1865 
1866 /*
1867  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1868  * entry.
1869  */
1870 static inline void context_set_sm_dte(struct context_entry *context)
1871 {
1872 	context->lo |= (1 << 2);
1873 }
1874 
1875 /*
1876  * Set the PRE(Page Request Enable) field of a scalable mode context
1877  * entry.
1878  */
1879 static inline void context_set_sm_pre(struct context_entry *context)
1880 {
1881 	context->lo |= (1 << 4);
1882 }
1883 
1884 /* Convert value to context PASID directory size field coding. */
1885 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1886 
1887 static int domain_context_mapping_one(struct dmar_domain *domain,
1888 				      struct intel_iommu *iommu,
1889 				      struct pasid_table *table,
1890 				      u8 bus, u8 devfn)
1891 {
1892 	struct device_domain_info *info =
1893 			iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1894 	u16 did = domain_id_iommu(domain, iommu);
1895 	int translation = CONTEXT_TT_MULTI_LEVEL;
1896 	struct context_entry *context;
1897 	int ret;
1898 
1899 	WARN_ON(did == 0);
1900 
1901 	if (hw_pass_through && domain_type_is_si(domain))
1902 		translation = CONTEXT_TT_PASS_THROUGH;
1903 
1904 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1905 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1906 
1907 	BUG_ON(!domain->pgd);
1908 
1909 	spin_lock(&iommu->lock);
1910 	ret = -ENOMEM;
1911 	context = iommu_context_addr(iommu, bus, devfn, 1);
1912 	if (!context)
1913 		goto out_unlock;
1914 
1915 	ret = 0;
1916 	if (context_present(context))
1917 		goto out_unlock;
1918 
1919 	/*
1920 	 * For kdump cases, old valid entries may be cached due to the
1921 	 * in-flight DMA and copied pgtable, but there is no unmapping
1922 	 * behaviour for them; thus we need an explicit cache flush for
1923 	 * the newly-mapped device. For kdump, at this point the device
1924 	 * is supposed to have finished its reset at driver probe time, so
1925 	 * no in-flight DMA will exist and we don't need to worry about it
1926 	 * hereafter.
1927 	 */
1928 	if (context_copied(context)) {
1929 		u16 did_old = context_domain_id(context);
1930 
1931 		if (did_old < cap_ndoms(iommu->cap)) {
1932 			iommu->flush.flush_context(iommu, did_old,
1933 						   (((u16)bus) << 8) | devfn,
1934 						   DMA_CCMD_MASK_NOBIT,
1935 						   DMA_CCMD_DEVICE_INVL);
1936 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1937 						 DMA_TLB_DSI_FLUSH);
1938 		}
1939 	}
1940 
1941 	context_clear_entry(context);
1942 
1943 	if (sm_supported(iommu)) {
1944 		unsigned long pds;
1945 
1946 		WARN_ON(!table);
1947 
1948 		/* Setup the PASID DIR pointer: */
1949 		pds = context_get_sm_pds(table);
1950 		context->lo = (u64)virt_to_phys(table->table) |
1951 				context_pdts(pds);
1952 
1953 		/* Setup the RID_PASID field: */
1954 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1955 
1956 		/*
1957 		 * Setup the Device-TLB enable bit and Page request
1958 		 * Enable bit:
1959 		 */
1960 		if (info && info->ats_supported)
1961 			context_set_sm_dte(context);
1962 		if (info && info->pri_supported)
1963 			context_set_sm_pre(context);
1964 	} else {
1965 		struct dma_pte *pgd = domain->pgd;
1966 		int agaw;
1967 
1968 		context_set_domain_id(context, did);
1969 
1970 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1971 			/*
1972 			 * Skip top levels of the page tables for an IOMMU whose
1973 			 * agaw is smaller than the domain's. Unnecessary for PT mode.
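			 * For example, a 48-bit (4-level) domain attached to an
			 * IOMMU that only supports 39-bit (3-level) tables walks
			 * down one level and programs the 3-level table as the
			 * context address root.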
1974 			 */
1975 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1976 				ret = -ENOMEM;
1977 				pgd = phys_to_virt(dma_pte_addr(pgd));
1978 				if (!dma_pte_present(pgd))
1979 					goto out_unlock;
1980 			}
1981 
1982 			if (info && info->ats_supported)
1983 				translation = CONTEXT_TT_DEV_IOTLB;
1984 			else
1985 				translation = CONTEXT_TT_MULTI_LEVEL;
1986 
1987 			context_set_address_root(context, virt_to_phys(pgd));
1988 			context_set_address_width(context, agaw);
1989 		} else {
1990 			/*
1991 			 * In pass through mode, AW must be programmed to
1992 			 * indicate the largest AGAW value supported by
1993 			 * hardware. And ASR is ignored by hardware.
1994 			 */
1995 			context_set_address_width(context, iommu->msagaw);
1996 		}
1997 
1998 		context_set_translation_type(context, translation);
1999 	}
2000 
2001 	context_set_fault_enable(context);
2002 	context_set_present(context);
2003 	if (!ecap_coherent(iommu->ecap))
2004 		clflush_cache_range(context, sizeof(*context));
2005 
2006 	/*
2007 	 * It's a non-present to present mapping. If hardware doesn't cache
2008 	 * non-present entries we only need to flush the write-buffer. If it
2009 	 * _does_ cache non-present entries, then it does so in the special
2010 	 * domain #0, which we have to flush:
2011 	 */
2012 	if (cap_caching_mode(iommu->cap)) {
2013 		iommu->flush.flush_context(iommu, 0,
2014 					   (((u16)bus) << 8) | devfn,
2015 					   DMA_CCMD_MASK_NOBIT,
2016 					   DMA_CCMD_DEVICE_INVL);
2017 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2018 	} else {
2019 		iommu_flush_write_buffer(iommu);
2020 	}
2021 	iommu_enable_dev_iotlb(info);
2022 
2023 	ret = 0;
2024 
2025 out_unlock:
2026 	spin_unlock(&iommu->lock);
2027 
2028 	return ret;
2029 }
2030 
2031 struct domain_context_mapping_data {
2032 	struct dmar_domain *domain;
2033 	struct intel_iommu *iommu;
2034 	struct pasid_table *table;
2035 };
2036 
2037 static int domain_context_mapping_cb(struct pci_dev *pdev,
2038 				     u16 alias, void *opaque)
2039 {
2040 	struct domain_context_mapping_data *data = opaque;
2041 
2042 	return domain_context_mapping_one(data->domain, data->iommu,
2043 					  data->table, PCI_BUS_NUM(alias),
2044 					  alias & 0xff);
2045 }
2046 
2047 static int
2048 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2049 {
2050 	struct domain_context_mapping_data data;
2051 	struct pasid_table *table;
2052 	struct intel_iommu *iommu;
2053 	u8 bus, devfn;
2054 
2055 	iommu = device_to_iommu(dev, &bus, &devfn);
2056 	if (!iommu)
2057 		return -ENODEV;
2058 
2059 	table = intel_pasid_get_table(dev);
2060 
2061 	if (!dev_is_pci(dev))
2062 		return domain_context_mapping_one(domain, iommu, table,
2063 						  bus, devfn);
2064 
2065 	data.domain = domain;
2066 	data.iommu = iommu;
2067 	data.table = table;
2068 
2069 	return pci_for_each_dma_alias(to_pci_dev(dev),
2070 				      &domain_context_mapping_cb, &data);
2071 }
2072 
2073 static int domain_context_mapped_cb(struct pci_dev *pdev,
2074 				    u16 alias, void *opaque)
2075 {
2076 	struct intel_iommu *iommu = opaque;
2077 
2078 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2079 }
2080 
2081 static int domain_context_mapped(struct device *dev)
2082 {
2083 	struct intel_iommu *iommu;
2084 	u8 bus, devfn;
2085 
2086 	iommu = device_to_iommu(dev, &bus, &devfn);
2087 	if (!iommu)
2088 		return -ENODEV;
2089 
2090 	if (!dev_is_pci(dev))
2091 		return device_context_mapped(iommu, bus, devfn);
2092 
2093 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2094 				       domain_context_mapped_cb, iommu);
2095 }
2096 
2097 /* Returns the number of VT-d pages, rounded up to the MM page size */
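/*
 * For example, with 4KiB MM and VT-d pages, a 100-byte buffer that starts
 * 4000 bytes into a page spans two pages, so this returns 2.
 */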
2098 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2099 					    size_t size)
2100 {
2101 	host_addr &= ~PAGE_MASK;
2102 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2103 }
2104 
2105 /* Return largest possible superpage level for a given mapping */
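/*
 * Level 1 covers 4KiB pages, level 2 covers 2MiB and level 3 covers 1GiB.
 * For example, an IOVA/phys pair that is 2MiB aligned and spans at least
 * 512 VT-d pages yields level 2 when the domain supports one superpage level.
 */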
2106 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2107 					  unsigned long iov_pfn,
2108 					  unsigned long phy_pfn,
2109 					  unsigned long pages)
2110 {
2111 	int support, level = 1;
2112 	unsigned long pfnmerge;
2113 
2114 	support = domain->iommu_superpage;
2115 
2116 	/* To use a large page, the virtual *and* physical addresses
2117 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2118 	   of them will mean we have to use smaller pages. So just
2119 	   merge them and check both at once. */
2120 	pfnmerge = iov_pfn | phy_pfn;
2121 
2122 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2123 		pages >>= VTD_STRIDE_SHIFT;
2124 		if (!pages)
2125 			break;
2126 		pfnmerge >>= VTD_STRIDE_SHIFT;
2127 		level++;
2128 		support--;
2129 	}
2130 	return level;
2131 }
2132 
2133 /*
2134  * Ensure that old small page tables are removed to make room for superpage(s).
2135  * We're going to add new large pages, so make sure we don't remove their parent
2136  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2137  */
2138 static void switch_to_super_page(struct dmar_domain *domain,
2139 				 unsigned long start_pfn,
2140 				 unsigned long end_pfn, int level)
2141 {
2142 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2143 	struct iommu_domain_info *info;
2144 	struct dma_pte *pte = NULL;
2145 	unsigned long i;
2146 
2147 	while (start_pfn <= end_pfn) {
2148 		if (!pte)
2149 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2150 
2151 		if (dma_pte_present(pte)) {
2152 			dma_pte_free_pagetable(domain, start_pfn,
2153 					       start_pfn + lvl_pages - 1,
2154 					       level + 1);
2155 
2156 			xa_for_each(&domain->iommu_array, i, info)
2157 				iommu_flush_iotlb_psi(info->iommu, domain,
2158 						      start_pfn, lvl_pages,
2159 						      0, 0);
2160 		}
2161 
2162 		pte++;
2163 		start_pfn += lvl_pages;
2164 		if (first_pte_in_page(pte))
2165 			pte = NULL;
2166 	}
2167 }
2168 
2169 static int
2170 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2171 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2172 {
2173 	struct dma_pte *first_pte = NULL, *pte = NULL;
2174 	unsigned int largepage_lvl = 0;
2175 	unsigned long lvl_pages = 0;
2176 	phys_addr_t pteval;
2177 	u64 attr;
2178 
2179 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2180 
2181 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2182 		return -EINVAL;
2183 
2184 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2185 	attr |= DMA_FL_PTE_PRESENT;
2186 	if (domain_use_first_level(domain)) {
2187 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2188 		if (prot & DMA_PTE_WRITE)
2189 			attr |= DMA_FL_PTE_DIRTY;
2190 	}
2191 
2192 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2193 
2194 	while (nr_pages > 0) {
2195 		uint64_t tmp;
2196 
2197 		if (!pte) {
2198 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2199 					phys_pfn, nr_pages);
2200 
2201 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2202 			if (!pte)
2203 				return -ENOMEM;
2204 			first_pte = pte;
2205 
2206 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2207 
2208 			/* It is a large page */
2209 			if (largepage_lvl > 1) {
2210 				unsigned long end_pfn;
2211 				unsigned long pages_to_remove;
2212 
2213 				pteval |= DMA_PTE_LARGE_PAGE;
2214 				pages_to_remove = min_t(unsigned long, nr_pages,
2215 							nr_pte_to_next_page(pte) * lvl_pages);
2216 				end_pfn = iov_pfn + pages_to_remove - 1;
2217 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2218 			} else {
2219 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2220 			}
2221 
2222 		}
2223 		/* We don't need a lock here; nobody else
2224 		 * touches this iova range
2225 		 */
2226 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2227 		if (tmp) {
2228 			static int dumps = 5;
2229 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2230 				iov_pfn, tmp, (unsigned long long)pteval);
2231 			if (dumps) {
2232 				dumps--;
2233 				debug_dma_dump_mappings(NULL);
2234 			}
2235 			WARN_ON(1);
2236 		}
2237 
2238 		nr_pages -= lvl_pages;
2239 		iov_pfn += lvl_pages;
2240 		phys_pfn += lvl_pages;
2241 		pteval += lvl_pages * VTD_PAGE_SIZE;
2242 
2243 		/* If the next PTE would be the first in a new page, then we
2244 		 * need to flush the cache on the entries we've just written.
2245 		 * And then we'll need to recalculate 'pte', so clear it and
2246 		 * let it get set again in the if (!pte) block above.
2247 		 *
2248 		 * If we're done (!nr_pages) we need to flush the cache too.
2249 		 *
2250 		 * Also if we've been setting superpages, we may need to
2251 		 * recalculate 'pte' and switch back to smaller pages for the
2252 		 * end of the mapping, if the trailing size is not enough to
2253 		 * use another superpage (i.e. nr_pages < lvl_pages).
2254 		 */
2255 		pte++;
2256 		if (!nr_pages || first_pte_in_page(pte) ||
2257 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2258 			domain_flush_cache(domain, first_pte,
2259 					   (void *)pte - (void *)first_pte);
2260 			pte = NULL;
2261 		}
2262 	}
2263 
2264 	return 0;
2265 }
2266 
2267 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2268 {
2269 	struct intel_iommu *iommu = info->iommu;
2270 	struct context_entry *context;
2271 	u16 did_old;
2272 
2273 	if (!iommu)
2274 		return;
2275 
2276 	spin_lock(&iommu->lock);
2277 	context = iommu_context_addr(iommu, bus, devfn, 0);
2278 	if (!context) {
2279 		spin_unlock(&iommu->lock);
2280 		return;
2281 	}
2282 
2283 	if (sm_supported(iommu)) {
2284 		if (hw_pass_through && domain_type_is_si(info->domain))
2285 			did_old = FLPT_DEFAULT_DID;
2286 		else
2287 			did_old = domain_id_iommu(info->domain, iommu);
2288 	} else {
2289 		did_old = context_domain_id(context);
2290 	}
2291 
2292 	context_clear_entry(context);
2293 	__iommu_flush_cache(iommu, context, sizeof(*context));
2294 	spin_unlock(&iommu->lock);
2295 	iommu->flush.flush_context(iommu,
2296 				   did_old,
2297 				   (((u16)bus) << 8) | devfn,
2298 				   DMA_CCMD_MASK_NOBIT,
2299 				   DMA_CCMD_DEVICE_INVL);
2300 
2301 	if (sm_supported(iommu))
2302 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2303 
2304 	iommu->flush.flush_iotlb(iommu,
2305 				 did_old,
2306 				 0,
2307 				 0,
2308 				 DMA_TLB_DSI_FLUSH);
2309 
2310 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2311 }
2312 
2313 static int domain_setup_first_level(struct intel_iommu *iommu,
2314 				    struct dmar_domain *domain,
2315 				    struct device *dev,
2316 				    u32 pasid)
2317 {
2318 	struct dma_pte *pgd = domain->pgd;
2319 	int agaw, level;
2320 	int flags = 0;
2321 
2322 	/*
2323 	 * Skip top levels of the page tables for an IOMMU whose
2324 	 * agaw is smaller than the domain's. Unnecessary for PT mode.
2325 	 */
2326 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2327 		pgd = phys_to_virt(dma_pte_addr(pgd));
2328 		if (!dma_pte_present(pgd))
2329 			return -ENOMEM;
2330 	}
2331 
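	/* First-level translation only supports 4-level or 5-level paging */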
2332 	level = agaw_to_level(agaw);
2333 	if (level != 4 && level != 5)
2334 		return -EINVAL;
2335 
2336 	if (pasid != PASID_RID2PASID)
2337 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2338 	if (level == 5)
2339 		flags |= PASID_FLAG_FL5LP;
2340 
2341 	if (domain->force_snooping)
2342 		flags |= PASID_FLAG_PAGE_SNOOP;
2343 
2344 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2345 					     domain_id_iommu(domain, iommu),
2346 					     flags);
2347 }
2348 
2349 static bool dev_is_real_dma_subdevice(struct device *dev)
2350 {
2351 	return dev && dev_is_pci(dev) &&
2352 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2353 }
2354 
2355 static int iommu_domain_identity_map(struct dmar_domain *domain,
2356 				     unsigned long first_vpfn,
2357 				     unsigned long last_vpfn)
2358 {
2359 	/*
2360 	 * The RMRR range might overlap with the physical memory range,
2361 	 * so clear it first.
2362 	 */
2363 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2364 
2365 	return __domain_mapping(domain, first_vpfn,
2366 				first_vpfn, last_vpfn - first_vpfn + 1,
2367 				DMA_PTE_READ|DMA_PTE_WRITE);
2368 }
2369 
2370 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2371 
2372 static int __init si_domain_init(int hw)
2373 {
2374 	struct dmar_rmrr_unit *rmrr;
2375 	struct device *dev;
2376 	int i, nid, ret;
2377 
2378 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2379 	if (!si_domain)
2380 		return -EFAULT;
2381 
2382 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2383 		domain_exit(si_domain);
2384 		return -EFAULT;
2385 	}
2386 
2387 	if (hw)
2388 		return 0;
2389 
2390 	for_each_online_node(nid) {
2391 		unsigned long start_pfn, end_pfn;
2392 		int i;
2393 
2394 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2395 			ret = iommu_domain_identity_map(si_domain,
2396 					mm_to_dma_pfn(start_pfn),
2397 					mm_to_dma_pfn(end_pfn));
2398 			if (ret)
2399 				return ret;
2400 		}
2401 	}
2402 
2403 	/*
2404 	 * Identity map the RMRRs so that devices with RMRRs can also use
2405 	 * the si_domain.
2406 	 */
2407 	for_each_rmrr_units(rmrr) {
2408 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2409 					  i, dev) {
2410 			unsigned long long start = rmrr->base_address;
2411 			unsigned long long end = rmrr->end_address;
2412 
2413 			if (WARN_ON(end < start ||
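			/*
			 * Skip RMRRs that are malformed or lie beyond the
			 * domain's address width.
			 */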
2414 				    end >> agaw_to_width(si_domain->agaw)))
2415 				continue;
2416 
2417 			ret = iommu_domain_identity_map(si_domain,
2418 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2419 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2420 			if (ret)
2421 				return ret;
2422 		}
2423 	}
2424 
2425 	return 0;
2426 }
2427 
2428 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2429 {
2430 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2431 	struct intel_iommu *iommu;
2432 	u8 bus, devfn;
2433 	int ret;
2434 
2435 	iommu = device_to_iommu(dev, &bus, &devfn);
2436 	if (!iommu)
2437 		return -ENODEV;
2438 
2439 	ret = domain_attach_iommu(domain, iommu);
2440 	if (ret)
2441 		return ret;
2442 	info->domain = domain;
2443 	spin_lock(&domain->lock);
2444 	list_add(&info->link, &domain->devices);
2445 	spin_unlock(&domain->lock);
2446 
2447 	/* PASID table is mandatory for a PCI device in scalable mode. */
2448 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2449 		ret = intel_pasid_alloc_table(dev);
2450 		if (ret) {
2451 			dev_err(dev, "PASID table allocation failed\n");
2452 			dmar_remove_one_dev_info(dev);
2453 			return ret;
2454 		}
2455 
2456 		/* Setup the PASID entry for requests without PASID: */
2457 		if (hw_pass_through && domain_type_is_si(domain))
2458 			ret = intel_pasid_setup_pass_through(iommu, domain,
2459 					dev, PASID_RID2PASID);
2460 		else if (domain_use_first_level(domain))
2461 			ret = domain_setup_first_level(iommu, domain, dev,
2462 					PASID_RID2PASID);
2463 		else
2464 			ret = intel_pasid_setup_second_level(iommu, domain,
2465 					dev, PASID_RID2PASID);
2466 		if (ret) {
2467 			dev_err(dev, "Setup RID2PASID failed\n");
2468 			dmar_remove_one_dev_info(dev);
2469 			return ret;
2470 		}
2471 	}
2472 
2473 	ret = domain_context_mapping(domain, dev);
2474 	if (ret) {
2475 		dev_err(dev, "Domain context map failed\n");
2476 		dmar_remove_one_dev_info(dev);
2477 		return ret;
2478 	}
2479 
2480 	return 0;
2481 }
2482 
2483 static bool device_has_rmrr(struct device *dev)
2484 {
2485 	struct dmar_rmrr_unit *rmrr;
2486 	struct device *tmp;
2487 	int i;
2488 
2489 	rcu_read_lock();
2490 	for_each_rmrr_units(rmrr) {
2491 		/*
2492 		 * Return TRUE if this RMRR contains the device that
2493 		 * is passed in.
2494 		 */
2495 		for_each_active_dev_scope(rmrr->devices,
2496 					  rmrr->devices_cnt, i, tmp)
2497 			if (tmp == dev ||
2498 			    is_downstream_to_pci_bridge(dev, tmp)) {
2499 				rcu_read_unlock();
2500 				return true;
2501 			}
2502 	}
2503 	rcu_read_unlock();
2504 	return false;
2505 }
2506 
2507 /**
2508  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2509  * is relaxable (i.e. is allowed not to be enforced under some conditions)
2510  * @dev: device handle
2511  *
2512  * We assume that PCI USB devices with RMRRs have them largely
2513  * for historical reasons and that the RMRR space is not actively used post
2514  * boot.  This exclusion may change if vendors begin to abuse it.
2515  *
2516  * The same exception is made for graphics devices, with the requirement that
2517  * any use of the RMRR regions will be torn down before assigning the device
2518  * to a guest.
2519  *
2520  * Return: true if the RMRR is relaxable, false otherwise
2521  */
2522 static bool device_rmrr_is_relaxable(struct device *dev)
2523 {
2524 	struct pci_dev *pdev;
2525 
2526 	if (!dev_is_pci(dev))
2527 		return false;
2528 
2529 	pdev = to_pci_dev(dev);
2530 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2531 		return true;
2532 	else
2533 		return false;
2534 }
2535 
2536 /*
2537  * There are a couple of cases where we need to restrict the functionality of
2538  * devices associated with RMRRs.  The first is when evaluating a device for
2539  * identity mapping because problems exist when devices are moved in and out
2540  * of domains and their respective RMRR information is lost.  This means that
2541  * a device with associated RMRRs will never be in a "passthrough" domain.
2542  * The second is use of the device through the IOMMU API.  This interface
2543  * expects to have full control of the IOVA space for the device.  We cannot
2544  * satisfy both the requirement that RMRR access is maintained and have an
2545  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2546  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2547  * We therefore prevent devices associated with an RMRR from participating in
2548  * the IOMMU API, which eliminates them from device assignment.
2549  *
2550  * In both cases, devices which have relaxable RMRRs are not concerned by this
2551  * restriction. See device_rmrr_is_relaxable comment.
2552  */
2553 static bool device_is_rmrr_locked(struct device *dev)
2554 {
2555 	if (!device_has_rmrr(dev))
2556 		return false;
2557 
2558 	if (device_rmrr_is_relaxable(dev))
2559 		return false;
2560 
2561 	return true;
2562 }
2563 
2564 /*
2565  * Return the required default domain type for a specific device.
2566  *
2567  * @dev: the device in question
2569  *
2570  * Returns:
2571  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2572  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2573  *  - 0: both identity and dynamic domains work for this device
2574  */
2575 static int device_def_domain_type(struct device *dev)
2576 {
2577 	if (dev_is_pci(dev)) {
2578 		struct pci_dev *pdev = to_pci_dev(dev);
2579 
2580 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2581 			return IOMMU_DOMAIN_IDENTITY;
2582 
2583 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2584 			return IOMMU_DOMAIN_IDENTITY;
2585 	}
2586 
2587 	return 0;
2588 }
2589 
2590 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2591 {
2592 	/*
2593 	 * Start from a sane iommu hardware state.
2594 	 * If queued invalidation was already initialized by us
2595 	 * (for example, while enabling interrupt remapping) then
2596 	 * things are already rolling from a sane state.
2597 	 */
2598 	if (!iommu->qi) {
2599 		/*
2600 		 * Clear any previous faults.
2601 		 */
2602 		dmar_fault(-1, iommu);
2603 		/*
2604 		 * Disable queued invalidation if supported and already enabled
2605 		 * before OS handover.
2606 		 */
2607 		dmar_disable_qi(iommu);
2608 	}
2609 
2610 	if (dmar_enable_qi(iommu)) {
2611 		/*
2612 		 * Queued invalidation is not enabled; use register-based invalidation.
2613 		 */
2614 		iommu->flush.flush_context = __iommu_flush_context;
2615 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2616 		pr_info("%s: Using Register based invalidation\n",
2617 			iommu->name);
2618 	} else {
2619 		iommu->flush.flush_context = qi_flush_context;
2620 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2621 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2622 	}
2623 }
2624 
2625 static int copy_context_table(struct intel_iommu *iommu,
2626 			      struct root_entry *old_re,
2627 			      struct context_entry **tbl,
2628 			      int bus, bool ext)
2629 {
2630 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2631 	struct context_entry *new_ce = NULL, ce;
2632 	struct context_entry *old_ce = NULL;
2633 	struct root_entry re;
2634 	phys_addr_t old_ce_phys;
2635 
2636 	tbl_idx = ext ? bus * 2 : bus;
2637 	memcpy(&re, old_re, sizeof(re));
2638 
2639 	for (devfn = 0; devfn < 256; devfn++) {
2640 		/* First calculate the correct index */
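		/*
		 * Extended context entries are twice the size of legacy ones,
		 * so each devfn occupies two legacy-sized slots and each bus
		 * needs two context tables (hence tbl_idx = bus * 2 above).
		 */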
2641 		idx = (ext ? devfn * 2 : devfn) % 256;
2642 
2643 		if (idx == 0) {
2644 			/* First save what we may have and clean up */
2645 			if (new_ce) {
2646 				tbl[tbl_idx] = new_ce;
2647 				__iommu_flush_cache(iommu, new_ce,
2648 						    VTD_PAGE_SIZE);
2649 				pos = 1;
2650 			}
2651 
2652 			if (old_ce)
2653 				memunmap(old_ce);
2654 
2655 			ret = 0;
2656 			if (devfn < 0x80)
2657 				old_ce_phys = root_entry_lctp(&re);
2658 			else
2659 				old_ce_phys = root_entry_uctp(&re);
2660 
2661 			if (!old_ce_phys) {
2662 				if (ext && devfn == 0) {
2663 					/* No LCTP, try UCTP */
2664 					devfn = 0x7f;
2665 					continue;
2666 				} else {
2667 					goto out;
2668 				}
2669 			}
2670 
2671 			ret = -ENOMEM;
2672 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2673 					MEMREMAP_WB);
2674 			if (!old_ce)
2675 				goto out;
2676 
2677 			new_ce = alloc_pgtable_page(iommu->node);
2678 			if (!new_ce)
2679 				goto out_unmap;
2680 
2681 			ret = 0;
2682 		}
2683 
2684 		/* Now copy the context entry */
2685 		memcpy(&ce, old_ce + idx, sizeof(ce));
2686 
2687 		if (!__context_present(&ce))
2688 			continue;
2689 
2690 		did = context_domain_id(&ce);
2691 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2692 			set_bit(did, iommu->domain_ids);
2693 
2694 		/*
2695 		 * We need a marker for copied context entries. This
2696 		 * marker needs to work for the old format as well as
2697 		 * for extended context entries.
2698 		 *
2699 		 * Bit 67 of the context entry is used. In the old
2700 		 * format this bit is available to software, in the
2701 		 * extended format it is the PGE bit, but PGE is ignored
2702 		 * by HW if PASIDs are disabled (and thus still
2703 		 * available).
2704 		 *
2705 		 * So disable PASIDs first and then mark the entry
2706 		 * copied. This means that we don't copy PASID
2707 		 * translations from the old kernel, but this is fine as
2708 		 * faults there are not fatal.
2709 		 */
2710 		context_clear_pasid_enable(&ce);
2711 		context_set_copied(&ce);
2712 
2713 		new_ce[idx] = ce;
2714 	}
2715 
2716 	tbl[tbl_idx + pos] = new_ce;
2717 
2718 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2719 
2720 out_unmap:
2721 	memunmap(old_ce);
2722 
2723 out:
2724 	return ret;
2725 }
2726 
2727 static int copy_translation_tables(struct intel_iommu *iommu)
2728 {
2729 	struct context_entry **ctxt_tbls;
2730 	struct root_entry *old_rt;
2731 	phys_addr_t old_rt_phys;
2732 	int ctxt_table_entries;
2733 	u64 rtaddr_reg;
2734 	int bus, ret;
2735 	bool new_ext, ext;
2736 
2737 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2738 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2739 	new_ext    = !!ecap_ecs(iommu->ecap);
2740 
2741 	/*
2742 	 * The RTT bit can only be changed when translation is disabled,
2743 	 * but disabling translation means opening a window for data
2744 	 * corruption. So bail out and don't copy anything if we would
2745 	 * have to change the bit.
2746 	 */
2747 	if (new_ext != ext)
2748 		return -EINVAL;
2749 
2750 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2751 	if (!old_rt_phys)
2752 		return -EINVAL;
2753 
2754 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2755 	if (!old_rt)
2756 		return -ENOMEM;
2757 
2758 	/* This is too big for the stack - allocate it from slab */
2759 	ctxt_table_entries = ext ? 512 : 256;
2760 	ret = -ENOMEM;
2761 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2762 	if (!ctxt_tbls)
2763 		goto out_unmap;
2764 
2765 	for (bus = 0; bus < 256; bus++) {
2766 		ret = copy_context_table(iommu, &old_rt[bus],
2767 					 ctxt_tbls, bus, ext);
2768 		if (ret) {
2769 			pr_err("%s: Failed to copy context table for bus %d\n",
2770 				iommu->name, bus);
2771 			continue;
2772 		}
2773 	}
2774 
2775 	spin_lock(&iommu->lock);
2776 
2777 	/* Context tables are copied, now write them to the root_entry table */
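	/*
	 * In extended mode the low half of a root entry points to the context
	 * table for devfn 0x00-0x7f and the high half to the one for
	 * devfn 0x80-0xff, matching the lctp/uctp split used above.
	 */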
2778 	for (bus = 0; bus < 256; bus++) {
2779 		int idx = ext ? bus * 2 : bus;
2780 		u64 val;
2781 
2782 		if (ctxt_tbls[idx]) {
2783 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2784 			iommu->root_entry[bus].lo = val;
2785 		}
2786 
2787 		if (!ext || !ctxt_tbls[idx + 1])
2788 			continue;
2789 
2790 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2791 		iommu->root_entry[bus].hi = val;
2792 	}
2793 
2794 	spin_unlock(&iommu->lock);
2795 
2796 	kfree(ctxt_tbls);
2797 
2798 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2799 
2800 	ret = 0;
2801 
2802 out_unmap:
2803 	memunmap(old_rt);
2804 
2805 	return ret;
2806 }
2807 
2808 #ifdef CONFIG_INTEL_IOMMU_SVM
2809 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2810 {
2811 	struct intel_iommu *iommu = data;
2812 	ioasid_t ioasid;
2813 
2814 	if (!iommu)
2815 		return INVALID_IOASID;
2816 	/*
2817 	 * The VT-d virtual command interface always uses the full 20-bit
2818 	 * PASID range. The host can partition the guest PASID range based
2819 	 * on policies, but this is out of the guest's control.
2820 	 */
2821 	if (min < PASID_MIN || max > intel_pasid_max_id)
2822 		return INVALID_IOASID;
2823 
2824 	if (vcmd_alloc_pasid(iommu, &ioasid))
2825 		return INVALID_IOASID;
2826 
2827 	return ioasid;
2828 }
2829 
2830 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2831 {
2832 	struct intel_iommu *iommu = data;
2833 
2834 	if (!iommu)
2835 		return;
2836 	/*
2837 	 * Sanity checking of the ioasid owner is done at the upper layer,
2838 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2839 	 */
2840 	if (ioasid_find(NULL, ioasid, NULL)) {
2841 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2842 		return;
2843 	}
2844 	vcmd_free_pasid(iommu, ioasid);
2845 }
2846 
2847 static void register_pasid_allocator(struct intel_iommu *iommu)
2848 {
2849 	/*
2850 	 * If we are running in the host, there is no need for a custom
2851 	 * allocator since PASIDs are allocated from the host system-wide.
2852 	 */
2853 	if (!cap_caching_mode(iommu->cap))
2854 		return;
2855 
2856 	if (!sm_supported(iommu)) {
2857 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2858 		return;
2859 	}
2860 
2861 	/*
2862 	 * Register a custom PASID allocator if we are running in a guest;
2863 	 * guest PASIDs must be obtained via the virtual command interface.
2864 	 * There can be multiple vIOMMUs in each guest but only one allocator
2865 	 * is active. All vIOMMU allocators will eventually call the same
2866 	 * host allocator.
2867 	 */
2868 	if (!vccap_pasid(iommu->vccap))
2869 		return;
2870 
2871 	pr_info("Register custom PASID allocator\n");
2872 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2873 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2874 	iommu->pasid_allocator.pdata = (void *)iommu;
2875 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2876 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2877 		/*
2878 		 * Disable scalable mode on this IOMMU if there
2879 		 * is no custom allocator. Mixing SM-capable vIOMMUs
2880 		 * and non-SM vIOMMUs is not supported.
2881 		 */
2882 		intel_iommu_sm = 0;
2883 	}
2884 }
2885 #endif
2886 
2887 static int __init init_dmars(void)
2888 {
2889 	struct dmar_drhd_unit *drhd;
2890 	struct intel_iommu *iommu;
2891 	int ret;
2892 
2893 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2894 	if (ret)
2895 		goto free_iommu;
2896 
2897 	for_each_iommu(iommu, drhd) {
2898 		if (drhd->ignored) {
2899 			iommu_disable_translation(iommu);
2900 			continue;
2901 		}
2902 
2903 		/*
2904 		 * Find the max PASID size across all IOMMUs in the system.
2905 		 * We need to ensure the system PASID table is no bigger
2906 		 * than the smallest supported size.
2907 		 */
2908 		if (pasid_supported(iommu)) {
2909 			u32 temp = 2 << ecap_pss(iommu->ecap);
2910 
2911 			intel_pasid_max_id = min_t(u32, temp,
2912 						   intel_pasid_max_id);
2913 		}
2914 
2915 		intel_iommu_init_qi(iommu);
2916 
2917 		ret = iommu_init_domains(iommu);
2918 		if (ret)
2919 			goto free_iommu;
2920 
2921 		init_translation_status(iommu);
2922 
2923 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2924 			iommu_disable_translation(iommu);
2925 			clear_translation_pre_enabled(iommu);
2926 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2927 				iommu->name);
2928 		}
2929 
2930 		/*
2931 		 * TBD:
2932 		 * we could share the same root & context tables
2933 		 * among all IOMMUs. Need to split it later.
2934 		 */
2935 		ret = iommu_alloc_root_entry(iommu);
2936 		if (ret)
2937 			goto free_iommu;
2938 
2939 		if (translation_pre_enabled(iommu)) {
2940 			pr_info("Translation already enabled - trying to copy translation structures\n");
2941 
2942 			ret = copy_translation_tables(iommu);
2943 			if (ret) {
2944 				/*
2945 				 * We found the IOMMU with translation
2946 				 * enabled - but failed to copy over the
2947 				 * old root-entry table. Try to proceed
2948 				 * by disabling translation now and
2949 				 * allocating a clean root-entry table.
2950 				 * This might cause DMAR faults, but
2951 				 * probably the dump will still succeed.
2952 				 */
2953 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2954 				       iommu->name);
2955 				iommu_disable_translation(iommu);
2956 				clear_translation_pre_enabled(iommu);
2957 			} else {
2958 				pr_info("Copied translation tables from previous kernel for %s\n",
2959 					iommu->name);
2960 			}
2961 		}
2962 
2963 		if (!ecap_pass_through(iommu->ecap))
2964 			hw_pass_through = 0;
2965 		intel_svm_check(iommu);
2966 	}
2967 
2968 	/*
2969 	 * Now that qi is enabled on all iommus, set the root entry and flush
2970 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2971 	 * flush_context function will loop forever and the boot hangs.
2972 	 */
2973 	for_each_active_iommu(iommu, drhd) {
2974 		iommu_flush_write_buffer(iommu);
2975 #ifdef CONFIG_INTEL_IOMMU_SVM
2976 		register_pasid_allocator(iommu);
2977 #endif
2978 		iommu_set_root_entry(iommu);
2979 	}
2980 
2981 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2982 	dmar_map_gfx = 0;
2983 #endif
2984 
2985 	if (!dmar_map_gfx)
2986 		iommu_identity_mapping |= IDENTMAP_GFX;
2987 
2988 	check_tylersburg_isoch();
2989 
2990 	ret = si_domain_init(hw_pass_through);
2991 	if (ret)
2992 		goto free_iommu;
2993 
2994 	/*
2995 	 * for each drhd
2996 	 *   enable fault log
2997 	 *   global invalidate context cache
2998 	 *   global invalidate iotlb
2999 	 *   enable translation
3000 	 */
3001 	for_each_iommu(iommu, drhd) {
3002 		if (drhd->ignored) {
3003 			/*
3004 			 * we always have to disable PMRs or DMA may fail on
3005 			 * this device
3006 			 */
3007 			if (force_on)
3008 				iommu_disable_protect_mem_regions(iommu);
3009 			continue;
3010 		}
3011 
3012 		iommu_flush_write_buffer(iommu);
3013 
3014 #ifdef CONFIG_INTEL_IOMMU_SVM
3015 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3016 			/*
3017 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3018 			 * could cause a lock race condition.
3019 			 */
3020 			up_write(&dmar_global_lock);
3021 			ret = intel_svm_enable_prq(iommu);
3022 			down_write(&dmar_global_lock);
3023 			if (ret)
3024 				goto free_iommu;
3025 		}
3026 #endif
3027 		ret = dmar_set_interrupt(iommu);
3028 		if (ret)
3029 			goto free_iommu;
3030 	}
3031 
3032 	return 0;
3033 
3034 free_iommu:
3035 	for_each_active_iommu(iommu, drhd) {
3036 		disable_dmar_iommu(iommu);
3037 		free_dmar_iommu(iommu);
3038 	}
3039 
3040 	return ret;
3041 }
3042 
3043 static void __init init_no_remapping_devices(void)
3044 {
3045 	struct dmar_drhd_unit *drhd;
3046 	struct device *dev;
3047 	int i;
3048 
3049 	for_each_drhd_unit(drhd) {
3050 		if (!drhd->include_all) {
3051 			for_each_active_dev_scope(drhd->devices,
3052 						  drhd->devices_cnt, i, dev)
3053 				break;
3054 			/* ignore DMAR unit if no devices exist */
3055 			if (i == drhd->devices_cnt)
3056 				drhd->ignored = 1;
3057 		}
3058 	}
3059 
3060 	for_each_active_drhd_unit(drhd) {
3061 		if (drhd->include_all)
3062 			continue;
3063 
3064 		for_each_active_dev_scope(drhd->devices,
3065 					  drhd->devices_cnt, i, dev)
3066 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3067 				break;
3068 		if (i < drhd->devices_cnt)
3069 			continue;
3070 
3071 		/* This IOMMU has *only* gfx devices. Either bypass it or
3072 		   set the gfx_dedicated flag, as appropriate */
3073 		drhd->gfx_dedicated = 1;
3074 		if (!dmar_map_gfx)
3075 			drhd->ignored = 1;
3076 	}
3077 }
3078 
3079 #ifdef CONFIG_SUSPEND
3080 static int init_iommu_hw(void)
3081 {
3082 	struct dmar_drhd_unit *drhd;
3083 	struct intel_iommu *iommu = NULL;
3084 
3085 	for_each_active_iommu(iommu, drhd)
3086 		if (iommu->qi)
3087 			dmar_reenable_qi(iommu);
3088 
3089 	for_each_iommu(iommu, drhd) {
3090 		if (drhd->ignored) {
3091 			/*
3092 			 * we always have to disable PMRs or DMA may fail on
3093 			 * this device
3094 			 */
3095 			if (force_on)
3096 				iommu_disable_protect_mem_regions(iommu);
3097 			continue;
3098 		}
3099 
3100 		iommu_flush_write_buffer(iommu);
3101 		iommu_set_root_entry(iommu);
3102 		iommu_enable_translation(iommu);
3103 		iommu_disable_protect_mem_regions(iommu);
3104 	}
3105 
3106 	return 0;
3107 }
3108 
3109 static void iommu_flush_all(void)
3110 {
3111 	struct dmar_drhd_unit *drhd;
3112 	struct intel_iommu *iommu;
3113 
3114 	for_each_active_iommu(iommu, drhd) {
3115 		iommu->flush.flush_context(iommu, 0, 0, 0,
3116 					   DMA_CCMD_GLOBAL_INVL);
3117 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3118 					 DMA_TLB_GLOBAL_FLUSH);
3119 	}
3120 }
3121 
3122 static int iommu_suspend(void)
3123 {
3124 	struct dmar_drhd_unit *drhd;
3125 	struct intel_iommu *iommu = NULL;
3126 	unsigned long flag;
3127 
3128 	for_each_active_iommu(iommu, drhd) {
3129 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3130 					     GFP_KERNEL);
3131 		if (!iommu->iommu_state)
3132 			goto nomem;
3133 	}
3134 
3135 	iommu_flush_all();
3136 
3137 	for_each_active_iommu(iommu, drhd) {
3138 		iommu_disable_translation(iommu);
3139 
3140 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3141 
3142 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3143 			readl(iommu->reg + DMAR_FECTL_REG);
3144 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3145 			readl(iommu->reg + DMAR_FEDATA_REG);
3146 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3147 			readl(iommu->reg + DMAR_FEADDR_REG);
3148 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3149 			readl(iommu->reg + DMAR_FEUADDR_REG);
3150 
3151 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3152 	}
3153 	return 0;
3154 
3155 nomem:
3156 	for_each_active_iommu(iommu, drhd)
3157 		kfree(iommu->iommu_state);
3158 
3159 	return -ENOMEM;
3160 }
3161 
3162 static void iommu_resume(void)
3163 {
3164 	struct dmar_drhd_unit *drhd;
3165 	struct intel_iommu *iommu = NULL;
3166 	unsigned long flag;
3167 
3168 	if (init_iommu_hw()) {
3169 		if (force_on)
3170 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3171 		else
3172 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3173 		return;
3174 	}
3175 
3176 	for_each_active_iommu(iommu, drhd) {
3177 
3178 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3179 
3180 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3181 			iommu->reg + DMAR_FECTL_REG);
3182 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3183 			iommu->reg + DMAR_FEDATA_REG);
3184 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3185 			iommu->reg + DMAR_FEADDR_REG);
3186 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3187 			iommu->reg + DMAR_FEUADDR_REG);
3188 
3189 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3190 	}
3191 
3192 	for_each_active_iommu(iommu, drhd)
3193 		kfree(iommu->iommu_state);
3194 }
3195 
3196 static struct syscore_ops iommu_syscore_ops = {
3197 	.resume		= iommu_resume,
3198 	.suspend	= iommu_suspend,
3199 };
3200 
3201 static void __init init_iommu_pm_ops(void)
3202 {
3203 	register_syscore_ops(&iommu_syscore_ops);
3204 }
3205 
3206 #else
3207 static inline void init_iommu_pm_ops(void) {}
3208 #endif	/* CONFIG_SUSPEND */
3209 
3210 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3211 {
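	/* end_address is inclusive, hence the + 1 in the alignment check below */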
3212 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3213 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3214 	    rmrr->end_address <= rmrr->base_address ||
3215 	    arch_rmrr_sanity_check(rmrr))
3216 		return -EINVAL;
3217 
3218 	return 0;
3219 }
3220 
3221 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3222 {
3223 	struct acpi_dmar_reserved_memory *rmrr;
3224 	struct dmar_rmrr_unit *rmrru;
3225 
3226 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3227 	if (rmrr_sanity_check(rmrr)) {
3228 		pr_warn(FW_BUG
3229 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3230 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3231 			   rmrr->base_address, rmrr->end_address,
3232 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3233 			   dmi_get_system_info(DMI_BIOS_VERSION),
3234 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3235 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3236 	}
3237 
3238 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3239 	if (!rmrru)
3240 		goto out;
3241 
3242 	rmrru->hdr = header;
3243 
3244 	rmrru->base_address = rmrr->base_address;
3245 	rmrru->end_address = rmrr->end_address;
3246 
3247 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3248 				((void *)rmrr) + rmrr->header.length,
3249 				&rmrru->devices_cnt);
3250 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3251 		goto free_rmrru;
3252 
3253 	list_add(&rmrru->list, &dmar_rmrr_units);
3254 
3255 	return 0;
3256 free_rmrru:
3257 	kfree(rmrru);
3258 out:
3259 	return -ENOMEM;
3260 }
3261 
3262 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3263 {
3264 	struct dmar_atsr_unit *atsru;
3265 	struct acpi_dmar_atsr *tmp;
3266 
3267 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3268 				dmar_rcu_check()) {
3269 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3270 		if (atsr->segment != tmp->segment)
3271 			continue;
3272 		if (atsr->header.length != tmp->header.length)
3273 			continue;
3274 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3275 			return atsru;
3276 	}
3277 
3278 	return NULL;
3279 }
3280 
3281 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3282 {
3283 	struct acpi_dmar_atsr *atsr;
3284 	struct dmar_atsr_unit *atsru;
3285 
3286 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3287 		return 0;
3288 
3289 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3290 	atsru = dmar_find_atsr(atsr);
3291 	if (atsru)
3292 		return 0;
3293 
3294 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3295 	if (!atsru)
3296 		return -ENOMEM;
3297 
3298 	/*
3299 	 * If the memory was allocated from the slab by an ACPI _DSM method,
3300 	 * we need to copy the memory contents because the buffer will be
3301 	 * freed on return.
3302 	 */
3303 	atsru->hdr = (void *)(atsru + 1);
3304 	memcpy(atsru->hdr, hdr, hdr->length);
3305 	atsru->include_all = atsr->flags & 0x1;
3306 	if (!atsru->include_all) {
3307 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3308 				(void *)atsr + atsr->header.length,
3309 				&atsru->devices_cnt);
3310 		if (atsru->devices_cnt && atsru->devices == NULL) {
3311 			kfree(atsru);
3312 			return -ENOMEM;
3313 		}
3314 	}
3315 
3316 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3317 
3318 	return 0;
3319 }
3320 
3321 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3322 {
3323 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3324 	kfree(atsru);
3325 }
3326 
3327 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3328 {
3329 	struct acpi_dmar_atsr *atsr;
3330 	struct dmar_atsr_unit *atsru;
3331 
3332 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3333 	atsru = dmar_find_atsr(atsr);
3334 	if (atsru) {
3335 		list_del_rcu(&atsru->list);
3336 		synchronize_rcu();
3337 		intel_iommu_free_atsr(atsru);
3338 	}
3339 
3340 	return 0;
3341 }
3342 
3343 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3344 {
3345 	int i;
3346 	struct device *dev;
3347 	struct acpi_dmar_atsr *atsr;
3348 	struct dmar_atsr_unit *atsru;
3349 
3350 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3351 	atsru = dmar_find_atsr(atsr);
3352 	if (!atsru)
3353 		return 0;
3354 
3355 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3356 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3357 					  i, dev)
3358 			return -EBUSY;
3359 	}
3360 
3361 	return 0;
3362 }
3363 
3364 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3365 {
3366 	struct dmar_satc_unit *satcu;
3367 	struct acpi_dmar_satc *tmp;
3368 
3369 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3370 				dmar_rcu_check()) {
3371 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3372 		if (satc->segment != tmp->segment)
3373 			continue;
3374 		if (satc->header.length != tmp->header.length)
3375 			continue;
3376 		if (memcmp(satc, tmp, satc->header.length) == 0)
3377 			return satcu;
3378 	}
3379 
3380 	return NULL;
3381 }
3382 
3383 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3384 {
3385 	struct acpi_dmar_satc *satc;
3386 	struct dmar_satc_unit *satcu;
3387 
3388 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3389 		return 0;
3390 
3391 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3392 	satcu = dmar_find_satc(satc);
3393 	if (satcu)
3394 		return 0;
3395 
3396 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3397 	if (!satcu)
3398 		return -ENOMEM;
3399 
3400 	satcu->hdr = (void *)(satcu + 1);
3401 	memcpy(satcu->hdr, hdr, hdr->length);
3402 	satcu->atc_required = satc->flags & 0x1;
3403 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3404 					      (void *)satc + satc->header.length,
3405 					      &satcu->devices_cnt);
3406 	if (satcu->devices_cnt && !satcu->devices) {
3407 		kfree(satcu);
3408 		return -ENOMEM;
3409 	}
3410 	list_add_rcu(&satcu->list, &dmar_satc_units);
3411 
3412 	return 0;
3413 }
3414 
3415 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3416 {
3417 	int sp, ret;
3418 	struct intel_iommu *iommu = dmaru->iommu;
3419 
3420 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3421 	if (ret)
3422 		goto out;
3423 
3424 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3425 		pr_warn("%s: Doesn't support hardware pass through.\n",
3426 			iommu->name);
3427 		return -ENXIO;
3428 	}
3429 
3430 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3431 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3432 		pr_warn("%s: Doesn't support large page.\n",
3433 			iommu->name);
3434 		return -ENXIO;
3435 	}
3436 
3437 	/*
3438 	 * Disable translation if already enabled prior to OS handover.
3439 	 */
3440 	if (iommu->gcmd & DMA_GCMD_TE)
3441 		iommu_disable_translation(iommu);
3442 
3443 	ret = iommu_init_domains(iommu);
3444 	if (ret == 0)
3445 		ret = iommu_alloc_root_entry(iommu);
3446 	if (ret)
3447 		goto out;
3448 
3449 	intel_svm_check(iommu);
3450 
3451 	if (dmaru->ignored) {
3452 		/*
3453 		 * we always have to disable PMRs or DMA may fail on this device
3454 		 */
3455 		if (force_on)
3456 			iommu_disable_protect_mem_regions(iommu);
3457 		return 0;
3458 	}
3459 
3460 	intel_iommu_init_qi(iommu);
3461 	iommu_flush_write_buffer(iommu);
3462 
3463 #ifdef CONFIG_INTEL_IOMMU_SVM
3464 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3465 		ret = intel_svm_enable_prq(iommu);
3466 		if (ret)
3467 			goto disable_iommu;
3468 	}
3469 #endif
3470 	ret = dmar_set_interrupt(iommu);
3471 	if (ret)
3472 		goto disable_iommu;
3473 
3474 	iommu_set_root_entry(iommu);
3475 	iommu_enable_translation(iommu);
3476 
3477 	iommu_disable_protect_mem_regions(iommu);
3478 	return 0;
3479 
3480 disable_iommu:
3481 	disable_dmar_iommu(iommu);
3482 out:
3483 	free_dmar_iommu(iommu);
3484 	return ret;
3485 }
3486 
3487 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3488 {
3489 	int ret = 0;
3490 	struct intel_iommu *iommu = dmaru->iommu;
3491 
3492 	if (!intel_iommu_enabled)
3493 		return 0;
3494 	if (iommu == NULL)
3495 		return -EINVAL;
3496 
3497 	if (insert) {
3498 		ret = intel_iommu_add(dmaru);
3499 	} else {
3500 		disable_dmar_iommu(iommu);
3501 		free_dmar_iommu(iommu);
3502 	}
3503 
3504 	return ret;
3505 }
3506 
3507 static void intel_iommu_free_dmars(void)
3508 {
3509 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3510 	struct dmar_atsr_unit *atsru, *atsr_n;
3511 	struct dmar_satc_unit *satcu, *satc_n;
3512 
3513 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3514 		list_del(&rmrru->list);
3515 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3516 		kfree(rmrru);
3517 	}
3518 
3519 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3520 		list_del(&atsru->list);
3521 		intel_iommu_free_atsr(atsru);
3522 	}
3523 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3524 		list_del(&satcu->list);
3525 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3526 		kfree(satcu);
3527 	}
3528 }
3529 
3530 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3531 {
3532 	struct dmar_satc_unit *satcu;
3533 	struct acpi_dmar_satc *satc;
3534 	struct device *tmp;
3535 	int i;
3536 
3537 	dev = pci_physfn(dev);
3538 	rcu_read_lock();
3539 
3540 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3541 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3542 		if (satc->segment != pci_domain_nr(dev->bus))
3543 			continue;
3544 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3545 			if (to_pci_dev(tmp) == dev)
3546 				goto out;
3547 	}
3548 	satcu = NULL;
3549 out:
3550 	rcu_read_unlock();
3551 	return satcu;
3552 }
3553 
3554 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3555 {
3556 	int i, ret = 1;
3557 	struct pci_bus *bus;
3558 	struct pci_dev *bridge = NULL;
3559 	struct device *tmp;
3560 	struct acpi_dmar_atsr *atsr;
3561 	struct dmar_atsr_unit *atsru;
3562 	struct dmar_satc_unit *satcu;
3563 
3564 	dev = pci_physfn(dev);
3565 	satcu = dmar_find_matched_satc_unit(dev);
3566 	if (satcu)
3567 		/*
3568 		 * This device supports ATS as it is in the SATC table.
3569 		 * When the IOMMU is in legacy mode, enabling ATS is done
3570 		 * automatically by hardware for any device that requires
3571 		 * ATS, hence the OS should not enable ATS on this device,
3572 		 * to avoid duplicated TLB invalidation.
3573 		 */
3574 		return !(satcu->atc_required && !sm_supported(iommu));
3575 
3576 	for (bus = dev->bus; bus; bus = bus->parent) {
3577 		bridge = bus->self;
3578 		/* If it's an integrated device, allow ATS */
3579 		if (!bridge)
3580 			return 1;
3581 		/* Connected via non-PCIe: no ATS */
3582 		if (!pci_is_pcie(bridge) ||
3583 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3584 			return 0;
3585 		/* If we found the root port, look it up in the ATSR */
3586 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3587 			break;
3588 	}
3589 
3590 	rcu_read_lock();
3591 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3592 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3593 		if (atsr->segment != pci_domain_nr(dev->bus))
3594 			continue;
3595 
3596 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3597 			if (tmp == &bridge->dev)
3598 				goto out;
3599 
3600 		if (atsru->include_all)
3601 			goto out;
3602 	}
3603 	ret = 0;
3604 out:
3605 	rcu_read_unlock();
3606 
3607 	return ret;
3608 }
3609 
3610 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3611 {
3612 	int ret;
3613 	struct dmar_rmrr_unit *rmrru;
3614 	struct dmar_atsr_unit *atsru;
3615 	struct dmar_satc_unit *satcu;
3616 	struct acpi_dmar_atsr *atsr;
3617 	struct acpi_dmar_reserved_memory *rmrr;
3618 	struct acpi_dmar_satc *satc;
3619 
3620 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3621 		return 0;
3622 
3623 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3624 		rmrr = container_of(rmrru->hdr,
3625 				    struct acpi_dmar_reserved_memory, header);
3626 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3627 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3628 				((void *)rmrr) + rmrr->header.length,
3629 				rmrr->segment, rmrru->devices,
3630 				rmrru->devices_cnt);
3631 			if (ret < 0)
3632 				return ret;
3633 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3634 			dmar_remove_dev_scope(info, rmrr->segment,
3635 				rmrru->devices, rmrru->devices_cnt);
3636 		}
3637 	}
3638 
3639 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3640 		if (atsru->include_all)
3641 			continue;
3642 
3643 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3644 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3645 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3646 					(void *)atsr + atsr->header.length,
3647 					atsr->segment, atsru->devices,
3648 					atsru->devices_cnt);
3649 			if (ret > 0)
3650 				break;
3651 			else if (ret < 0)
3652 				return ret;
3653 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3654 			if (dmar_remove_dev_scope(info, atsr->segment,
3655 					atsru->devices, atsru->devices_cnt))
3656 				break;
3657 		}
3658 	}
3659 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3660 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3661 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3662 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3663 					(void *)satc + satc->header.length,
3664 					satc->segment, satcu->devices,
3665 					satcu->devices_cnt);
3666 			if (ret > 0)
3667 				break;
3668 			else if (ret < 0)
3669 				return ret;
3670 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3671 			if (dmar_remove_dev_scope(info, satc->segment,
3672 					satcu->devices, satcu->devices_cnt))
3673 				break;
3674 		}
3675 	}
3676 
3677 	return 0;
3678 }
3679 
3680 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3681 				       unsigned long val, void *v)
3682 {
3683 	struct memory_notify *mhp = v;
3684 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3685 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3686 			mhp->nr_pages - 1);
3687 
3688 	switch (val) {
3689 	case MEM_GOING_ONLINE:
3690 		if (iommu_domain_identity_map(si_domain,
3691 					      start_vpfn, last_vpfn)) {
3692 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3693 				start_vpfn, last_vpfn);
3694 			return NOTIFY_BAD;
3695 		}
3696 		break;
3697 
3698 	case MEM_OFFLINE:
3699 	case MEM_CANCEL_ONLINE:
3700 		{
3701 			struct dmar_drhd_unit *drhd;
3702 			struct intel_iommu *iommu;
3703 			LIST_HEAD(freelist);
3704 
3705 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3706 
3707 			rcu_read_lock();
3708 			for_each_active_iommu(iommu, drhd)
3709 				iommu_flush_iotlb_psi(iommu, si_domain,
3710 					start_vpfn, mhp->nr_pages,
3711 					list_empty(&freelist), 0);
3712 			rcu_read_unlock();
3713 			put_pages_list(&freelist);
3714 		}
3715 		break;
3716 	}
3717 
3718 	return NOTIFY_OK;
3719 }
3720 
3721 static struct notifier_block intel_iommu_memory_nb = {
3722 	.notifier_call = intel_iommu_memory_notifier,
3723 	.priority = 0
3724 };
3725 
3726 static void intel_disable_iommus(void)
3727 {
3728 	struct intel_iommu *iommu = NULL;
3729 	struct dmar_drhd_unit *drhd;
3730 
3731 	for_each_iommu(iommu, drhd)
3732 		iommu_disable_translation(iommu);
3733 }
3734 
3735 void intel_iommu_shutdown(void)
3736 {
3737 	struct dmar_drhd_unit *drhd;
3738 	struct intel_iommu *iommu = NULL;
3739 
3740 	if (no_iommu || dmar_disabled)
3741 		return;
3742 
3743 	down_write(&dmar_global_lock);
3744 
3745 	/* Disable PMRs explicitly here. */
3746 	for_each_iommu(iommu, drhd)
3747 		iommu_disable_protect_mem_regions(iommu);
3748 
3749 	/* Make sure the IOMMUs are switched off */
3750 	intel_disable_iommus();
3751 
3752 	up_write(&dmar_global_lock);
3753 }
3754 
3755 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3756 {
3757 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3758 
3759 	return container_of(iommu_dev, struct intel_iommu, iommu);
3760 }
3761 
3762 static ssize_t version_show(struct device *dev,
3763 			    struct device_attribute *attr, char *buf)
3764 {
3765 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3766 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3767 	return sprintf(buf, "%d:%d\n",
3768 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3769 }
3770 static DEVICE_ATTR_RO(version);
3771 
3772 static ssize_t address_show(struct device *dev,
3773 			    struct device_attribute *attr, char *buf)
3774 {
3775 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3777 }
3778 static DEVICE_ATTR_RO(address);
3779 
3780 static ssize_t cap_show(struct device *dev,
3781 			struct device_attribute *attr, char *buf)
3782 {
3783 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 	return sprintf(buf, "%llx\n", iommu->cap);
3785 }
3786 static DEVICE_ATTR_RO(cap);
3787 
3788 static ssize_t ecap_show(struct device *dev,
3789 			 struct device_attribute *attr, char *buf)
3790 {
3791 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 	return sprintf(buf, "%llx\n", iommu->ecap);
3793 }
3794 static DEVICE_ATTR_RO(ecap);
3795 
3796 static ssize_t domains_supported_show(struct device *dev,
3797 				      struct device_attribute *attr, char *buf)
3798 {
3799 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3801 }
3802 static DEVICE_ATTR_RO(domains_supported);
3803 
3804 static ssize_t domains_used_show(struct device *dev,
3805 				 struct device_attribute *attr, char *buf)
3806 {
3807 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3808 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3809 						  cap_ndoms(iommu->cap)));
3810 }
3811 static DEVICE_ATTR_RO(domains_used);
3812 
3813 static struct attribute *intel_iommu_attrs[] = {
3814 	&dev_attr_version.attr,
3815 	&dev_attr_address.attr,
3816 	&dev_attr_cap.attr,
3817 	&dev_attr_ecap.attr,
3818 	&dev_attr_domains_supported.attr,
3819 	&dev_attr_domains_used.attr,
3820 	NULL,
3821 };
3822 
3823 static struct attribute_group intel_iommu_group = {
3824 	.name = "intel-iommu",
3825 	.attrs = intel_iommu_attrs,
3826 };
3827 
3828 const struct attribute_group *intel_iommu_groups[] = {
3829 	&intel_iommu_group,
3830 	NULL,
3831 };
3832 
3833 static inline bool has_external_pci(void)
3834 {
3835 	struct pci_dev *pdev = NULL;
3836 
3837 	for_each_pci_dev(pdev)
3838 		if (pdev->external_facing)
3839 			return true;
3840 
3841 	return false;
3842 }
3843 
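/*
 * Honour the firmware's DMAR platform opt-in: if the platform requests DMA
 * protection and an external-facing PCI port is present, force the IOMMU on
 * even if it was disabled on the command line.  When it had been disabled,
 * the default domain type is switched to passthrough first.
 */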
3844 static int __init platform_optin_force_iommu(void)
3845 {
3846 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3847 		return 0;
3848 
3849 	if (no_iommu || dmar_disabled)
3850 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3851 
3852 	/*
3853 	 * If Intel-IOMMU is disabled by default, we will apply the identity
3854 	 * map for all devices except those marked as untrusted.
3855 	 */
3856 	if (dmar_disabled)
3857 		iommu_set_default_passthrough(false);
3858 
3859 	dmar_disabled = 0;
3860 	no_iommu = 0;
3861 
3862 	return 1;
3863 }
3864 
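/*
 * Walk the device scope of every active DRHD and probe ACPI namespace
 * devices: for each device on the ACPI bus, probe all of its physical
 * companion devices that are not yet part of an IOMMU group.
 */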
3865 static int __init probe_acpi_namespace_devices(void)
3866 {
3867 	struct dmar_drhd_unit *drhd;
3868 	/* To avoid a -Wunused-but-set-variable warning. */
3869 	struct intel_iommu *iommu __maybe_unused;
3870 	struct device *dev;
3871 	int i, ret = 0;
3872 
3873 	for_each_active_iommu(iommu, drhd) {
3874 		for_each_active_dev_scope(drhd->devices,
3875 					  drhd->devices_cnt, i, dev) {
3876 			struct acpi_device_physical_node *pn;
3877 			struct iommu_group *group;
3878 			struct acpi_device *adev;
3879 
3880 			if (dev->bus != &acpi_bus_type)
3881 				continue;
3882 
3883 			adev = to_acpi_device(dev);
3884 			mutex_lock(&adev->physical_node_lock);
3885 			list_for_each_entry(pn,
3886 					    &adev->physical_node_list, node) {
3887 				group = iommu_group_get(pn->dev);
3888 				if (group) {
3889 					iommu_group_put(group);
3890 					continue;
3891 				}
3892 
3893 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
3894 				ret = iommu_probe_device(pn->dev);
3895 				if (ret)
3896 					break;
3897 			}
3898 			mutex_unlock(&adev->physical_node_lock);
3899 
3900 			if (ret)
3901 				return ret;
3902 		}
3903 	}
3904 
3905 	return 0;
3906 }
3907 
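/*
 * A measured launch via tboot requires the IOMMU to stay enabled, so clear
 * no_iommu and dmar_disabled here (warning if they had been set) and report
 * that enabling was forced.
 */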
3908 static __init int tboot_force_iommu(void)
3909 {
3910 	if (!tboot_enabled())
3911 		return 0;
3912 
3913 	if (no_iommu || dmar_disabled)
3914 		pr_warn("Forcing Intel-IOMMU to be enabled\n");
3915 
3916 	dmar_disabled = 0;
3917 	no_iommu = 0;
3918 
3919 	return 1;
3920 }
3921 
3922 int __init intel_iommu_init(void)
3923 {
3924 	int ret = -ENODEV;
3925 	struct dmar_drhd_unit *drhd;
3926 	struct intel_iommu *iommu;
3927 
3928 	/*
3929 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3930 	 * opt in, so enforce that.
3931 	 */
3932 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3933 		    platform_optin_force_iommu();
3934 
3935 	down_write(&dmar_global_lock);
3936 	if (dmar_table_init()) {
3937 		if (force_on)
3938 			panic("tboot: Failed to initialize DMAR table\n");
3939 		goto out_free_dmar;
3940 	}
3941 
3942 	if (dmar_dev_scope_init() < 0) {
3943 		if (force_on)
3944 			panic("tboot: Failed to initialize DMAR device scope\n");
3945 		goto out_free_dmar;
3946 	}
3947 
3948 	up_write(&dmar_global_lock);
3949 
3950 	/*
3951 	 * The bus notifier takes the dmar_global_lock, so lockdep would
3952 	 * complain if we registered it while still holding the lock.
3953 	 */
3954 	dmar_register_bus_notifier();
3955 
3956 	down_write(&dmar_global_lock);
3957 
3958 	if (!no_iommu)
3959 		intel_iommu_debugfs_init();
3960 
3961 	if (no_iommu || dmar_disabled) {
3962 		/*
3963 		 * We exit the function here to ensure the IOMMU's remapping and
3964 		 * mempool aren't set up, which means that the IOMMU's PMRs
3965 		 * won't be disabled via the call to init_dmars(). So disable
3966 		 * them explicitly here. The PMRs were set up by tboot prior to
3967 		 * calling SENTER, but the kernel is expected to reset/tear
3968 		 * down the PMRs.
3969 		 */
3970 		if (intel_iommu_tboot_noforce) {
3971 			for_each_iommu(iommu, drhd)
3972 				iommu_disable_protect_mem_regions(iommu);
3973 		}
3974 
3975 		/*
3976 		 * Make sure the IOMMUs are switched off, even when we
3977 		 * boot into a kexec kernel and the previous kernel left
3978 		 * them enabled
3979 		 */
3980 		intel_disable_iommus();
3981 		goto out_free_dmar;
3982 	}
3983 
3984 	if (list_empty(&dmar_rmrr_units))
3985 		pr_info("No RMRR found\n");
3986 
3987 	if (list_empty(&dmar_atsr_units))
3988 		pr_info("No ATSR found\n");
3989 
3990 	if (list_empty(&dmar_satc_units))
3991 		pr_info("No SATC found\n");
3992 
3993 	init_no_remapping_devices();
3994 
3995 	ret = init_dmars();
3996 	if (ret) {
3997 		if (force_on)
3998 			panic("tboot: Failed to initialize DMARs\n");
3999 		pr_err("Initialization failed\n");
4000 		goto out_free_dmar;
4001 	}
4002 	up_write(&dmar_global_lock);
4003 
4004 	init_iommu_pm_ops();
4005 
4006 	down_read(&dmar_global_lock);
4007 	for_each_active_iommu(iommu, drhd) {
4008 		/*
4009 		 * The flush queue implementation does not perform
4010 		 * page-selective invalidations that are required for efficient
4011 		 * TLB flushes in virtual environments.  The benefit of batching
4012 		 * is likely to be much lower than the overhead of synchronizing
4013 		 * the virtual and physical IOMMU page-tables.
4014 		 */
4015 		if (cap_caching_mode(iommu->cap)) {
4016 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4017 			iommu_set_dma_strict();
4018 		}
4019 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4020 				       intel_iommu_groups,
4021 				       "%s", iommu->name);
4022 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4023 	}
4024 	up_read(&dmar_global_lock);
4025 
4026 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4027 	if (si_domain && !hw_pass_through)
4028 		register_memory_notifier(&intel_iommu_memory_nb);
4029 
4030 	down_read(&dmar_global_lock);
4031 	if (probe_acpi_namespace_devices())
4032 		pr_warn("ACPI name space devices didn't probe correctly\n");
4033 
4034 	/* Finally, we enable the DMA remapping hardware. */
4035 	for_each_iommu(iommu, drhd) {
4036 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4037 			iommu_enable_translation(iommu);
4038 
4039 		iommu_disable_protect_mem_regions(iommu);
4040 	}
4041 	up_read(&dmar_global_lock);
4042 
4043 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4044 
4045 	intel_iommu_enabled = 1;
4046 
4047 	return 0;
4048 
4049 out_free_dmar:
4050 	intel_iommu_free_dmars();
4051 	up_write(&dmar_global_lock);
4052 	return ret;
4053 }
4054 
4055 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4056 {
4057 	struct device_domain_info *info = opaque;
4058 
4059 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4060 	return 0;
4061 }
4062 
4063 /*
4064  * NB - intel-iommu lacks any sort of reference counting for the users of
4065  * dependent devices.  If multiple endpoints have intersecting dependent
4066  * devices, unbinding the driver from any one of them will possibly leave
4067  * the others unable to operate.
4068  */
4069 static void domain_context_clear(struct device_domain_info *info)
4070 {
4071 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4072 		return;
4073 
4074 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4075 			       &domain_context_clear_one_cb, info);
4076 }
4077 
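/*
 * Detach @dev from its current domain.  For anything other than a real DMA
 * sub-device this tears down the RID2PASID entry (in scalable mode),
 * disables the device IOTLB, clears the context entries of all DMA aliases
 * and frees the PASID table; the device is then unlinked from the domain.
 */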
4078 static void dmar_remove_one_dev_info(struct device *dev)
4079 {
4080 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4081 	struct dmar_domain *domain = info->domain;
4082 	struct intel_iommu *iommu = info->iommu;
4083 
4084 	if (!dev_is_real_dma_subdevice(info->dev)) {
4085 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4086 			intel_pasid_tear_down_entry(iommu, info->dev,
4087 					PASID_RID2PASID, false);
4088 
4089 		iommu_disable_dev_iotlb(info);
4090 		domain_context_clear(info);
4091 		intel_pasid_free_table(info->dev);
4092 	}
4093 
4094 	spin_lock(&domain->lock);
4095 	list_del(&info->link);
4096 	spin_unlock(&domain->lock);
4097 
4098 	domain_detach_iommu(domain, iommu);
4099 	info->domain = NULL;
4100 }
4101 
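/*
 * Initialize a domain allocated through the IOMMU core: derive the AGAW
 * from the requested guest address width and allocate the top-level page
 * directory.
 */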
4102 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4103 {
4104 	int adjust_width;
4105 
4106 	/* calculate AGAW */
4107 	domain->gaw = guest_width;
4108 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4109 	domain->agaw = width_to_agaw(adjust_width);
4110 
4111 	domain->iommu_coherency = false;
4112 	domain->iommu_superpage = 0;
4113 	domain->max_addr = 0;
4114 
4115 	/* always allocate the top pgd */
4116 	domain->pgd = alloc_pgtable_page(domain->nid);
4117 	if (!domain->pgd)
4118 		return -ENOMEM;
4119 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4120 	return 0;
4121 }
4122 
4123 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4124 {
4125 	struct dmar_domain *dmar_domain;
4126 	struct iommu_domain *domain;
4127 
4128 	switch (type) {
4129 	case IOMMU_DOMAIN_DMA:
4130 	case IOMMU_DOMAIN_DMA_FQ:
4131 	case IOMMU_DOMAIN_UNMANAGED:
4132 		dmar_domain = alloc_domain(type);
4133 		if (!dmar_domain) {
4134 			pr_err("Can't allocate dmar_domain\n");
4135 			return NULL;
4136 		}
4137 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4138 			pr_err("Domain initialization failed\n");
4139 			domain_exit(dmar_domain);
4140 			return NULL;
4141 		}
4142 
4143 		domain = &dmar_domain->domain;
4144 		domain->geometry.aperture_start = 0;
4145 		domain->geometry.aperture_end   =
4146 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4147 		domain->geometry.force_aperture = true;
4148 
4149 		return domain;
4150 	case IOMMU_DOMAIN_IDENTITY:
4151 		return &si_domain->domain;
4152 	default:
4153 		return NULL;
4154 	}
4155 
4156 	return NULL;
4157 }
4158 
4159 static void intel_iommu_domain_free(struct iommu_domain *domain)
4160 {
4161 	if (domain != &si_domain->domain)
4162 		domain_exit(to_dmar_domain(domain));
4163 }
4164 
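/*
 * Validate that @dev's IOMMU can handle the domain before attaching it: the
 * usable address width is clamped to MGAW, the domain must not already
 * contain mappings beyond that width, and any page-table levels the IOMMU
 * cannot walk are stripped from the top of the domain's page table.
 */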
4165 static int prepare_domain_attach_device(struct iommu_domain *domain,
4166 					struct device *dev)
4167 {
4168 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4169 	struct intel_iommu *iommu;
4170 	int addr_width;
4171 
4172 	iommu = device_to_iommu(dev, NULL, NULL);
4173 	if (!iommu)
4174 		return -ENODEV;
4175 
4176 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4177 		return -EOPNOTSUPP;
4178 
4179 	/* check if this iommu agaw is sufficient for max mapped address */
4180 	addr_width = agaw_to_width(iommu->agaw);
4181 	if (addr_width > cap_mgaw(iommu->cap))
4182 		addr_width = cap_mgaw(iommu->cap);
4183 
4184 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4185 		dev_err(dev,
4186 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4187 			__func__, addr_width, dmar_domain->max_addr);
4188 		return -EFAULT;
4189 	}
4190 	dmar_domain->gaw = addr_width;
4191 
4192 	/*
4193 	 * Knock out extra levels of page tables if necessary
4194 	 */
4195 	while (iommu->agaw < dmar_domain->agaw) {
4196 		struct dma_pte *pte;
4197 
4198 		pte = dmar_domain->pgd;
4199 		if (dma_pte_present(pte)) {
4200 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4201 			free_pgtable_page(pte);
4202 		}
4203 		dmar_domain->agaw--;
4204 	}
4205 
4206 	return 0;
4207 }
4208 
4209 static int intel_iommu_attach_device(struct iommu_domain *domain,
4210 				     struct device *dev)
4211 {
4212 	int ret;
4213 
4214 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4215 	    device_is_rmrr_locked(dev)) {
4216 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4217 		return -EPERM;
4218 	}
4219 
4220 	/* normally dev is not mapped */
4221 	if (unlikely(domain_context_mapped(dev))) {
4222 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4223 
4224 		if (info->domain)
4225 			dmar_remove_one_dev_info(dev);
4226 	}
4227 
4228 	ret = prepare_domain_attach_device(domain, dev);
4229 	if (ret)
4230 		return ret;
4231 
4232 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4233 }
4234 
4235 static void intel_iommu_detach_device(struct iommu_domain *domain,
4236 				      struct device *dev)
4237 {
4238 	dmar_remove_one_dev_info(dev);
4239 }
4240 
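/*
 * Map a physically contiguous range into the domain: translate the
 * IOMMU_READ/IOMMU_WRITE flags into DMA PTE bits (adding the snoop bit when
 * the domain forces snooping through the second-level tables), grow
 * max_addr after checking it still fits the domain's address width, and
 * install the PTEs through __domain_mapping().
 */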
4241 static int intel_iommu_map(struct iommu_domain *domain,
4242 			   unsigned long iova, phys_addr_t hpa,
4243 			   size_t size, int iommu_prot, gfp_t gfp)
4244 {
4245 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4246 	u64 max_addr;
4247 	int prot = 0;
4248 
4249 	if (iommu_prot & IOMMU_READ)
4250 		prot |= DMA_PTE_READ;
4251 	if (iommu_prot & IOMMU_WRITE)
4252 		prot |= DMA_PTE_WRITE;
4253 	if (dmar_domain->set_pte_snp)
4254 		prot |= DMA_PTE_SNP;
4255 
4256 	max_addr = iova + size;
4257 	if (dmar_domain->max_addr < max_addr) {
4258 		u64 end;
4259 
4260 		/* check if minimum agaw is sufficient for mapped address */
4261 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4262 		if (end < max_addr) {
4263 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4264 			       __func__, dmar_domain->gaw,
4265 			       max_addr);
4266 			return -EFAULT;
4267 		}
4268 		dmar_domain->max_addr = max_addr;
4269 	}
4270 	/* Round up size to next multiple of PAGE_SIZE, if it and
4271 	   the low bits of hpa would take us onto the next page */
4272 	size = aligned_nrpages(hpa, size);
4273 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4274 				hpa >> VTD_PAGE_SHIFT, size, prot);
4275 }
4276 
4277 static int intel_iommu_map_pages(struct iommu_domain *domain,
4278 				 unsigned long iova, phys_addr_t paddr,
4279 				 size_t pgsize, size_t pgcount,
4280 				 int prot, gfp_t gfp, size_t *mapped)
4281 {
4282 	unsigned long pgshift = __ffs(pgsize);
4283 	size_t size = pgcount << pgshift;
4284 	int ret;
4285 
4286 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4287 		return -EINVAL;
4288 
4289 	if (!IS_ALIGNED(iova | paddr, pgsize))
4290 		return -EINVAL;
4291 
4292 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4293 	if (!ret && mapped)
4294 		*mapped = size;
4295 
4296 	return ret;
4297 }
4298 
4299 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4300 				unsigned long iova, size_t size,
4301 				struct iommu_iotlb_gather *gather)
4302 {
4303 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4304 	unsigned long start_pfn, last_pfn;
4305 	int level = 0;
4306 
4307 	/* Cope with horrid API which requires us to unmap more than the
4308 	   size argument if it happens to be a large-page mapping. */
4309 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4310 
4311 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4312 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4313 
4314 	start_pfn = iova >> VTD_PAGE_SHIFT;
4315 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4316 
4317 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4318 
4319 	if (dmar_domain->max_addr == iova + size)
4320 		dmar_domain->max_addr = iova;
4321 
4322 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4323 
4324 	return size;
4325 }
4326 
4327 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4328 				      unsigned long iova,
4329 				      size_t pgsize, size_t pgcount,
4330 				      struct iommu_iotlb_gather *gather)
4331 {
4332 	unsigned long pgshift = __ffs(pgsize);
4333 	size_t size = pgcount << pgshift;
4334 
4335 	return intel_iommu_unmap(domain, iova, size, gather);
4336 }
4337 
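/*
 * Flush the IOVA range accumulated in @gather on every IOMMU this domain is
 * attached to, then release the page-table pages collected during unmap.
 */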
4338 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4339 				 struct iommu_iotlb_gather *gather)
4340 {
4341 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4342 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4343 	size_t size = gather->end - gather->start;
4344 	struct iommu_domain_info *info;
4345 	unsigned long start_pfn;
4346 	unsigned long nrpages;
4347 	unsigned long i;
4348 
4349 	nrpages = aligned_nrpages(gather->start, size);
4350 	start_pfn = mm_to_dma_pfn(iova_pfn);
4351 
4352 	xa_for_each(&dmar_domain->iommu_array, i, info)
4353 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4354 				      start_pfn, nrpages,
4355 				      list_empty(&gather->freelist), 0);
4356 
4357 	put_pages_list(&gather->freelist);
4358 }
4359 
4360 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4361 					    dma_addr_t iova)
4362 {
4363 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4364 	struct dma_pte *pte;
4365 	int level = 0;
4366 	u64 phys = 0;
4367 
4368 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4369 	if (pte && dma_pte_present(pte))
4370 		phys = dma_pte_addr(pte) +
4371 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4372 						VTD_PAGE_SHIFT) - 1));
4373 
4374 	return phys;
4375 }
4376 
4377 static bool domain_support_force_snooping(struct dmar_domain *domain)
4378 {
4379 	struct device_domain_info *info;
4380 	bool support = true;
4381 
4382 	assert_spin_locked(&domain->lock);
4383 	list_for_each_entry(info, &domain->devices, link) {
4384 		if (!ecap_sc_support(info->iommu->ecap)) {
4385 			support = false;
4386 			break;
4387 		}
4388 	}
4389 
4390 	return support;
4391 }
4392 
4393 static void domain_set_force_snooping(struct dmar_domain *domain)
4394 {
4395 	struct device_domain_info *info;
4396 
4397 	assert_spin_locked(&domain->lock);
4398 	/*
4399 	 * The second-level page table supports per-PTE snoop control. The
4400 	 * iommu_map() interface will handle this by setting the SNP bit.
4401 	 */
4402 	if (!domain_use_first_level(domain)) {
4403 		domain->set_pte_snp = true;
4404 		return;
4405 	}
4406 
4407 	list_for_each_entry(info, &domain->devices, link)
4408 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4409 						     PASID_RID2PASID);
4410 }
4411 
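/*
 * Enforce cache coherency (snooping) for all DMA through this domain.  This
 * succeeds only if every attached IOMMU supports snoop control; once set,
 * force_snooping is sticky and subsequent mappings and PASID entries honour
 * it.
 */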
4412 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4413 {
4414 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4415 
4416 	if (dmar_domain->force_snooping)
4417 		return true;
4418 
4419 	spin_lock(&dmar_domain->lock);
4420 	if (!domain_support_force_snooping(dmar_domain)) {
4421 		spin_unlock(&dmar_domain->lock);
4422 		return false;
4423 	}
4424 
4425 	domain_set_force_snooping(dmar_domain);
4426 	dmar_domain->force_snooping = true;
4427 	spin_unlock(&dmar_domain->lock);
4428 
4429 	return true;
4430 }
4431 
4432 static bool intel_iommu_capable(enum iommu_cap cap)
4433 {
4434 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4435 		return true;
4436 	if (cap == IOMMU_CAP_INTR_REMAP)
4437 		return irq_remapping_enabled == 1;
4438 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4439 		return dmar_platform_optin();
4440 
4441 	return false;
4442 }
4443 
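/*
 * Per-device setup at probe time: look up the IOMMU that covers @dev,
 * allocate its device_domain_info, record bus/devfn/segment, and cache
 * whether ATS, PASID and PRI can be used, before handing the iommu_device
 * back to the core.
 */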
4444 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4445 {
4446 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4447 	struct device_domain_info *info;
4448 	struct intel_iommu *iommu;
4449 	u8 bus, devfn;
4450 
4451 	iommu = device_to_iommu(dev, &bus, &devfn);
4452 	if (!iommu)
4453 		return ERR_PTR(-ENODEV);
4454 
4455 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4456 	if (!info)
4457 		return ERR_PTR(-ENOMEM);
4458 
4459 	if (dev_is_real_dma_subdevice(dev)) {
4460 		info->bus = pdev->bus->number;
4461 		info->devfn = pdev->devfn;
4462 		info->segment = pci_domain_nr(pdev->bus);
4463 	} else {
4464 		info->bus = bus;
4465 		info->devfn = devfn;
4466 		info->segment = iommu->segment;
4467 	}
4468 
4469 	info->dev = dev;
4470 	info->iommu = iommu;
4471 	if (dev_is_pci(dev)) {
4472 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4473 		    pci_ats_supported(pdev) &&
4474 		    dmar_ats_supported(pdev, iommu))
4475 			info->ats_supported = 1;
4476 
4477 		if (sm_supported(iommu)) {
4478 			if (pasid_supported(iommu)) {
4479 				int features = pci_pasid_features(pdev);
4480 
4481 				if (features >= 0)
4482 					info->pasid_supported = features | 1;
4483 			}
4484 
4485 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4486 			    pci_pri_supported(pdev))
4487 				info->pri_supported = 1;
4488 		}
4489 	}
4490 
4491 	dev_iommu_priv_set(dev, info);
4492 
4493 	return &iommu->iommu;
4494 }
4495 
4496 static void intel_iommu_release_device(struct device *dev)
4497 {
4498 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4499 
4500 	dmar_remove_one_dev_info(dev);
4501 	dev_iommu_priv_set(dev, NULL);
4502 	kfree(info);
4503 	set_dma_ops(dev, NULL);
4504 }
4505 
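/*
 * Probe finalization: clear any stale dma_ops and let the common IOMMU-DMA
 * code install the DMA API ops appropriate for this device.
 */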
4506 static void intel_iommu_probe_finalize(struct device *dev)
4507 {
4508 	set_dma_ops(dev, NULL);
4509 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4510 }
4511 
4512 static void intel_iommu_get_resv_regions(struct device *device,
4513 					 struct list_head *head)
4514 {
4515 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4516 	struct iommu_resv_region *reg;
4517 	struct dmar_rmrr_unit *rmrr;
4518 	struct device *i_dev;
4519 	int i;
4520 
4521 	down_read(&dmar_global_lock);
4522 	for_each_rmrr_units(rmrr) {
4523 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4524 					  i, i_dev) {
4525 			struct iommu_resv_region *resv;
4526 			enum iommu_resv_type type;
4527 			size_t length;
4528 
4529 			if (i_dev != device &&
4530 			    !is_downstream_to_pci_bridge(device, i_dev))
4531 				continue;
4532 
4533 			length = rmrr->end_address - rmrr->base_address + 1;
4534 
4535 			type = device_rmrr_is_relaxable(device) ?
4536 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4537 
4538 			resv = iommu_alloc_resv_region(rmrr->base_address,
4539 						       length, prot, type);
4540 			if (!resv)
4541 				break;
4542 
4543 			list_add_tail(&resv->list, head);
4544 		}
4545 	}
4546 	up_read(&dmar_global_lock);
4547 
4548 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4549 	if (dev_is_pci(device)) {
4550 		struct pci_dev *pdev = to_pci_dev(device);
4551 
4552 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4553 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4554 						   IOMMU_RESV_DIRECT_RELAXABLE);
4555 			if (reg)
4556 				list_add_tail(&reg->list, head);
4557 		}
4558 	}
4559 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4560 
4561 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4562 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4563 				      0, IOMMU_RESV_MSI);
4564 	if (!reg)
4565 		return;
4566 	list_add_tail(&reg->list, head);
4567 }
4568 
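/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the device's context cache entry) and enable PASID on the
 * device itself if that has not been done already.
 */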
4569 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4570 {
4571 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4572 	struct context_entry *context;
4573 	struct dmar_domain *domain;
4574 	u64 ctx_lo;
4575 	int ret;
4576 
4577 	domain = info->domain;
4578 	if (!domain)
4579 		return -EINVAL;
4580 
4581 	spin_lock(&iommu->lock);
4582 	ret = -EINVAL;
4583 	if (!info->pasid_supported)
4584 		goto out;
4585 
4586 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4587 	if (WARN_ON(!context))
4588 		goto out;
4589 
4590 	ctx_lo = context[0].lo;
4591 
4592 	if (!(ctx_lo & CONTEXT_PASIDE)) {
4593 		ctx_lo |= CONTEXT_PASIDE;
4594 		context[0].lo = ctx_lo;
4595 		wmb();
4596 		iommu->flush.flush_context(iommu,
4597 					   domain_id_iommu(domain, iommu),
4598 					   PCI_DEVID(info->bus, info->devfn),
4599 					   DMA_CCMD_MASK_NOBIT,
4600 					   DMA_CCMD_DEVICE_INVL);
4601 	}
4602 
4603 	/* Enable PASID support in the device, if it wasn't already */
4604 	if (!info->pasid_enabled)
4605 		iommu_enable_dev_iotlb(info);
4606 
4607 	ret = 0;
4608 
4609  out:
4610 	spin_unlock(&iommu->lock);
4611 
4612 	return ret;
4613 }
4614 
4615 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4616 {
4617 	if (dev_is_pci(dev))
4618 		return pci_device_group(dev);
4619 	return generic_device_group(dev);
4620 }
4621 
4622 static int intel_iommu_enable_sva(struct device *dev)
4623 {
4624 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4625 	struct intel_iommu *iommu;
4626 	int ret;
4627 
4628 	if (!info || dmar_disabled)
4629 		return -EINVAL;
4630 
4631 	iommu = info->iommu;
4632 	if (!iommu)
4633 		return -EINVAL;
4634 
4635 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4636 		return -ENODEV;
4637 
4638 	if (intel_iommu_enable_pasid(iommu, dev))
4639 		return -ENODEV;
4640 
4641 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 		return -EINVAL;
4643 
4644 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 	if (!ret)
4646 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647 
4648 	return ret;
4649 }
4650 
4651 static int intel_iommu_disable_sva(struct device *dev)
4652 {
4653 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 	struct intel_iommu *iommu = info->iommu;
4655 	int ret;
4656 
4657 	ret = iommu_unregister_device_fault_handler(dev);
4658 	if (!ret)
4659 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660 
4661 	return ret;
4662 }
4663 
4664 static int intel_iommu_enable_iopf(struct device *dev)
4665 {
4666 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4667 
4668 	if (info && info->pri_supported)
4669 		return 0;
4670 
4671 	return -ENODEV;
4672 }
4673 
4674 static int
4675 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676 {
4677 	switch (feat) {
4678 	case IOMMU_DEV_FEAT_IOPF:
4679 		return intel_iommu_enable_iopf(dev);
4680 
4681 	case IOMMU_DEV_FEAT_SVA:
4682 		return intel_iommu_enable_sva(dev);
4683 
4684 	default:
4685 		return -ENODEV;
4686 	}
4687 }
4688 
4689 static int
4690 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691 {
4692 	switch (feat) {
4693 	case IOMMU_DEV_FEAT_IOPF:
4694 		return 0;
4695 
4696 	case IOMMU_DEV_FEAT_SVA:
4697 		return intel_iommu_disable_sva(dev);
4698 
4699 	default:
4700 		return -ENODEV;
4701 	}
4702 }
4703 
4704 static bool intel_iommu_is_attach_deferred(struct device *dev)
4705 {
4706 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4707 
4708 	return translation_pre_enabled(info->iommu) && !info->domain;
4709 }
4710 
4711 /*
4712  * Check that the device does not live on an external-facing PCI port that is
4713  * marked as untrusted. Such devices should not be allowed to apply quirks,
4714  * and thus should not be able to bypass the IOMMU restrictions.
4715  */
4716 static bool risky_device(struct pci_dev *pdev)
4717 {
4718 	if (pdev->untrusted) {
4719 		pci_info(pdev,
4720 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 			 pdev->vendor, pdev->device);
4722 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 		return true;
4724 	}
4725 	return false;
4726 }
4727 
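/*
 * iotlb_sync_map callback: after a map operation, notify each IOMMU this
 * domain is attached to about the newly mapped PFN range.
 */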
4728 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 				       unsigned long iova, size_t size)
4730 {
4731 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 	unsigned long pages = aligned_nrpages(iova, size);
4733 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 	struct iommu_domain_info *info;
4735 	unsigned long i;
4736 
4737 	xa_for_each(&dmar_domain->iommu_array, i, info)
4738 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739 }
4740 
4741 const struct iommu_ops intel_iommu_ops = {
4742 	.capable		= intel_iommu_capable,
4743 	.domain_alloc		= intel_iommu_domain_alloc,
4744 	.probe_device		= intel_iommu_probe_device,
4745 	.probe_finalize		= intel_iommu_probe_finalize,
4746 	.release_device		= intel_iommu_release_device,
4747 	.get_resv_regions	= intel_iommu_get_resv_regions,
4748 	.device_group		= intel_iommu_device_group,
4749 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4750 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4751 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4752 	.def_domain_type	= device_def_domain_type,
4753 	.pgsize_bitmap		= SZ_4K,
4754 #ifdef CONFIG_INTEL_IOMMU_SVM
4755 	.sva_bind		= intel_svm_bind,
4756 	.sva_unbind		= intel_svm_unbind,
4757 	.sva_get_pasid		= intel_svm_get_pasid,
4758 	.page_response		= intel_svm_page_response,
4759 #endif
4760 	.default_domain_ops = &(const struct iommu_domain_ops) {
4761 		.attach_dev		= intel_iommu_attach_device,
4762 		.detach_dev		= intel_iommu_detach_device,
4763 		.map_pages		= intel_iommu_map_pages,
4764 		.unmap_pages		= intel_iommu_unmap_pages,
4765 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4766 		.flush_iotlb_all        = intel_flush_iotlb_all,
4767 		.iotlb_sync		= intel_iommu_tlb_sync,
4768 		.iova_to_phys		= intel_iommu_iova_to_phys,
4769 		.free			= intel_iommu_domain_free,
4770 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4771 	}
4772 };
4773 
4774 static void quirk_iommu_igfx(struct pci_dev *dev)
4775 {
4776 	if (risky_device(dev))
4777 		return;
4778 
4779 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4780 	dmar_map_gfx = 0;
4781 }
4782 
4783 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4791 
4792 /* Broadwell igfx malfunctions with dmar */
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4817 
4818 static void quirk_iommu_rwbf(struct pci_dev *dev)
4819 {
4820 	if (risky_device(dev))
4821 		return;
4822 
4823 	/*
4824 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4825 	 * but needs it. Same seems to hold for the desktop versions.
4826 	 */
4827 	pci_info(dev, "Forcing write-buffer flush capability\n");
4828 	rwbf_quirk = 1;
4829 }
4830 
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4832 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4833 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4834 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4838 
4839 #define GGC 0x52
4840 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4841 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4842 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4843 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4844 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4845 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4846 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4847 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4848 
4849 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4850 {
4851 	unsigned short ggc;
4852 
4853 	if (risky_device(dev))
4854 		return;
4855 
4856 	if (pci_read_config_word(dev, GGC, &ggc))
4857 		return;
4858 
4859 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4860 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4861 		dmar_map_gfx = 0;
4862 	} else if (dmar_map_gfx) {
4863 		/* we have to ensure the gfx device is idle before we flush */
4864 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4865 		iommu_set_dma_strict();
4866 	}
4867 }
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4872 
4873 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4874 {
4875 	unsigned short ver;
4876 
4877 	if (!IS_GFX_DEVICE(dev))
4878 		return;
4879 
4880 	ver = (dev->device >> 8) & 0xff;
4881 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4882 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4883 	    ver != 0x9a && ver != 0xa7)
4884 		return;
4885 
4886 	if (risky_device(dev))
4887 		return;
4888 
4889 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4890 	iommu_skip_te_disable = 1;
4891 }
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4893 
4894 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4895    ISOCH DMAR unit for the Azalia sound device, but not give it any
4896    TLB entries, which causes it to deadlock. Check for that.  We do
4897    this in a function called from init_dmars(), instead of in a PCI
4898    quirk, because we don't want to print the obnoxious "BIOS broken"
4899    message if VT-d is actually disabled.
4900 */
4901 static void __init check_tylersburg_isoch(void)
4902 {
4903 	struct pci_dev *pdev;
4904 	uint32_t vtisochctrl;
4905 
4906 	/* If there's no Azalia in the system anyway, forget it. */
4907 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4908 	if (!pdev)
4909 		return;
4910 
4911 	if (risky_device(pdev)) {
4912 		pci_dev_put(pdev);
4913 		return;
4914 	}
4915 
4916 	pci_dev_put(pdev);
4917 
4918 	/* System Management Registers. Might be hidden, in which case
4919 	   we can't do the sanity check. But that's OK, because the
4920 	   known-broken BIOSes _don't_ actually hide it, so far. */
4921 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4922 	if (!pdev)
4923 		return;
4924 
4925 	if (risky_device(pdev)) {
4926 		pci_dev_put(pdev);
4927 		return;
4928 	}
4929 
4930 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4931 		pci_dev_put(pdev);
4932 		return;
4933 	}
4934 
4935 	pci_dev_put(pdev);
4936 
4937 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4938 	if (vtisochctrl & 1)
4939 		return;
4940 
4941 	/* Drop all bits other than the number of TLB entries */
4942 	vtisochctrl &= 0x1c;
4943 
4944 	/* If we have the recommended number of TLB entries (16), fine. */
4945 	if (vtisochctrl == 0x10)
4946 		return;
4947 
4948 	/* Zero TLB entries? You get to ride the short bus to school. */
4949 	if (!vtisochctrl) {
4950 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4951 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4952 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4953 		     dmi_get_system_info(DMI_BIOS_VERSION),
4954 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4955 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4956 		return;
4957 	}
4958 
4959 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4960 	       vtisochctrl);
4961 }
4962