1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28 
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
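/*
 * Worked example of the AGAW/level arithmetic above (illustrative,
 * assuming VTD_PAGE_SHIFT == 12): agaw 1 maps to a 3-level table and a
 * 39-bit width, agaw 2 to 4 levels and 48 bits, agaw 3 to 5 levels and
 * 57 bits. Conversely, width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2,
 * and a level-2 (2MiB) entry spans lvl_to_nr_pages(2) = 1 << 9 = 512
 * 4KiB pages.
 */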
113 
114 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
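/*
 * Illustration of the conversion above: with 4KiB MM pages (the usual x86
 * case) PAGE_SHIFT == VTD_PAGE_SHIFT, so mm_to_dma_pfn() is an identity
 * mapping. On a hypothetical 64KiB-page configuration the shift would be
 * 4, i.e. each MM pfn covers 16 consecutive VT-d pfns, which is why VT-d
 * pages must never be larger than MM pages.
 */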
128 
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131 
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136 
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173 	context->lo &= ~(1ULL << 11);
174 }
175 
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178 	return !!(context->lo & (1ULL << 11));
179 }
180 
181 static inline void context_set_copied(struct context_entry *context)
182 {
183 	context->hi |= (1ull << 3);
184 }
185 
186 static inline bool context_copied(struct context_entry *context)
187 {
188 	return !!(context->hi & (1ULL << 3));
189 }
190 
191 static inline bool __context_present(struct context_entry *context)
192 {
193 	return (context->lo & 1);
194 }
195 
196 bool context_present(struct context_entry *context)
197 {
198 	return context_pasid_enabled(context) ?
199 	     __context_present(context) :
200 	     __context_present(context) && !context_copied(context);
201 }
202 
203 static inline void context_set_present(struct context_entry *context)
204 {
205 	context->lo |= 1;
206 }
207 
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210 	context->lo &= (((u64)-1) << 2) | 1;
211 }
212 
213 static inline void context_set_translation_type(struct context_entry *context,
214 						unsigned long value)
215 {
216 	context->lo &= (((u64)-1) << 4) | 3;
217 	context->lo |= (value & 3) << 2;
218 }
219 
220 static inline void context_set_address_root(struct context_entry *context,
221 					    unsigned long value)
222 {
223 	context->lo &= ~VTD_PAGE_MASK;
224 	context->lo |= value & VTD_PAGE_MASK;
225 }
226 
227 static inline void context_set_address_width(struct context_entry *context,
228 					     unsigned long value)
229 {
230 	context->hi |= value & 7;
231 }
232 
233 static inline void context_set_domain_id(struct context_entry *context,
234 					 unsigned long value)
235 {
236 	context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238 
239 static inline int context_domain_id(struct context_entry *c)
240 {
241 	return (c->hi >> 8) & 0xffff;
242 }
243 
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246 	context->lo = 0;
247 	context->hi = 0;
248 }
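/*
 * The helpers above hard-code the legacy-mode context-entry layout: in the
 * low qword, bit 0 is Present, bit 1 is Fault Processing Disable (cleared
 * by context_set_fault_enable()), bits 3:2 select the translation type and
 * bits 63:12 hold the second-level page-table root; in the high qword,
 * bits 2:0 carry the address width and bits 23:8 the domain id. For
 * example, context_set_domain_id(c, 5) ORs (5 & 0xffff) << 8 into c->hi,
 * which context_domain_id() reads back as (c->hi >> 8) & 0xffff.
 */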
249 
250 /*
251  * This domain is a statically identity mapping domain.
252  *	1. This domain creates a static 1:1 mapping to all usable memory.
253  *	2. It maps to each iommu if successful.
254  *	3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258 
259 #define for_each_domain_iommu(idx, domain)			\
260 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
261 		if (domain->iommu_refcnt[idx])
262 
263 struct dmar_rmrr_unit {
264 	struct list_head list;		/* list of rmrr units	*/
265 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
266 	u64	base_address;		/* reserved base address*/
267 	u64	end_address;		/* reserved end address */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	int	devices_cnt;		/* target device count */
270 };
271 
272 struct dmar_atsr_unit {
273 	struct list_head list;		/* list of ATSR units */
274 	struct acpi_dmar_header *hdr;	/* ACPI header */
275 	struct dmar_dev_scope *devices;	/* target devices */
276 	int devices_cnt;		/* target device count */
277 	u8 include_all:1;		/* include all ports */
278 };
279 
280 struct dmar_satc_unit {
281 	struct list_head list;		/* list of SATC units */
282 	struct acpi_dmar_header *hdr;	/* ACPI header */
283 	struct dmar_dev_scope *devices;	/* target devices */
284 	struct intel_iommu *iommu;	/* the corresponding iommu */
285 	int devices_cnt;		/* target device count */
286 	u8 atc_required:1;		/* ATS is required */
287 };
288 
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292 
293 #define for_each_rmrr_units(rmrr) \
294 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295 
296 /* number of IOMMUs; used to size and index the g_iommus array */
297 static int g_num_of_iommus;
298 
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302 
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305 
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308 
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313 
314 #define IDENTMAP_GFX		2
315 #define IDENTMAP_AZALIA		4
316 
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319 
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322 
323 const struct iommu_ops intel_iommu_ops;
324 
325 static bool translation_pre_enabled(struct intel_iommu *iommu)
326 {
327 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
328 }
329 
330 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
331 {
332 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
333 }
334 
335 static void init_translation_status(struct intel_iommu *iommu)
336 {
337 	u32 gsts;
338 
339 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
340 	if (gsts & DMA_GSTS_TES)
341 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
342 }
343 
344 static int __init intel_iommu_setup(char *str)
345 {
346 	if (!str)
347 		return -EINVAL;
348 
349 	while (*str) {
350 		if (!strncmp(str, "on", 2)) {
351 			dmar_disabled = 0;
352 			pr_info("IOMMU enabled\n");
353 		} else if (!strncmp(str, "off", 3)) {
354 			dmar_disabled = 1;
355 			no_platform_optin = 1;
356 			pr_info("IOMMU disabled\n");
357 		} else if (!strncmp(str, "igfx_off", 8)) {
358 			dmar_map_gfx = 0;
359 			pr_info("Disable GFX device mapping\n");
360 		} else if (!strncmp(str, "forcedac", 8)) {
361 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
362 			iommu_dma_forcedac = true;
363 		} else if (!strncmp(str, "strict", 6)) {
364 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
365 			iommu_set_dma_strict();
366 		} else if (!strncmp(str, "sp_off", 6)) {
367 			pr_info("Disable supported super page\n");
368 			intel_iommu_superpage = 0;
369 		} else if (!strncmp(str, "sm_on", 5)) {
370 			pr_info("Enable scalable mode if hardware supports\n");
371 			intel_iommu_sm = 1;
372 		} else if (!strncmp(str, "sm_off", 6)) {
373 			pr_info("Scalable mode is disallowed\n");
374 			intel_iommu_sm = 0;
375 		} else if (!strncmp(str, "tboot_noforce", 13)) {
376 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
377 			intel_iommu_tboot_noforce = 1;
378 		} else {
379 			pr_notice("Unknown option - '%s'\n", str);
380 		}
381 
382 		str += strcspn(str, ",");
383 		while (*str == ',')
384 			str++;
385 	}
386 
387 	return 1;
388 }
389 __setup("intel_iommu=", intel_iommu_setup);
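/*
 * Illustration only: booting with "intel_iommu=on,sm_on,sp_off" makes the
 * loop above walk the comma-separated tokens and leave dmar_disabled = 0,
 * intel_iommu_sm = 1 and intel_iommu_superpage = 0; unrecognised tokens
 * are reported with pr_notice() and otherwise ignored.
 */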
390 
391 void *alloc_pgtable_page(int node)
392 {
393 	struct page *page;
394 	void *vaddr = NULL;
395 
396 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
397 	if (page)
398 		vaddr = page_address(page);
399 	return vaddr;
400 }
401 
402 void free_pgtable_page(void *vaddr)
403 {
404 	free_page((unsigned long)vaddr);
405 }
406 
407 static inline int domain_type_is_si(struct dmar_domain *domain)
408 {
409 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
410 }
411 
412 static inline bool domain_use_first_level(struct dmar_domain *domain)
413 {
414 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
415 }
416 
417 static inline int domain_pfn_supported(struct dmar_domain *domain,
418 				       unsigned long pfn)
419 {
420 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
421 
422 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
423 }
424 
425 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
426 {
427 	unsigned long sagaw;
428 	int agaw;
429 
430 	sagaw = cap_sagaw(iommu->cap);
431 	for (agaw = width_to_agaw(max_gaw);
432 	     agaw >= 0; agaw--) {
433 		if (test_bit(agaw, &sagaw))
434 			break;
435 	}
436 
437 	return agaw;
438 }
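/*
 * Worked example: with max_gaw = DEFAULT_DOMAIN_ADDRESS_WIDTH (57),
 * width_to_agaw(57) = 3, so the loop starts at agaw 3 (5-level) and walks
 * down until it finds a set bit in the SAGAW capability field. An IOMMU
 * that only advertises 4-level tables (SAGAW bit 2) therefore yields
 * agaw 2, i.e. a 48-bit address width; a return value of -1 means nothing
 * at or below max_gaw is supported.
 */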
439 
440 /*
441  * Calculate max SAGAW for each iommu.
442  */
443 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
444 {
445 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
446 }
447 
448 /*
449  * Calculate agaw for each iommu.
450  * "SAGAW" may differ across iommus; use a default agaw and fall back
451  * to a smaller supported agaw for iommus that don't support the default.
452  */
453 int iommu_calculate_agaw(struct intel_iommu *iommu)
454 {
455 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
456 }
457 
458 /* This function only returns a single iommu in a domain */
459 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
460 {
461 	int iommu_id;
462 
463 	/* si_domain and vm domain should not get here. */
464 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
465 		return NULL;
466 
467 	for_each_domain_iommu(iommu_id, domain)
468 		break;
469 
470 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
471 		return NULL;
472 
473 	return g_iommus[iommu_id];
474 }
475 
476 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
477 {
478 	return sm_supported(iommu) ?
479 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
480 }
481 
482 static void domain_update_iommu_coherency(struct dmar_domain *domain)
483 {
484 	struct dmar_drhd_unit *drhd;
485 	struct intel_iommu *iommu;
486 	bool found = false;
487 	int i;
488 
489 	domain->iommu_coherency = true;
490 
491 	for_each_domain_iommu(i, domain) {
492 		found = true;
493 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
494 			domain->iommu_coherency = false;
495 			break;
496 		}
497 	}
498 	if (found)
499 		return;
500 
501 	/* No hardware attached; use lowest common denominator */
502 	rcu_read_lock();
503 	for_each_active_iommu(iommu, drhd) {
504 		if (!iommu_paging_structure_coherency(iommu)) {
505 			domain->iommu_coherency = false;
506 			break;
507 		}
508 	}
509 	rcu_read_unlock();
510 }
511 
512 static int domain_update_iommu_superpage(struct dmar_domain *domain,
513 					 struct intel_iommu *skip)
514 {
515 	struct dmar_drhd_unit *drhd;
516 	struct intel_iommu *iommu;
517 	int mask = 0x3;
518 
519 	if (!intel_iommu_superpage)
520 		return 0;
521 
522 	/* set iommu_superpage to the smallest common denominator */
523 	rcu_read_lock();
524 	for_each_active_iommu(iommu, drhd) {
525 		if (iommu != skip) {
526 			if (domain && domain_use_first_level(domain)) {
527 				if (!cap_fl1gp_support(iommu->cap))
528 					mask = 0x1;
529 			} else {
530 				mask &= cap_super_page_val(iommu->cap);
531 			}
532 
533 			if (!mask)
534 				break;
535 		}
536 	}
537 	rcu_read_unlock();
538 
539 	return fls(mask);
540 }
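/*
 * Example of the mask arithmetic above: mask starts at 0x3 (2MiB and 1GiB).
 * If any second-level IOMMU in the walk only advertises 2MiB superpages
 * (cap_super_page_val() == 0x1), the running mask drops to 0x1 and fls()
 * returns 1; if superpages are disabled or unsupported anywhere, the result
 * is 0 and only 4KiB mappings will be used.
 */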
541 
542 static int domain_update_device_node(struct dmar_domain *domain)
543 {
544 	struct device_domain_info *info;
545 	int nid = NUMA_NO_NODE;
546 
547 	assert_spin_locked(&device_domain_lock);
548 
549 	if (list_empty(&domain->devices))
550 		return NUMA_NO_NODE;
551 
552 	list_for_each_entry(info, &domain->devices, link) {
553 		if (!info->dev)
554 			continue;
555 
556 		/*
557 		 * There could be multiple device NUMA nodes, as devices within
558 		 * the same domain may sit behind different IOMMUs. There is no
559 		 * perfect answer in such a situation, so we use a first-come,
560 		 * first-served policy.
561 		 */
562 		nid = dev_to_node(info->dev);
563 		if (nid != NUMA_NO_NODE)
564 			break;
565 	}
566 
567 	return nid;
568 }
569 
570 static void domain_update_iotlb(struct dmar_domain *domain);
571 
572 /* Return the super pagesize bitmap if supported. */
573 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
574 {
575 	unsigned long bitmap = 0;
576 
577 	/*
578 	 * 1-level super page supports a page size of 2MiB, 2-level super page
579 	 * supports page sizes of both 2MiB and 1GiB.
580 	 */
581 	if (domain->iommu_superpage == 1)
582 		bitmap |= SZ_2M;
583 	else if (domain->iommu_superpage == 2)
584 		bitmap |= SZ_2M | SZ_1G;
585 
586 	return bitmap;
587 }
588 
589 /* Some capabilities may be different across iommus */
590 static void domain_update_iommu_cap(struct dmar_domain *domain)
591 {
592 	domain_update_iommu_coherency(domain);
593 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
594 
595 	/*
596 	 * If RHSA is missing, we should default to the device NUMA node
597 	 * as a fallback.
598 	 */
599 	if (domain->nid == NUMA_NO_NODE)
600 		domain->nid = domain_update_device_node(domain);
601 
602 	/*
603 	 * First-level translation restricts the input-address to a
604 	 * canonical address (i.e., address bits 63:N have the same
605 	 * value as address bit [N-1], where N is 48-bits with 4-level
606 	 * paging and 57-bits with 5-level paging). Hence, skip bit
607 	 * [N-1].
608 	 */
609 	if (domain_use_first_level(domain))
610 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
611 	else
612 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
613 
614 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
615 	domain_update_iotlb(domain);
616 }
617 
618 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
619 					 u8 devfn, int alloc)
620 {
621 	struct root_entry *root = &iommu->root_entry[bus];
622 	struct context_entry *context;
623 	u64 *entry;
624 
625 	entry = &root->lo;
626 	if (sm_supported(iommu)) {
627 		if (devfn >= 0x80) {
628 			devfn -= 0x80;
629 			entry = &root->hi;
630 		}
631 		devfn *= 2;
632 	}
633 	if (*entry & 1)
634 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
635 	else {
636 		unsigned long phy_addr;
637 		if (!alloc)
638 			return NULL;
639 
640 		context = alloc_pgtable_page(iommu->node);
641 		if (!context)
642 			return NULL;
643 
644 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
645 		phy_addr = virt_to_phys((void *)context);
646 		*entry = phy_addr | 1;
647 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
648 	}
649 	return &context[devfn];
650 }
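/*
 * Scalable-mode indexing example (illustrative): a root entry then covers
 * only half a bus, so for devfn 0x85 the code above switches to root->hi,
 * rebases devfn to 0x05 and doubles it, returning &context[0x0a] from the
 * upper context table -- scalable-mode context entries are 256 bits wide
 * and thus occupy two legacy-sized slots. A minimal, hypothetical lookup
 * that does not allocate on a miss would be:
 *
 *	struct context_entry *ce;
 *
 *	ce = iommu_context_addr(iommu, bus, devfn, 0);
 *	if (ce && context_present(ce))
 *		pr_info("context DID %d\n", context_domain_id(ce));
 */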
651 
652 /**
653  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
654  *				 sub-hierarchy of a candidate PCI-PCI bridge
655  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
656  * @bridge: the candidate PCI-PCI bridge
657  *
658  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
659  */
660 static bool
661 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
662 {
663 	struct pci_dev *pdev, *pbridge;
664 
665 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
666 		return false;
667 
668 	pdev = to_pci_dev(dev);
669 	pbridge = to_pci_dev(bridge);
670 
671 	if (pbridge->subordinate &&
672 	    pbridge->subordinate->number <= pdev->bus->number &&
673 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
674 		return true;
675 
676 	return false;
677 }
678 
679 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
680 {
681 	struct dmar_drhd_unit *drhd;
682 	u32 vtbar;
683 	int rc;
684 
685 	/* We know that this device on this chipset has its own IOMMU.
686 	 * If we find it under a different IOMMU, then the BIOS is lying
687 	 * to us. Hope that the IOMMU for this device is actually
688 	 * disabled, and it needs no translation...
689 	 */
690 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
691 	if (rc) {
692 		/* "can't" happen */
693 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
694 		return false;
695 	}
696 	vtbar &= 0xffff0000;
697 
698 	/* we know that this iommu should be at offset 0xa000 from vtbar */
699 	drhd = dmar_find_matched_drhd_unit(pdev);
700 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
701 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
702 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
703 		return true;
704 	}
705 
706 	return false;
707 }
708 
709 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
710 {
711 	if (!iommu || iommu->drhd->ignored)
712 		return true;
713 
714 	if (dev_is_pci(dev)) {
715 		struct pci_dev *pdev = to_pci_dev(dev);
716 
717 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
718 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
719 		    quirk_ioat_snb_local_iommu(pdev))
720 			return true;
721 	}
722 
723 	return false;
724 }
725 
726 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
727 {
728 	struct dmar_drhd_unit *drhd = NULL;
729 	struct pci_dev *pdev = NULL;
730 	struct intel_iommu *iommu;
731 	struct device *tmp;
732 	u16 segment = 0;
733 	int i;
734 
735 	if (!dev)
736 		return NULL;
737 
738 	if (dev_is_pci(dev)) {
739 		struct pci_dev *pf_pdev;
740 
741 		pdev = pci_real_dma_dev(to_pci_dev(dev));
742 
743 		/* VFs aren't listed in scope tables; we need to look up
744 		 * the PF instead to find the IOMMU. */
745 		pf_pdev = pci_physfn(pdev);
746 		dev = &pf_pdev->dev;
747 		segment = pci_domain_nr(pdev->bus);
748 	} else if (has_acpi_companion(dev))
749 		dev = &ACPI_COMPANION(dev)->dev;
750 
751 	rcu_read_lock();
752 	for_each_iommu(iommu, drhd) {
753 		if (pdev && segment != drhd->segment)
754 			continue;
755 
756 		for_each_active_dev_scope(drhd->devices,
757 					  drhd->devices_cnt, i, tmp) {
758 			if (tmp == dev) {
759 				/* For a VF use its original BDF# not that of the PF
760 				 * which we used for the IOMMU lookup. Strictly speaking
761 				 * we could do this for all PCI devices; we only need to
762 				 * get the BDF# from the scope table for ACPI matches. */
763 				if (pdev && pdev->is_virtfn)
764 					goto got_pdev;
765 
766 				if (bus && devfn) {
767 					*bus = drhd->devices[i].bus;
768 					*devfn = drhd->devices[i].devfn;
769 				}
770 				goto out;
771 			}
772 
773 			if (is_downstream_to_pci_bridge(dev, tmp))
774 				goto got_pdev;
775 		}
776 
777 		if (pdev && drhd->include_all) {
778 got_pdev:
779 			if (bus && devfn) {
780 				*bus = pdev->bus->number;
781 				*devfn = pdev->devfn;
782 			}
783 			goto out;
784 		}
785 	}
786 	iommu = NULL;
787 out:
788 	if (iommu_is_dummy(iommu, dev))
789 		iommu = NULL;
790 
791 	rcu_read_unlock();
792 
793 	return iommu;
794 }
795 
796 static void domain_flush_cache(struct dmar_domain *domain,
797 			       void *addr, int size)
798 {
799 	if (!domain->iommu_coherency)
800 		clflush_cache_range(addr, size);
801 }
802 
803 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
804 {
805 	struct context_entry *context;
806 	int ret = 0;
807 	unsigned long flags;
808 
809 	spin_lock_irqsave(&iommu->lock, flags);
810 	context = iommu_context_addr(iommu, bus, devfn, 0);
811 	if (context)
812 		ret = context_present(context);
813 	spin_unlock_irqrestore(&iommu->lock, flags);
814 	return ret;
815 }
816 
817 static void free_context_table(struct intel_iommu *iommu)
818 {
819 	int i;
820 	unsigned long flags;
821 	struct context_entry *context;
822 
823 	spin_lock_irqsave(&iommu->lock, flags);
824 	if (!iommu->root_entry) {
825 		goto out;
826 	}
827 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
828 		context = iommu_context_addr(iommu, i, 0, 0);
829 		if (context)
830 			free_pgtable_page(context);
831 
832 		if (!sm_supported(iommu))
833 			continue;
834 
835 		context = iommu_context_addr(iommu, i, 0x80, 0);
836 		if (context)
837 			free_pgtable_page(context);
838 
839 	}
840 	free_pgtable_page(iommu->root_entry);
841 	iommu->root_entry = NULL;
842 out:
843 	spin_unlock_irqrestore(&iommu->lock, flags);
844 }
845 
846 #ifdef CONFIG_DMAR_DEBUG
847 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
848 {
849 	struct device_domain_info *info;
850 	struct dma_pte *parent, *pte;
851 	struct dmar_domain *domain;
852 	int offset, level;
853 
854 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
855 	if (!info || !info->domain) {
856 		pr_info("device [%02x:%02x.%d] not probed\n",
857 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
858 		return;
859 	}
860 
861 	domain = info->domain;
862 	level = agaw_to_level(domain->agaw);
863 	parent = domain->pgd;
864 	if (!parent) {
865 		pr_info("no page table setup\n");
866 		return;
867 	}
868 
869 	while (1) {
870 		offset = pfn_level_offset(pfn, level);
871 		pte = &parent[offset];
872 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
873 			pr_info("PTE not present at level %d\n", level);
874 			break;
875 		}
876 
877 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
878 
879 		if (level == 1)
880 			break;
881 
882 		parent = phys_to_virt(dma_pte_addr(pte));
883 		level--;
884 	}
885 }
886 
887 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
888 			  unsigned long long addr, u32 pasid)
889 {
890 	struct pasid_dir_entry *dir, *pde;
891 	struct pasid_entry *entries, *pte;
892 	struct context_entry *ctx_entry;
893 	struct root_entry *rt_entry;
894 	u8 devfn = source_id & 0xff;
895 	u8 bus = source_id >> 8;
896 	int i, dir_index, index;
897 
898 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
899 
900 	/* root entry dump */
901 	rt_entry = &iommu->root_entry[bus];
902 	if (!rt_entry) {
903 		pr_info("root table entry is not present\n");
904 		return;
905 	}
906 
907 	if (sm_supported(iommu))
908 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
909 			rt_entry->hi, rt_entry->lo);
910 	else
911 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
912 
913 	/* context entry dump */
914 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
915 	if (!ctx_entry) {
916 		pr_info("context table entry is not present\n");
917 		return;
918 	}
919 
920 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
921 		ctx_entry->hi, ctx_entry->lo);
922 
923 	/* legacy mode does not require PASID entries */
924 	if (!sm_supported(iommu))
925 		goto pgtable_walk;
926 
927 	/* get the pointer to pasid directory entry */
928 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
929 	if (!dir) {
930 		pr_info("pasid directory entry is not present\n");
931 		return;
932 	}
933 	/* For request-without-pasid, get the pasid from context entry */
934 	if (intel_iommu_sm && pasid == INVALID_IOASID)
935 		pasid = PASID_RID2PASID;
936 
937 	dir_index = pasid >> PASID_PDE_SHIFT;
938 	pde = &dir[dir_index];
939 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
940 
941 	/* get the pointer to the pasid table entry */
942 	entries = get_pasid_table_from_pde(pde);
943 	if (!entries) {
944 		pr_info("pasid table entry is not present\n");
945 		return;
946 	}
947 	index = pasid & PASID_PTE_MASK;
948 	pte = &entries[index];
949 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
950 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
951 
952 pgtable_walk:
953 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
954 }
955 #endif
956 
957 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
958 				      unsigned long pfn, int *target_level)
959 {
960 	struct dma_pte *parent, *pte;
961 	int level = agaw_to_level(domain->agaw);
962 	int offset;
963 
964 	BUG_ON(!domain->pgd);
965 
966 	if (!domain_pfn_supported(domain, pfn))
967 		/* Address beyond IOMMU's addressing capabilities. */
968 		return NULL;
969 
970 	parent = domain->pgd;
971 
972 	while (1) {
973 		void *tmp_page;
974 
975 		offset = pfn_level_offset(pfn, level);
976 		pte = &parent[offset];
977 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
978 			break;
979 		if (level == *target_level)
980 			break;
981 
982 		if (!dma_pte_present(pte)) {
983 			uint64_t pteval;
984 
985 			tmp_page = alloc_pgtable_page(domain->nid);
986 
987 			if (!tmp_page)
988 				return NULL;
989 
990 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
991 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
992 			if (domain_use_first_level(domain)) {
993 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
994 				if (iommu_is_dma_domain(&domain->domain))
995 					pteval |= DMA_FL_PTE_ACCESS;
996 			}
997 			if (cmpxchg64(&pte->val, 0ULL, pteval))
998 				/* Someone else set it while we were thinking; use theirs. */
999 				free_pgtable_page(tmp_page);
1000 			else
1001 				domain_flush_cache(domain, pte, sizeof(*pte));
1002 		}
1003 		if (level == 1)
1004 			break;
1005 
1006 		parent = phys_to_virt(dma_pte_addr(pte));
1007 		level--;
1008 	}
1009 
1010 	if (!*target_level)
1011 		*target_level = level;
1012 
1013 	return pte;
1014 }
1015 
1016 /* return address's pte at specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018 					 unsigned long pfn,
1019 					 int level, int *large_page)
1020 {
1021 	struct dma_pte *parent, *pte;
1022 	int total = agaw_to_level(domain->agaw);
1023 	int offset;
1024 
1025 	parent = domain->pgd;
1026 	while (level <= total) {
1027 		offset = pfn_level_offset(pfn, total);
1028 		pte = &parent[offset];
1029 		if (level == total)
1030 			return pte;
1031 
1032 		if (!dma_pte_present(pte)) {
1033 			*large_page = total;
1034 			break;
1035 		}
1036 
1037 		if (dma_pte_superpage(pte)) {
1038 			*large_page = total;
1039 			return pte;
1040 		}
1041 
1042 		parent = phys_to_virt(dma_pte_addr(pte));
1043 		total--;
1044 	}
1045 	return NULL;
1046 }
1047 
1048 /* clear last level pte, a tlb flush should follow */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050 				unsigned long start_pfn,
1051 				unsigned long last_pfn)
1052 {
1053 	unsigned int large_page;
1054 	struct dma_pte *first_pte, *pte;
1055 
1056 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058 	BUG_ON(start_pfn > last_pfn);
1059 
1060 	/* we don't need lock here; nobody else touches the iova range */
1061 	do {
1062 		large_page = 1;
1063 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064 		if (!pte) {
1065 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1066 			continue;
1067 		}
1068 		do {
1069 			dma_clear_pte(pte);
1070 			start_pfn += lvl_to_nr_pages(large_page);
1071 			pte++;
1072 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073 
1074 		domain_flush_cache(domain, first_pte,
1075 				   (void *)pte - (void *)first_pte);
1076 
1077 	} while (start_pfn && start_pfn <= last_pfn);
1078 }
1079 
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081 			       int retain_level, struct dma_pte *pte,
1082 			       unsigned long pfn, unsigned long start_pfn,
1083 			       unsigned long last_pfn)
1084 {
1085 	pfn = max(start_pfn, pfn);
1086 	pte = &pte[pfn_level_offset(pfn, level)];
1087 
1088 	do {
1089 		unsigned long level_pfn;
1090 		struct dma_pte *level_pte;
1091 
1092 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1093 			goto next;
1094 
1095 		level_pfn = pfn & level_mask(level);
1096 		level_pte = phys_to_virt(dma_pte_addr(pte));
1097 
1098 		if (level > 2) {
1099 			dma_pte_free_level(domain, level - 1, retain_level,
1100 					   level_pte, level_pfn, start_pfn,
1101 					   last_pfn);
1102 		}
1103 
1104 		/*
1105 		 * Free the page table if we're below the level we want to
1106 		 * retain and the range covers the entire table.
1107 		 */
1108 		if (level < retain_level && !(start_pfn > level_pfn ||
1109 		      last_pfn < level_pfn + level_size(level) - 1)) {
1110 			dma_clear_pte(pte);
1111 			domain_flush_cache(domain, pte, sizeof(*pte));
1112 			free_pgtable_page(level_pte);
1113 		}
1114 next:
1115 		pfn += level_size(level);
1116 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1117 }
1118 
1119 /*
1120  * clear last level (leaf) ptes and free page table pages below the
1121  * level we wish to keep intact.
1122  */
1123 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1124 				   unsigned long start_pfn,
1125 				   unsigned long last_pfn,
1126 				   int retain_level)
1127 {
1128 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1129 
1130 	/* We don't need lock here; nobody else touches the iova range */
1131 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1132 			   domain->pgd, 0, start_pfn, last_pfn);
1133 
1134 	/* free pgd */
1135 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1136 		free_pgtable_page(domain->pgd);
1137 		domain->pgd = NULL;
1138 	}
1139 }
1140 
1141 /* When a page at a given level is being unlinked from its parent, we don't
1142    need to *modify* it at all. All we need to do is make a list of all the
1143    pages which can be freed just as soon as we've flushed the IOTLB and we
1144    know the hardware page-walk will no longer touch them.
1145    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1146    be freed. */
1147 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1148 				    int level, struct dma_pte *pte,
1149 				    struct list_head *freelist)
1150 {
1151 	struct page *pg;
1152 
1153 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1154 	list_add_tail(&pg->lru, freelist);
1155 
1156 	if (level == 1)
1157 		return;
1158 
1159 	pte = page_address(pg);
1160 	do {
1161 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1162 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1163 		pte++;
1164 	} while (!first_pte_in_page(pte));
1165 }
1166 
1167 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1168 				struct dma_pte *pte, unsigned long pfn,
1169 				unsigned long start_pfn, unsigned long last_pfn,
1170 				struct list_head *freelist)
1171 {
1172 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1173 
1174 	pfn = max(start_pfn, pfn);
1175 	pte = &pte[pfn_level_offset(pfn, level)];
1176 
1177 	do {
1178 		unsigned long level_pfn = pfn & level_mask(level);
1179 
1180 		if (!dma_pte_present(pte))
1181 			goto next;
1182 
1183 		/* If range covers entire pagetable, free it */
1184 		if (start_pfn <= level_pfn &&
1185 		    last_pfn >= level_pfn + level_size(level) - 1) {
1186 			/* These subordinate page tables are going away entirely. Don't
1187 			   bother to clear them; we're just going to *free* them. */
1188 			if (level > 1 && !dma_pte_superpage(pte))
1189 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1190 
1191 			dma_clear_pte(pte);
1192 			if (!first_pte)
1193 				first_pte = pte;
1194 			last_pte = pte;
1195 		} else if (level > 1) {
1196 			/* Recurse down into a level that isn't *entirely* obsolete */
1197 			dma_pte_clear_level(domain, level - 1,
1198 					    phys_to_virt(dma_pte_addr(pte)),
1199 					    level_pfn, start_pfn, last_pfn,
1200 					    freelist);
1201 		}
1202 next:
1203 		pfn = level_pfn + level_size(level);
1204 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1205 
1206 	if (first_pte)
1207 		domain_flush_cache(domain, first_pte,
1208 				   (void *)++last_pte - (void *)first_pte);
1209 }
1210 
1211 /* We can't just free the pages because the IOMMU may still be walking
1212    the page tables, and may have cached the intermediate levels. The
1213    pages can only be freed after the IOTLB flush has been done. */
1214 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1215 			 unsigned long last_pfn, struct list_head *freelist)
1216 {
1217 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1218 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1219 	BUG_ON(start_pfn > last_pfn);
1220 
1221 	/* we don't need lock here; nobody else touches the iova range */
1222 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1223 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1224 
1225 	/* free pgd */
1226 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1227 		struct page *pgd_page = virt_to_page(domain->pgd);
1228 		list_add_tail(&pgd_page->lru, freelist);
1229 		domain->pgd = NULL;
1230 	}
1231 }
1232 
1233 /* iommu handling */
1234 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1235 {
1236 	struct root_entry *root;
1237 	unsigned long flags;
1238 
1239 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1240 	if (!root) {
1241 		pr_err("Allocating root entry for %s failed\n",
1242 			iommu->name);
1243 		return -ENOMEM;
1244 	}
1245 
1246 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1247 
1248 	spin_lock_irqsave(&iommu->lock, flags);
1249 	iommu->root_entry = root;
1250 	spin_unlock_irqrestore(&iommu->lock, flags);
1251 
1252 	return 0;
1253 }
1254 
1255 static void iommu_set_root_entry(struct intel_iommu *iommu)
1256 {
1257 	u64 addr;
1258 	u32 sts;
1259 	unsigned long flag;
1260 
1261 	addr = virt_to_phys(iommu->root_entry);
1262 	if (sm_supported(iommu))
1263 		addr |= DMA_RTADDR_SMT;
1264 
1265 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1266 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1267 
1268 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1269 
1270 	/* Make sure hardware completes it */
1271 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272 		      readl, (sts & DMA_GSTS_RTPS), sts);
1273 
1274 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1275 
1276 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1277 	if (sm_supported(iommu))
1278 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1279 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1280 }
1281 
1282 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1283 {
1284 	u32 val;
1285 	unsigned long flag;
1286 
1287 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1288 		return;
1289 
1290 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1291 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1292 
1293 	/* Make sure hardware completes it */
1294 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1295 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1296 
1297 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1298 }
1299 
1300 /* return value determines if we need a write buffer flush */
1301 static void __iommu_flush_context(struct intel_iommu *iommu,
1302 				  u16 did, u16 source_id, u8 function_mask,
1303 				  u64 type)
1304 {
1305 	u64 val = 0;
1306 	unsigned long flag;
1307 
1308 	switch (type) {
1309 	case DMA_CCMD_GLOBAL_INVL:
1310 		val = DMA_CCMD_GLOBAL_INVL;
1311 		break;
1312 	case DMA_CCMD_DOMAIN_INVL:
1313 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1314 		break;
1315 	case DMA_CCMD_DEVICE_INVL:
1316 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1317 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1318 		break;
1319 	default:
1320 		BUG();
1321 	}
1322 	val |= DMA_CCMD_ICC;
1323 
1324 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1326 
1327 	/* Make sure hardware completes it */
1328 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1329 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1330 
1331 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332 }
1333 
1334 /* return value determines if we need a write buffer flush */
1335 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1336 				u64 addr, unsigned int size_order, u64 type)
1337 {
1338 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1339 	u64 val = 0, val_iva = 0;
1340 	unsigned long flag;
1341 
1342 	switch (type) {
1343 	case DMA_TLB_GLOBAL_FLUSH:
1344 		/* global flush doesn't need to set IVA_REG */
1345 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1346 		break;
1347 	case DMA_TLB_DSI_FLUSH:
1348 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1349 		break;
1350 	case DMA_TLB_PSI_FLUSH:
1351 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1352 		/* IH bit is passed in as part of address */
1353 		val_iva = size_order | addr;
1354 		break;
1355 	default:
1356 		BUG();
1357 	}
1358 	/* Note: set drain read/write */
1359 #if 0
1360 	/*
1361 	 * Read drain is probably only needed to be extra safe; it looks
1362 	 * like we can ignore it without any impact.
1363 	 */
1364 	if (cap_read_drain(iommu->cap))
1365 		val |= DMA_TLB_READ_DRAIN;
1366 #endif
1367 	if (cap_write_drain(iommu->cap))
1368 		val |= DMA_TLB_WRITE_DRAIN;
1369 
1370 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1371 	/* Note: Only uses first TLB reg currently */
1372 	if (val_iva)
1373 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1374 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1375 
1376 	/* Make sure hardware completes it */
1377 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1378 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1379 
1380 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 
1382 	/* check IOTLB invalidation granularity */
1383 	if (DMA_TLB_IAIG(val) == 0)
1384 		pr_err("Flush IOTLB failed\n");
1385 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1386 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1387 			(unsigned long long)DMA_TLB_IIRG(type),
1388 			(unsigned long long)DMA_TLB_IAIG(val));
1389 }
1390 
1391 static struct device_domain_info *
1392 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1393 			 u8 bus, u8 devfn)
1394 {
1395 	struct device_domain_info *info;
1396 
1397 	assert_spin_locked(&device_domain_lock);
1398 
1399 	if (!iommu->qi)
1400 		return NULL;
1401 
1402 	list_for_each_entry(info, &domain->devices, link)
1403 		if (info->iommu == iommu && info->bus == bus &&
1404 		    info->devfn == devfn) {
1405 			if (info->ats_supported && info->dev)
1406 				return info;
1407 			break;
1408 		}
1409 
1410 	return NULL;
1411 }
1412 
1413 static void domain_update_iotlb(struct dmar_domain *domain)
1414 {
1415 	struct device_domain_info *info;
1416 	bool has_iotlb_device = false;
1417 
1418 	assert_spin_locked(&device_domain_lock);
1419 
1420 	list_for_each_entry(info, &domain->devices, link)
1421 		if (info->ats_enabled) {
1422 			has_iotlb_device = true;
1423 			break;
1424 		}
1425 
1426 	domain->has_iotlb_device = has_iotlb_device;
1427 }
1428 
1429 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1430 {
1431 	struct pci_dev *pdev;
1432 
1433 	assert_spin_locked(&device_domain_lock);
1434 
1435 	if (!info || !dev_is_pci(info->dev))
1436 		return;
1437 
1438 	pdev = to_pci_dev(info->dev);
1439 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1440 	 * the PFSID to the invalidation desc of a VF such that IOMMU HW can
1441 	 * gauge queue depth at the PF level. If DIT is not set, PFSID is
1442 	 * treated as reserved and should be set to 0.
1443 	 */
1444 	if (!ecap_dit(info->iommu->ecap))
1445 		info->pfsid = 0;
1446 	else {
1447 		struct pci_dev *pf_pdev;
1448 
1449 		/* pdev will be returned if device is not a vf */
1450 		pf_pdev = pci_physfn(pdev);
1451 		info->pfsid = pci_dev_id(pf_pdev);
1452 	}
1453 
1454 #ifdef CONFIG_INTEL_IOMMU_SVM
1455 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1456 	   the device if you enable PASID support after ATS support is
1457 	   undefined. So always enable PASID support on devices which
1458 	   have it, even if we can't yet know if we're ever going to
1459 	   use it. */
1460 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1461 		info->pasid_enabled = 1;
1462 
1463 	if (info->pri_supported &&
1464 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1465 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1466 		info->pri_enabled = 1;
1467 #endif
1468 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1469 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1470 		info->ats_enabled = 1;
1471 		domain_update_iotlb(info->domain);
1472 		info->ats_qdep = pci_ats_queue_depth(pdev);
1473 	}
1474 }
1475 
1476 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1477 {
1478 	struct pci_dev *pdev;
1479 
1480 	assert_spin_locked(&device_domain_lock);
1481 
1482 	if (!dev_is_pci(info->dev))
1483 		return;
1484 
1485 	pdev = to_pci_dev(info->dev);
1486 
1487 	if (info->ats_enabled) {
1488 		pci_disable_ats(pdev);
1489 		info->ats_enabled = 0;
1490 		domain_update_iotlb(info->domain);
1491 	}
1492 #ifdef CONFIG_INTEL_IOMMU_SVM
1493 	if (info->pri_enabled) {
1494 		pci_disable_pri(pdev);
1495 		info->pri_enabled = 0;
1496 	}
1497 	if (info->pasid_enabled) {
1498 		pci_disable_pasid(pdev);
1499 		info->pasid_enabled = 0;
1500 	}
1501 #endif
1502 }
1503 
1504 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1505 				    u64 addr, unsigned int mask)
1506 {
1507 	u16 sid, qdep;
1508 
1509 	if (!info || !info->ats_enabled)
1510 		return;
1511 
1512 	sid = info->bus << 8 | info->devfn;
1513 	qdep = info->ats_qdep;
1514 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1515 			   qdep, addr, mask);
1516 }
1517 
1518 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1519 				  u64 addr, unsigned mask)
1520 {
1521 	unsigned long flags;
1522 	struct device_domain_info *info;
1523 
1524 	if (!domain->has_iotlb_device)
1525 		return;
1526 
1527 	spin_lock_irqsave(&device_domain_lock, flags);
1528 	list_for_each_entry(info, &domain->devices, link)
1529 		__iommu_flush_dev_iotlb(info, addr, mask);
1530 
1531 	spin_unlock_irqrestore(&device_domain_lock, flags);
1532 }
1533 
1534 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1535 				  struct dmar_domain *domain,
1536 				  unsigned long pfn, unsigned int pages,
1537 				  int ih, int map)
1538 {
1539 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1540 	unsigned int mask = ilog2(aligned_pages);
1541 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1542 	u16 did = domain->iommu_did[iommu->seq_id];
1543 
1544 	BUG_ON(pages == 0);
1545 
1546 	if (ih)
1547 		ih = 1 << 6;
1548 
1549 	if (domain_use_first_level(domain)) {
1550 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1551 	} else {
1552 		unsigned long bitmask = aligned_pages - 1;
1553 
1554 		/*
1555 		 * PSI masks the low order bits of the base address. If the
1556 		 * address isn't aligned to the mask, then compute a mask value
1557 		 * needed to ensure the target range is flushed.
1558 		 */
1559 		if (unlikely(bitmask & pfn)) {
1560 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1561 
1562 			/*
1563 			 * Since end_pfn <= pfn + bitmask, the only way bits
1564 			 * higher than bitmask can differ in pfn and end_pfn is
1565 			 * by carrying. This means after masking out bitmask,
1566 			 * high bits starting with the first set bit in
1567 			 * shared_bits are all equal in both pfn and end_pfn.
1568 			 */
1569 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1570 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1571 		}
1572 
1573 		/*
1574 		 * Fall back to domain-selective flush if there is no PSI support or
1575 		 * the size is too big.
1576 		 */
1577 		if (!cap_pgsel_inv(iommu->cap) ||
1578 		    mask > cap_max_amask_val(iommu->cap))
1579 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580 							DMA_TLB_DSI_FLUSH);
1581 		else
1582 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1583 							DMA_TLB_PSI_FLUSH);
1584 	}
1585 
1586 	/*
1587 	 * In caching mode, changing a page from non-present to present requires
1588 	 * a flush. However, the device IOTLB doesn't need to be flushed here.
1589 	 */
1590 	if (!cap_caching_mode(iommu->cap) || !map)
1591 		iommu_flush_dev_iotlb(domain, addr, mask);
1592 }
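/*
 * Worked example of the unaligned-PSI mask computation above: flushing
 * pages = 2 starting at pfn 0x1003 gives aligned_pages = 2 and bitmask = 1,
 * so bitmask & pfn is set; end_pfn = 0x1004 and
 * shared_bits = ~(0x1003 ^ 0x1004) & ~1 = ...fff8, whose __ffs() is 3.
 * The mask therefore becomes 3 and the hardware flushes the naturally
 * aligned 8-page region 0x1000-0x1007, which covers 0x1003-0x1004.
 */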
1593 
1594 /* Notification for newly created mappings */
1595 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1596 					struct dmar_domain *domain,
1597 					unsigned long pfn, unsigned int pages)
1598 {
1599 	/*
1600 	 * It's a non-present to present mapping. Only flush if caching mode
1601 	 * is enabled and the domain uses second-level translation.
1602 	 */
1603 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1604 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1605 	else
1606 		iommu_flush_write_buffer(iommu);
1607 }
1608 
1609 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1610 {
1611 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1612 	int idx;
1613 
1614 	for_each_domain_iommu(idx, dmar_domain) {
1615 		struct intel_iommu *iommu = g_iommus[idx];
1616 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1617 
1618 		if (domain_use_first_level(dmar_domain))
1619 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1620 		else
1621 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622 						 DMA_TLB_DSI_FLUSH);
1623 
1624 		if (!cap_caching_mode(iommu->cap))
1625 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1626 	}
1627 }
1628 
1629 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1630 {
1631 	u32 pmen;
1632 	unsigned long flags;
1633 
1634 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1635 		return;
1636 
1637 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1639 	pmen &= ~DMA_PMEN_EPM;
1640 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1641 
1642 	/* wait for the protected region status bit to clear */
1643 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1644 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1645 
1646 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1647 }
1648 
1649 static void iommu_enable_translation(struct intel_iommu *iommu)
1650 {
1651 	u32 sts;
1652 	unsigned long flags;
1653 
1654 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1655 	iommu->gcmd |= DMA_GCMD_TE;
1656 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1657 
1658 	/* Make sure hardware completes it */
1659 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1660 		      readl, (sts & DMA_GSTS_TES), sts);
1661 
1662 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1663 }
1664 
1665 static void iommu_disable_translation(struct intel_iommu *iommu)
1666 {
1667 	u32 sts;
1668 	unsigned long flag;
1669 
1670 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1671 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1672 		return;
1673 
1674 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1675 	iommu->gcmd &= ~DMA_GCMD_TE;
1676 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1677 
1678 	/* Make sure hardware completes it */
1679 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1680 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1681 
1682 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1683 }
1684 
1685 static int iommu_init_domains(struct intel_iommu *iommu)
1686 {
1687 	u32 ndomains;
1688 
1689 	ndomains = cap_ndoms(iommu->cap);
1690 	pr_debug("%s: Number of Domains supported <%d>\n",
1691 		 iommu->name, ndomains);
1692 
1693 	spin_lock_init(&iommu->lock);
1694 
1695 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1696 	if (!iommu->domain_ids)
1697 		return -ENOMEM;
1698 
1699 	/*
1700 	 * If Caching mode is set, then invalid translations are tagged
1701 	 * with domain-id 0, hence we need to pre-allocate it. We also
1702 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1703 	 * make sure it is not used for a real domain.
1704 	 */
1705 	set_bit(0, iommu->domain_ids);
1706 
1707 	/*
1708 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1709 	 * entry for first-level or pass-through translation modes should
1710 	 * be programmed with a domain id different from those used for
1711 	 * second-level or nested translation. We reserve a domain id for
1712 	 * this purpose.
1713 	 */
1714 	if (sm_supported(iommu))
1715 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1716 
1717 	return 0;
1718 }
1719 
1720 static void disable_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722 	struct device_domain_info *info, *tmp;
1723 	unsigned long flags;
1724 
1725 	if (!iommu->domain_ids)
1726 		return;
1727 
1728 	spin_lock_irqsave(&device_domain_lock, flags);
1729 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1730 		if (info->iommu != iommu)
1731 			continue;
1732 
1733 		if (!info->dev || !info->domain)
1734 			continue;
1735 
1736 		__dmar_remove_one_dev_info(info);
1737 	}
1738 	spin_unlock_irqrestore(&device_domain_lock, flags);
1739 
1740 	if (iommu->gcmd & DMA_GCMD_TE)
1741 		iommu_disable_translation(iommu);
1742 }
1743 
1744 static void free_dmar_iommu(struct intel_iommu *iommu)
1745 {
1746 	if (iommu->domain_ids) {
1747 		bitmap_free(iommu->domain_ids);
1748 		iommu->domain_ids = NULL;
1749 	}
1750 
1751 	g_iommus[iommu->seq_id] = NULL;
1752 
1753 	/* free context mapping */
1754 	free_context_table(iommu);
1755 
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757 	if (pasid_supported(iommu)) {
1758 		if (ecap_prs(iommu->ecap))
1759 			intel_svm_finish_prq(iommu);
1760 	}
1761 	if (vccap_pasid(iommu->vccap))
1762 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1763 
1764 #endif
1765 }
1766 
1767 /*
1768  * Check and return whether first level is used by default for
1769  * DMA translation.
1770  */
1771 static bool first_level_by_default(unsigned int type)
1772 {
1773 	/* Only SL is available in legacy mode */
1774 	if (!scalable_mode_support())
1775 		return false;
1776 
1777 	/* Only one level (either FL or SL) is available, just use it */
1778 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1779 		return intel_cap_flts_sanity();
1780 
1781 	/* Both levels are available, decide it based on domain type */
1782 	return type != IOMMU_DOMAIN_UNMANAGED;
1783 }
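/*
 * Decision summary for the helper above: legacy mode always means
 * second-level; if exactly one of FL/SL is usable across all IOMMUs, the
 * XOR picks that one; only when both are available does the domain type
 * matter, with UNMANAGED domains staying on second-level and everything
 * else defaulting to first-level.
 */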
1784 
1785 static struct dmar_domain *alloc_domain(unsigned int type)
1786 {
1787 	struct dmar_domain *domain;
1788 
1789 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1790 	if (!domain)
1791 		return NULL;
1792 
1793 	domain->nid = NUMA_NO_NODE;
1794 	if (first_level_by_default(type))
1795 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1796 	domain->has_iotlb_device = false;
1797 	INIT_LIST_HEAD(&domain->devices);
1798 
1799 	return domain;
1800 }
1801 
1802 /* Must be called with device_domain_lock and iommu->lock held */
1803 static int domain_attach_iommu(struct dmar_domain *domain,
1804 			       struct intel_iommu *iommu)
1805 {
1806 	unsigned long ndomains;
1807 	int num;
1808 
1809 	assert_spin_locked(&device_domain_lock);
1810 	assert_spin_locked(&iommu->lock);
1811 
1812 	domain->iommu_refcnt[iommu->seq_id] += 1;
1813 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1814 		ndomains = cap_ndoms(iommu->cap);
1815 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1816 
1817 		if (num >= ndomains) {
1818 			pr_err("%s: No free domain ids\n", iommu->name);
1819 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1820 			return -ENOSPC;
1821 		}
1822 
1823 		set_bit(num, iommu->domain_ids);
1824 		domain->iommu_did[iommu->seq_id] = num;
1825 		domain->nid			 = iommu->node;
1826 		domain_update_iommu_cap(domain);
1827 	}
1828 
1829 	return 0;
1830 }
1831 
1832 static void domain_detach_iommu(struct dmar_domain *domain,
1833 				struct intel_iommu *iommu)
1834 {
1835 	int num;
1836 
1837 	assert_spin_locked(&device_domain_lock);
1838 	assert_spin_locked(&iommu->lock);
1839 
1840 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1841 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1842 		num = domain->iommu_did[iommu->seq_id];
1843 		clear_bit(num, iommu->domain_ids);
1844 		domain_update_iommu_cap(domain);
1845 		domain->iommu_did[iommu->seq_id] = 0;
1846 	}
1847 }
1848 
1849 static inline int guestwidth_to_adjustwidth(int gaw)
1850 {
1851 	int agaw;
1852 	int r = (gaw - 12) % 9;
1853 
1854 	if (r == 0)
1855 		agaw = gaw;
1856 	else
1857 		agaw = gaw + 9 - r;
1858 	if (agaw > 64)
1859 		agaw = 64;
1860 	return agaw;
1861 }
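/*
 * Examples of the rounding above: a guest width of 48 bits is already
 * 12 + 4 * 9, so it is returned unchanged; 46 bits gives r = 7 and rounds
 * up to 48; any result that would exceed 64 is clamped to 64.
 */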
1862 
1863 static void domain_exit(struct dmar_domain *domain)
1864 {
1865 
1866 	/* Remove associated devices and clear attached or cached domains */
1867 	domain_remove_dev_info(domain);
1868 
1869 	if (domain->pgd) {
1870 		LIST_HEAD(freelist);
1871 
1872 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1873 		put_pages_list(&freelist);
1874 	}
1875 
1876 	kfree(domain);
1877 }
1878 
1879 /*
1880  * Get the PASID directory size for scalable mode context entry.
1881  * Value of X in the PDTS field of a scalable mode context entry
1882  * indicates PASID directory with 2^(X + 7) entries.
1883  */
1884 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1885 {
1886 	unsigned long pds, max_pde;
1887 
1888 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1889 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1890 	if (pds < 7)
1891 		return 0;
1892 
1893 	return pds - 7;
1894 }
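/*
 * As a rough example, if max_pasid is 1 << 20 and PASID_PDE_SHIFT is 6
 * (64 PASIDs per directory entry), max_pde is 1 << 14, the first set
 * bit is bit 14, and the function returns 14 - 7 = 7, i.e. a PASID
 * directory with 2^(7 + 7) = 2^14 entries.
 */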
1895 
1896 /*
1897  * Set the RID_PASID field of a scalable mode context entry. The
1898  * IOMMU hardware will use the PASID value set in this field for
1899  * DMA translations of DMA requests without PASID.
1900  */
1901 static inline void
1902 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1903 {
1904 	context->hi |= pasid & ((1 << 20) - 1);
1905 }
1906 
1907 /*
1908  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1909  * entry.
1910  */
1911 static inline void context_set_sm_dte(struct context_entry *context)
1912 {
1913 	context->lo |= (1 << 2);
1914 }
1915 
1916 /*
1917  * Set the PRE(Page Request Enable) field of a scalable mode context
1918  * entry.
1919  */
1920 static inline void context_set_sm_pre(struct context_entry *context)
1921 {
1922 	context->lo |= (1 << 4);
1923 }
1924 
1925 /* Convert value to context PASID directory size field coding. */
1926 #define context_pdts(pds)	(((pds) & 0x7) << 9)
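/*
 * For example, pds == 2 lands in bits 11:9 of the low qword and encodes
 * a PASID directory with 2^(2 + 7) = 512 entries.
 */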
1927 
1928 static int domain_context_mapping_one(struct dmar_domain *domain,
1929 				      struct intel_iommu *iommu,
1930 				      struct pasid_table *table,
1931 				      u8 bus, u8 devfn)
1932 {
1933 	u16 did = domain->iommu_did[iommu->seq_id];
1934 	int translation = CONTEXT_TT_MULTI_LEVEL;
1935 	struct device_domain_info *info = NULL;
1936 	struct context_entry *context;
1937 	unsigned long flags;
1938 	int ret;
1939 
1940 	WARN_ON(did == 0);
1941 
1942 	if (hw_pass_through && domain_type_is_si(domain))
1943 		translation = CONTEXT_TT_PASS_THROUGH;
1944 
1945 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1946 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1947 
1948 	BUG_ON(!domain->pgd);
1949 
1950 	spin_lock_irqsave(&device_domain_lock, flags);
1951 	spin_lock(&iommu->lock);
1952 
1953 	ret = -ENOMEM;
1954 	context = iommu_context_addr(iommu, bus, devfn, 1);
1955 	if (!context)
1956 		goto out_unlock;
1957 
1958 	ret = 0;
1959 	if (context_present(context))
1960 		goto out_unlock;
1961 
1962 	/*
1963 	 * For kdump cases, old valid entries may be cached due to
1964 	 * in-flight DMA and the copied page table, but there is no
1965 	 * unmapping behaviour for them, so we need an explicit cache
1966 	 * flush for the newly-mapped device. For kdump, at this point
1967 	 * the device is expected to have finished its reset during its
1968 	 * driver's probe stage, so no in-flight DMA will exist and no
1969 	 * further flushing is needed.
1970 	 */
1971 	if (context_copied(context)) {
1972 		u16 did_old = context_domain_id(context);
1973 
1974 		if (did_old < cap_ndoms(iommu->cap)) {
1975 			iommu->flush.flush_context(iommu, did_old,
1976 						   (((u16)bus) << 8) | devfn,
1977 						   DMA_CCMD_MASK_NOBIT,
1978 						   DMA_CCMD_DEVICE_INVL);
1979 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1980 						 DMA_TLB_DSI_FLUSH);
1981 		}
1982 	}
1983 
1984 	context_clear_entry(context);
1985 
1986 	if (sm_supported(iommu)) {
1987 		unsigned long pds;
1988 
1989 		WARN_ON(!table);
1990 
1991 		/* Setup the PASID DIR pointer: */
1992 		pds = context_get_sm_pds(table);
1993 		context->lo = (u64)virt_to_phys(table->table) |
1994 				context_pdts(pds);
1995 
1996 		/* Setup the RID_PASID field: */
1997 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1998 
1999 		/*
2000 		 * Setup the Device-TLB enable bit and Page request
2001 		 * Enable bit:
2002 		 */
2003 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2004 		if (info && info->ats_supported)
2005 			context_set_sm_dte(context);
2006 		if (info && info->pri_supported)
2007 			context_set_sm_pre(context);
2008 	} else {
2009 		struct dma_pte *pgd = domain->pgd;
2010 		int agaw;
2011 
2012 		context_set_domain_id(context, did);
2013 
2014 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2015 			/*
2016 			 * Skip top levels of page tables for an IOMMU whose
2017 			 * agaw is less than the domain's. Unnecessary for PT mode.
2018 			 */
2019 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2020 				ret = -ENOMEM;
2021 				pgd = phys_to_virt(dma_pte_addr(pgd));
2022 				if (!dma_pte_present(pgd))
2023 					goto out_unlock;
2024 			}
2025 
2026 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2027 			if (info && info->ats_supported)
2028 				translation = CONTEXT_TT_DEV_IOTLB;
2029 			else
2030 				translation = CONTEXT_TT_MULTI_LEVEL;
2031 
2032 			context_set_address_root(context, virt_to_phys(pgd));
2033 			context_set_address_width(context, agaw);
2034 		} else {
2035 			/*
2036 			 * In pass-through mode, AW must be programmed to
2037 			 * indicate the largest AGAW value supported by
2038 			 * hardware, and ASR is ignored by hardware.
2039 			 */
2040 			context_set_address_width(context, iommu->msagaw);
2041 		}
2042 
2043 		context_set_translation_type(context, translation);
2044 	}
2045 
2046 	context_set_fault_enable(context);
2047 	context_set_present(context);
2048 	if (!ecap_coherent(iommu->ecap))
2049 		clflush_cache_range(context, sizeof(*context));
2050 
2051 	/*
2052 	 * It's a non-present to present mapping. If hardware doesn't cache
2053 	 * non-present entries, we only need to flush the write-buffer. If it
2054 	 * _does_ cache non-present entries, then it does so in the special
2055 	 * domain #0, which we have to flush:
2056 	 */
2057 	if (cap_caching_mode(iommu->cap)) {
2058 		iommu->flush.flush_context(iommu, 0,
2059 					   (((u16)bus) << 8) | devfn,
2060 					   DMA_CCMD_MASK_NOBIT,
2061 					   DMA_CCMD_DEVICE_INVL);
2062 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2063 	} else {
2064 		iommu_flush_write_buffer(iommu);
2065 	}
2066 	iommu_enable_dev_iotlb(info);
2067 
2068 	ret = 0;
2069 
2070 out_unlock:
2071 	spin_unlock(&iommu->lock);
2072 	spin_unlock_irqrestore(&device_domain_lock, flags);
2073 
2074 	return ret;
2075 }
2076 
2077 struct domain_context_mapping_data {
2078 	struct dmar_domain *domain;
2079 	struct intel_iommu *iommu;
2080 	struct pasid_table *table;
2081 };
2082 
2083 static int domain_context_mapping_cb(struct pci_dev *pdev,
2084 				     u16 alias, void *opaque)
2085 {
2086 	struct domain_context_mapping_data *data = opaque;
2087 
2088 	return domain_context_mapping_one(data->domain, data->iommu,
2089 					  data->table, PCI_BUS_NUM(alias),
2090 					  alias & 0xff);
2091 }
2092 
2093 static int
2094 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2095 {
2096 	struct domain_context_mapping_data data;
2097 	struct pasid_table *table;
2098 	struct intel_iommu *iommu;
2099 	u8 bus, devfn;
2100 
2101 	iommu = device_to_iommu(dev, &bus, &devfn);
2102 	if (!iommu)
2103 		return -ENODEV;
2104 
2105 	table = intel_pasid_get_table(dev);
2106 
2107 	if (!dev_is_pci(dev))
2108 		return domain_context_mapping_one(domain, iommu, table,
2109 						  bus, devfn);
2110 
2111 	data.domain = domain;
2112 	data.iommu = iommu;
2113 	data.table = table;
2114 
2115 	return pci_for_each_dma_alias(to_pci_dev(dev),
2116 				      &domain_context_mapping_cb, &data);
2117 }
2118 
2119 static int domain_context_mapped_cb(struct pci_dev *pdev,
2120 				    u16 alias, void *opaque)
2121 {
2122 	struct intel_iommu *iommu = opaque;
2123 
2124 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2125 }
2126 
2127 static int domain_context_mapped(struct device *dev)
2128 {
2129 	struct intel_iommu *iommu;
2130 	u8 bus, devfn;
2131 
2132 	iommu = device_to_iommu(dev, &bus, &devfn);
2133 	if (!iommu)
2134 		return -ENODEV;
2135 
2136 	if (!dev_is_pci(dev))
2137 		return device_context_mapped(iommu, bus, devfn);
2138 
2139 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2140 				       domain_context_mapped_cb, iommu);
2141 }
2142 
2143 /* Returns a number of VTD pages, but aligned to MM page size */
2144 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2145 					    size_t size)
2146 {
2147 	host_addr &= ~PAGE_MASK;
2148 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2149 }
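/*
 * For example, with 4KiB MM pages, a 0x1000-byte buffer starting at
 * host_addr 0x12345 keeps offset 0x345, PAGE_ALIGN(0x1345) gives
 * 0x2000, and the result is two VT-d pages because the buffer
 * straddles a page boundary.
 */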
2150 
2151 /* Return largest possible superpage level for a given mapping */
2152 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2153 					  unsigned long iov_pfn,
2154 					  unsigned long phy_pfn,
2155 					  unsigned long pages)
2156 {
2157 	int support, level = 1;
2158 	unsigned long pfnmerge;
2159 
2160 	support = domain->iommu_superpage;
2161 
2162 	/* To use a large page, the virtual *and* physical addresses
2163 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2164 	   of them will mean we have to use smaller pages. So just
2165 	   merge them and check both at once. */
2166 	pfnmerge = iov_pfn | phy_pfn;
2167 
2168 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2169 		pages >>= VTD_STRIDE_SHIFT;
2170 		if (!pages)
2171 			break;
2172 		pfnmerge >>= VTD_STRIDE_SHIFT;
2173 		level++;
2174 		support--;
2175 	}
2176 	return level;
2177 }
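/*
 * For example, with 4KiB VT-d pages, if both PFNs are multiples of 512
 * (2MiB-aligned addresses), at least 512 pages remain and the domain
 * supports one superpage level, the loop returns level 2 (a 2MiB page);
 * with 1GiB alignment, >= 262144 pages and two supported levels it
 * would return level 3.
 */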
2178 
2179 /*
2180  * Ensure that old small page tables are removed to make room for superpage(s).
2181  * We're going to add new large pages, so make sure we don't remove their parent
2182  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2183  */
2184 static void switch_to_super_page(struct dmar_domain *domain,
2185 				 unsigned long start_pfn,
2186 				 unsigned long end_pfn, int level)
2187 {
2188 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2189 	struct dma_pte *pte = NULL;
2190 	int i;
2191 
2192 	while (start_pfn <= end_pfn) {
2193 		if (!pte)
2194 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2195 
2196 		if (dma_pte_present(pte)) {
2197 			dma_pte_free_pagetable(domain, start_pfn,
2198 					       start_pfn + lvl_pages - 1,
2199 					       level + 1);
2200 
2201 			for_each_domain_iommu(i, domain)
2202 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2203 						      start_pfn, lvl_pages,
2204 						      0, 0);
2205 		}
2206 
2207 		pte++;
2208 		start_pfn += lvl_pages;
2209 		if (first_pte_in_page(pte))
2210 			pte = NULL;
2211 	}
2212 }
2213 
2214 static int
2215 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2216 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2217 {
2218 	struct dma_pte *first_pte = NULL, *pte = NULL;
2219 	unsigned int largepage_lvl = 0;
2220 	unsigned long lvl_pages = 0;
2221 	phys_addr_t pteval;
2222 	u64 attr;
2223 
2224 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2225 
2226 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2227 		return -EINVAL;
2228 
2229 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2230 	attr |= DMA_FL_PTE_PRESENT;
2231 	if (domain_use_first_level(domain)) {
2232 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2233 		if (prot & DMA_PTE_WRITE)
2234 			attr |= DMA_FL_PTE_DIRTY;
2235 	}
2236 
2237 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2238 
2239 	while (nr_pages > 0) {
2240 		uint64_t tmp;
2241 
2242 		if (!pte) {
2243 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2244 					phys_pfn, nr_pages);
2245 
2246 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2247 			if (!pte)
2248 				return -ENOMEM;
2249 			first_pte = pte;
2250 
2251 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2252 
2253 			/* It is a large page */
2254 			if (largepage_lvl > 1) {
2255 				unsigned long end_pfn;
2256 				unsigned long pages_to_remove;
2257 
2258 				pteval |= DMA_PTE_LARGE_PAGE;
2259 				pages_to_remove = min_t(unsigned long, nr_pages,
2260 							nr_pte_to_next_page(pte) * lvl_pages);
2261 				end_pfn = iov_pfn + pages_to_remove - 1;
2262 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2263 			} else {
2264 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2265 			}
2266 
2267 		}
2268 		/* We don't need a lock here; nobody else
2269 		 * touches this IOVA range.
2270 		 */
2271 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2272 		if (tmp) {
2273 			static int dumps = 5;
2274 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2275 				iov_pfn, tmp, (unsigned long long)pteval);
2276 			if (dumps) {
2277 				dumps--;
2278 				debug_dma_dump_mappings(NULL);
2279 			}
2280 			WARN_ON(1);
2281 		}
2282 
2283 		nr_pages -= lvl_pages;
2284 		iov_pfn += lvl_pages;
2285 		phys_pfn += lvl_pages;
2286 		pteval += lvl_pages * VTD_PAGE_SIZE;
2287 
2288 		/* If the next PTE would be the first in a new page, then we
2289 		 * need to flush the cache on the entries we've just written.
2290 		 * And then we'll need to recalculate 'pte', so clear it and
2291 		 * let it get set again in the if (!pte) block above.
2292 		 *
2293 		 * If we're done (!nr_pages) we need to flush the cache too.
2294 		 *
2295 		 * Also if we've been setting superpages, we may need to
2296 		 * recalculate 'pte' and switch back to smaller pages for the
2297 		 * end of the mapping, if the trailing size is not enough to
2298 		 * use another superpage (i.e. nr_pages < lvl_pages).
2299 		 */
2300 		pte++;
2301 		if (!nr_pages || first_pte_in_page(pte) ||
2302 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2303 			domain_flush_cache(domain, first_pte,
2304 					   (void *)pte - (void *)first_pte);
2305 			pte = NULL;
2306 		}
2307 	}
2308 
2309 	return 0;
2310 }
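/*
 * For example, mapping 1024 pages (4MiB) with 2MiB-aligned iov_pfn and
 * phys_pfn on hardware with superpage support installs two 2MiB PTEs
 * instead of 1024 4KiB PTEs; a trailing remainder smaller than 2MiB
 * would fall back to 4KiB pages.
 */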
2311 
2312 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2313 {
2314 	struct intel_iommu *iommu = info->iommu;
2315 	struct context_entry *context;
2316 	unsigned long flags;
2317 	u16 did_old;
2318 
2319 	if (!iommu)
2320 		return;
2321 
2322 	spin_lock_irqsave(&iommu->lock, flags);
2323 	context = iommu_context_addr(iommu, bus, devfn, 0);
2324 	if (!context) {
2325 		spin_unlock_irqrestore(&iommu->lock, flags);
2326 		return;
2327 	}
2328 
2329 	if (sm_supported(iommu)) {
2330 		if (hw_pass_through && domain_type_is_si(info->domain))
2331 			did_old = FLPT_DEFAULT_DID;
2332 		else
2333 			did_old = info->domain->iommu_did[iommu->seq_id];
2334 	} else {
2335 		did_old = context_domain_id(context);
2336 	}
2337 
2338 	context_clear_entry(context);
2339 	__iommu_flush_cache(iommu, context, sizeof(*context));
2340 	spin_unlock_irqrestore(&iommu->lock, flags);
2341 	iommu->flush.flush_context(iommu,
2342 				   did_old,
2343 				   (((u16)bus) << 8) | devfn,
2344 				   DMA_CCMD_MASK_NOBIT,
2345 				   DMA_CCMD_DEVICE_INVL);
2346 
2347 	if (sm_supported(iommu))
2348 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2349 
2350 	iommu->flush.flush_iotlb(iommu,
2351 				 did_old,
2352 				 0,
2353 				 0,
2354 				 DMA_TLB_DSI_FLUSH);
2355 
2356 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2357 }
2358 
2359 static void domain_remove_dev_info(struct dmar_domain *domain)
2360 {
2361 	struct device_domain_info *info, *tmp;
2362 	unsigned long flags;
2363 
2364 	spin_lock_irqsave(&device_domain_lock, flags);
2365 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2366 		__dmar_remove_one_dev_info(info);
2367 	spin_unlock_irqrestore(&device_domain_lock, flags);
2368 }
2369 
2370 static inline struct device_domain_info *
2371 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2372 {
2373 	struct device_domain_info *info;
2374 
2375 	list_for_each_entry(info, &device_domain_list, global)
2376 		if (info->segment == segment && info->bus == bus &&
2377 		    info->devfn == devfn)
2378 			return info;
2379 
2380 	return NULL;
2381 }
2382 
2383 static int domain_setup_first_level(struct intel_iommu *iommu,
2384 				    struct dmar_domain *domain,
2385 				    struct device *dev,
2386 				    u32 pasid)
2387 {
2388 	struct dma_pte *pgd = domain->pgd;
2389 	int agaw, level;
2390 	int flags = 0;
2391 
2392 	/*
2393 	 * Skip top levels of page tables for an IOMMU whose
2394 	 * agaw is less than the domain's. Unnecessary for PT mode.
2395 	 */
2396 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2397 		pgd = phys_to_virt(dma_pte_addr(pgd));
2398 		if (!dma_pte_present(pgd))
2399 			return -ENOMEM;
2400 	}
2401 
2402 	level = agaw_to_level(agaw);
2403 	if (level != 4 && level != 5)
2404 		return -EINVAL;
2405 
2406 	if (pasid != PASID_RID2PASID)
2407 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2408 	if (level == 5)
2409 		flags |= PASID_FLAG_FL5LP;
2410 
2411 	if (domain->force_snooping)
2412 		flags |= PASID_FLAG_PAGE_SNOOP;
2413 
2414 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2415 					     domain->iommu_did[iommu->seq_id],
2416 					     flags);
2417 }
2418 
2419 static bool dev_is_real_dma_subdevice(struct device *dev)
2420 {
2421 	return dev && dev_is_pci(dev) &&
2422 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2423 }
2424 
2425 static int iommu_domain_identity_map(struct dmar_domain *domain,
2426 				     unsigned long first_vpfn,
2427 				     unsigned long last_vpfn)
2428 {
2429 	/*
2430 	 * The RMRR range might overlap with a physical memory range,
2431 	 * so clear it first.
2432 	 */
2433 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2434 
2435 	return __domain_mapping(domain, first_vpfn,
2436 				first_vpfn, last_vpfn - first_vpfn + 1,
2437 				DMA_PTE_READ|DMA_PTE_WRITE);
2438 }
2439 
2440 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2441 
2442 static int __init si_domain_init(int hw)
2443 {
2444 	struct dmar_rmrr_unit *rmrr;
2445 	struct device *dev;
2446 	int i, nid, ret;
2447 
2448 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2449 	if (!si_domain)
2450 		return -EFAULT;
2451 
2452 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2453 		domain_exit(si_domain);
2454 		return -EFAULT;
2455 	}
2456 
2457 	if (hw)
2458 		return 0;
2459 
2460 	for_each_online_node(nid) {
2461 		unsigned long start_pfn, end_pfn;
2462 		int i;
2463 
2464 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2465 			ret = iommu_domain_identity_map(si_domain,
2466 					mm_to_dma_pfn(start_pfn),
2467 					mm_to_dma_pfn(end_pfn));
2468 			if (ret)
2469 				return ret;
2470 		}
2471 	}
2472 
2473 	/*
2474 	 * Identity map the RMRRs so that devices with RMRRs can also use
2475 	 * the si_domain.
2476 	 */
2477 	for_each_rmrr_units(rmrr) {
2478 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2479 					  i, dev) {
2480 			unsigned long long start = rmrr->base_address;
2481 			unsigned long long end = rmrr->end_address;
2482 
2483 			if (WARN_ON(end < start ||
2484 				    end >> agaw_to_width(si_domain->agaw)))
2485 				continue;
2486 
2487 			ret = iommu_domain_identity_map(si_domain,
2488 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2489 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2490 			if (ret)
2491 				return ret;
2492 		}
2493 	}
2494 
2495 	return 0;
2496 }
2497 
2498 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2499 {
2500 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2501 	struct intel_iommu *iommu;
2502 	unsigned long flags;
2503 	u8 bus, devfn;
2504 	int ret;
2505 
2506 	iommu = device_to_iommu(dev, &bus, &devfn);
2507 	if (!iommu)
2508 		return -ENODEV;
2509 
2510 	spin_lock_irqsave(&device_domain_lock, flags);
2511 	info->domain = domain;
2512 	spin_lock(&iommu->lock);
2513 	ret = domain_attach_iommu(domain, iommu);
2514 	spin_unlock(&iommu->lock);
2515 	if (ret) {
2516 		spin_unlock_irqrestore(&device_domain_lock, flags);
2517 		return ret;
2518 	}
2519 	list_add(&info->link, &domain->devices);
2520 	spin_unlock_irqrestore(&device_domain_lock, flags);
2521 
2522 	/* PASID table is mandatory for a PCI device in scalable mode. */
2523 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2524 		ret = intel_pasid_alloc_table(dev);
2525 		if (ret) {
2526 			dev_err(dev, "PASID table allocation failed\n");
2527 			dmar_remove_one_dev_info(dev);
2528 			return ret;
2529 		}
2530 
2531 		/* Setup the PASID entry for requests without PASID: */
2532 		spin_lock_irqsave(&iommu->lock, flags);
2533 		if (hw_pass_through && domain_type_is_si(domain))
2534 			ret = intel_pasid_setup_pass_through(iommu, domain,
2535 					dev, PASID_RID2PASID);
2536 		else if (domain_use_first_level(domain))
2537 			ret = domain_setup_first_level(iommu, domain, dev,
2538 					PASID_RID2PASID);
2539 		else
2540 			ret = intel_pasid_setup_second_level(iommu, domain,
2541 					dev, PASID_RID2PASID);
2542 		spin_unlock_irqrestore(&iommu->lock, flags);
2543 		if (ret) {
2544 			dev_err(dev, "Setup RID2PASID failed\n");
2545 			dmar_remove_one_dev_info(dev);
2546 			return ret;
2547 		}
2548 	}
2549 
2550 	ret = domain_context_mapping(domain, dev);
2551 	if (ret) {
2552 		dev_err(dev, "Domain context map failed\n");
2553 		dmar_remove_one_dev_info(dev);
2554 		return ret;
2555 	}
2556 
2557 	return 0;
2558 }
2559 
2560 static bool device_has_rmrr(struct device *dev)
2561 {
2562 	struct dmar_rmrr_unit *rmrr;
2563 	struct device *tmp;
2564 	int i;
2565 
2566 	rcu_read_lock();
2567 	for_each_rmrr_units(rmrr) {
2568 		/*
2569 		 * Return TRUE if this RMRR contains the device that
2570 		 * is passed in.
2571 		 */
2572 		for_each_active_dev_scope(rmrr->devices,
2573 					  rmrr->devices_cnt, i, tmp)
2574 			if (tmp == dev ||
2575 			    is_downstream_to_pci_bridge(dev, tmp)) {
2576 				rcu_read_unlock();
2577 				return true;
2578 			}
2579 	}
2580 	rcu_read_unlock();
2581 	return false;
2582 }
2583 
2584 /**
2585  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2586  * is relaxable (ie. is allowed to be not enforced under some conditions)
2587  * @dev: device handle
2588  *
2589  * We assume that PCI USB devices with RMRRs have them largely
2590  * for historical reasons and that the RMRR space is not actively used post
2591  * boot.  This exclusion may change if vendors begin to abuse it.
2592  *
2593  * The same exception is made for graphics devices, with the requirement that
2594  * any use of the RMRR regions will be torn down before assigning the device
2595  * to a guest.
2596  *
2597  * Return: true if the RMRR is relaxable, false otherwise
2598  */
2599 static bool device_rmrr_is_relaxable(struct device *dev)
2600 {
2601 	struct pci_dev *pdev;
2602 
2603 	if (!dev_is_pci(dev))
2604 		return false;
2605 
2606 	pdev = to_pci_dev(dev);
2607 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2608 		return true;
2609 	else
2610 		return false;
2611 }
2612 
2613 /*
2614  * There are a couple cases where we need to restrict the functionality of
2615  * devices associated with RMRRs.  The first is when evaluating a device for
2616  * identity mapping because problems exist when devices are moved in and out
2617  * of domains and their respective RMRR information is lost.  This means that
2618  * a device with associated RMRRs will never be in a "passthrough" domain.
2619  * The second is use of the device through the IOMMU API.  This interface
2620  * expects to have full control of the IOVA space for the device.  We cannot
2621  * satisfy both the requirement that RMRR access is maintained and have an
2622  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2623  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2624  * We therefore prevent devices associated with an RMRR from participating in
2625  * the IOMMU API, which eliminates them from device assignment.
2626  *
2627  * In both cases, devices which have relaxable RMRRs are not concerned by this
2628  * restriction. See device_rmrr_is_relaxable comment.
2629  */
2630 static bool device_is_rmrr_locked(struct device *dev)
2631 {
2632 	if (!device_has_rmrr(dev))
2633 		return false;
2634 
2635 	if (device_rmrr_is_relaxable(dev))
2636 		return false;
2637 
2638 	return true;
2639 }
2640 
2641 /*
2642  * Return the required default domain type for a specific device.
2643  *
2644  * @dev: the device in query
2646  *
2647  * Returns:
2648  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2649  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2650  *  - 0: both identity and dynamic domains work for this device
2651  */
2652 static int device_def_domain_type(struct device *dev)
2653 {
2654 	if (dev_is_pci(dev)) {
2655 		struct pci_dev *pdev = to_pci_dev(dev);
2656 
2657 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2658 			return IOMMU_DOMAIN_IDENTITY;
2659 
2660 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2661 			return IOMMU_DOMAIN_IDENTITY;
2662 	}
2663 
2664 	return 0;
2665 }
2666 
2667 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2668 {
2669 	/*
2670 	 * Start from a sane IOMMU hardware state.
2671 	 * If queued invalidation was already initialized by us
2672 	 * (for example, while enabling interrupt remapping), then
2673 	 * things are already rolling from a sane state.
2674 	 */
2675 	if (!iommu->qi) {
2676 		/*
2677 		 * Clear any previous faults.
2678 		 */
2679 		dmar_fault(-1, iommu);
2680 		/*
2681 		 * Disable queued invalidation if supported and already enabled
2682 		 * before OS handover.
2683 		 */
2684 		dmar_disable_qi(iommu);
2685 	}
2686 
2687 	if (dmar_enable_qi(iommu)) {
2688 		/*
2689 		 * Queued invalidation is not enabled, use register-based invalidation
2690 		 */
2691 		iommu->flush.flush_context = __iommu_flush_context;
2692 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2693 		pr_info("%s: Using Register based invalidation\n",
2694 			iommu->name);
2695 	} else {
2696 		iommu->flush.flush_context = qi_flush_context;
2697 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2698 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2699 	}
2700 }
2701 
2702 static int copy_context_table(struct intel_iommu *iommu,
2703 			      struct root_entry *old_re,
2704 			      struct context_entry **tbl,
2705 			      int bus, bool ext)
2706 {
2707 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2708 	struct context_entry *new_ce = NULL, ce;
2709 	struct context_entry *old_ce = NULL;
2710 	struct root_entry re;
2711 	phys_addr_t old_ce_phys;
2712 
2713 	tbl_idx = ext ? bus * 2 : bus;
2714 	memcpy(&re, old_re, sizeof(re));
2715 
2716 	for (devfn = 0; devfn < 256; devfn++) {
2717 		/* First calculate the correct index */
2718 		idx = (ext ? devfn * 2 : devfn) % 256;
2719 
2720 		if (idx == 0) {
2721 			/* First save what we may have and clean up */
2722 			if (new_ce) {
2723 				tbl[tbl_idx] = new_ce;
2724 				__iommu_flush_cache(iommu, new_ce,
2725 						    VTD_PAGE_SIZE);
2726 				pos = 1;
2727 			}
2728 
2729 			if (old_ce)
2730 				memunmap(old_ce);
2731 
2732 			ret = 0;
2733 			if (devfn < 0x80)
2734 				old_ce_phys = root_entry_lctp(&re);
2735 			else
2736 				old_ce_phys = root_entry_uctp(&re);
2737 
2738 			if (!old_ce_phys) {
2739 				if (ext && devfn == 0) {
2740 					/* No LCTP, try UCTP */
2741 					devfn = 0x7f;
2742 					continue;
2743 				} else {
2744 					goto out;
2745 				}
2746 			}
2747 
2748 			ret = -ENOMEM;
2749 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2750 					MEMREMAP_WB);
2751 			if (!old_ce)
2752 				goto out;
2753 
2754 			new_ce = alloc_pgtable_page(iommu->node);
2755 			if (!new_ce)
2756 				goto out_unmap;
2757 
2758 			ret = 0;
2759 		}
2760 
2761 		/* Now copy the context entry */
2762 		memcpy(&ce, old_ce + idx, sizeof(ce));
2763 
2764 		if (!__context_present(&ce))
2765 			continue;
2766 
2767 		did = context_domain_id(&ce);
2768 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2769 			set_bit(did, iommu->domain_ids);
2770 
2771 		/*
2772 		 * We need a marker for copied context entries. This
2773 		 * marker needs to work for the old format as well as
2774 		 * for extended context entries.
2775 		 *
2776 		 * Bit 67 of the context entry is used. In the old
2777 		 * format this bit is available to software, in the
2778 		 * extended format it is the PGE bit, but PGE is ignored
2779 		 * by HW if PASIDs are disabled (and thus still
2780 		 * available).
2781 		 *
2782 		 * So disable PASIDs first and then mark the entry
2783 		 * copied. This means that we don't copy PASID
2784 		 * translations from the old kernel, but this is fine as
2785 		 * faults there are not fatal.
2786 		 */
2787 		context_clear_pasid_enable(&ce);
2788 		context_set_copied(&ce);
2789 
2790 		new_ce[idx] = ce;
2791 	}
2792 
2793 	tbl[tbl_idx + pos] = new_ce;
2794 
2795 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2796 
2797 out_unmap:
2798 	memunmap(old_ce);
2799 
2800 out:
2801 	return ret;
2802 }
2803 
2804 static int copy_translation_tables(struct intel_iommu *iommu)
2805 {
2806 	struct context_entry **ctxt_tbls;
2807 	struct root_entry *old_rt;
2808 	phys_addr_t old_rt_phys;
2809 	int ctxt_table_entries;
2810 	unsigned long flags;
2811 	u64 rtaddr_reg;
2812 	int bus, ret;
2813 	bool new_ext, ext;
2814 
2815 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2816 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2817 	new_ext    = !!ecap_ecs(iommu->ecap);
2818 
2819 	/*
2820 	 * The RTT bit can only be changed when translation is disabled,
2821 	 * but disabling translation means opening a window for data
2822 	 * corruption. So bail out and don't copy anything if we would
2823 	 * have to change the bit.
2824 	 */
2825 	if (new_ext != ext)
2826 		return -EINVAL;
2827 
2828 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2829 	if (!old_rt_phys)
2830 		return -EINVAL;
2831 
2832 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2833 	if (!old_rt)
2834 		return -ENOMEM;
2835 
2836 	/* This is too big for the stack - allocate it from slab */
2837 	ctxt_table_entries = ext ? 512 : 256;
2838 	ret = -ENOMEM;
2839 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2840 	if (!ctxt_tbls)
2841 		goto out_unmap;
2842 
2843 	for (bus = 0; bus < 256; bus++) {
2844 		ret = copy_context_table(iommu, &old_rt[bus],
2845 					 ctxt_tbls, bus, ext);
2846 		if (ret) {
2847 			pr_err("%s: Failed to copy context table for bus %d\n",
2848 				iommu->name, bus);
2849 			continue;
2850 		}
2851 	}
2852 
2853 	spin_lock_irqsave(&iommu->lock, flags);
2854 
2855 	/* Context tables are copied, now write them to the root_entry table */
2856 	for (bus = 0; bus < 256; bus++) {
2857 		int idx = ext ? bus * 2 : bus;
2858 		u64 val;
2859 
2860 		if (ctxt_tbls[idx]) {
2861 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2862 			iommu->root_entry[bus].lo = val;
2863 		}
2864 
2865 		if (!ext || !ctxt_tbls[idx + 1])
2866 			continue;
2867 
2868 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2869 		iommu->root_entry[bus].hi = val;
2870 	}
2871 
2872 	spin_unlock_irqrestore(&iommu->lock, flags);
2873 
2874 	kfree(ctxt_tbls);
2875 
2876 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2877 
2878 	ret = 0;
2879 
2880 out_unmap:
2881 	memunmap(old_rt);
2882 
2883 	return ret;
2884 }
2885 
2886 #ifdef CONFIG_INTEL_IOMMU_SVM
2887 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2888 {
2889 	struct intel_iommu *iommu = data;
2890 	ioasid_t ioasid;
2891 
2892 	if (!iommu)
2893 		return INVALID_IOASID;
2894 	/*
2895 	 * The VT-d virtual command interface always uses the full 20-bit
2896 	 * PASID range. The host can partition the guest PASID range based
2897 	 * on its policies, but that is out of the guest's control.
2898 	 */
2899 	if (min < PASID_MIN || max > intel_pasid_max_id)
2900 		return INVALID_IOASID;
2901 
2902 	if (vcmd_alloc_pasid(iommu, &ioasid))
2903 		return INVALID_IOASID;
2904 
2905 	return ioasid;
2906 }
2907 
2908 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2909 {
2910 	struct intel_iommu *iommu = data;
2911 
2912 	if (!iommu)
2913 		return;
2914 	/*
2915 	 * The sanity check of the ioasid owner is done at the upper layer,
2916 	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2917 	 */
2918 	if (ioasid_find(NULL, ioasid, NULL)) {
2919 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2920 		return;
2921 	}
2922 	vcmd_free_pasid(iommu, ioasid);
2923 }
2924 
2925 static void register_pasid_allocator(struct intel_iommu *iommu)
2926 {
2927 	/*
2928 	 * If we are running in the host, there is no need for a custom
2929 	 * allocator because PASIDs are allocated system-wide by the host.
2930 	 */
2931 	if (!cap_caching_mode(iommu->cap))
2932 		return;
2933 
2934 	if (!sm_supported(iommu)) {
2935 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2936 		return;
2937 	}
2938 
2939 	/*
2940 	 * Register a custom PASID allocator if we are running in a guest;
2941 	 * guest PASIDs must be obtained via the virtual command interface.
2942 	 * There can be multiple vIOMMUs in each guest, but only one allocator
2943 	 * is active. All vIOMMU allocators eventually call the same host
2944 	 * allocator.
2945 	 */
2946 	if (!vccap_pasid(iommu->vccap))
2947 		return;
2948 
2949 	pr_info("Register custom PASID allocator\n");
2950 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2951 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2952 	iommu->pasid_allocator.pdata = (void *)iommu;
2953 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2954 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2955 		/*
2956 		 * Disable scalable mode on this IOMMU if there is no
2957 		 * custom allocator. Mixing SM-capable and non-SM vIOMMUs
2958 		 * is not supported.
2959 		 */
2960 		intel_iommu_sm = 0;
2961 	}
2962 }
2963 #endif
2964 
2965 static int __init init_dmars(void)
2966 {
2967 	struct dmar_drhd_unit *drhd;
2968 	struct intel_iommu *iommu;
2969 	int ret;
2970 
2971 	/*
2972 	 * for each drhd
2973 	 *    allocate root
2974 	 *    initialize and program root entry to not present
2975 	 * endfor
2976 	 */
2977 	for_each_drhd_unit(drhd) {
2978 		/*
2979 		 * lock not needed as this is only incremented in the
2980 		 * single-threaded kernel __init code path; all other
2981 		 * accesses are read only
2982 		 */
2983 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2984 			g_num_of_iommus++;
2985 			continue;
2986 		}
2987 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
2988 	}
2989 
2990 	/* Preallocate enough resources for IOMMU hot-addition */
2991 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2992 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2993 
2994 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2995 			GFP_KERNEL);
2996 	if (!g_iommus) {
2997 		ret = -ENOMEM;
2998 		goto error;
2999 	}
3000 
3001 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3002 	if (ret)
3003 		goto free_iommu;
3004 
3005 	for_each_iommu(iommu, drhd) {
3006 		if (drhd->ignored) {
3007 			iommu_disable_translation(iommu);
3008 			continue;
3009 		}
3010 
3011 		/*
3012 		 * Find the smallest max PASID size among all IOMMUs in the
3013 		 * system. We need to ensure the system PASID table is no
3014 		 * bigger than the smallest supported size.
3015 		 */
3016 		if (pasid_supported(iommu)) {
3017 			u32 temp = 2 << ecap_pss(iommu->ecap);
3018 
3019 			intel_pasid_max_id = min_t(u32, temp,
3020 						   intel_pasid_max_id);
3021 		}
3022 
3023 		g_iommus[iommu->seq_id] = iommu;
3024 
3025 		intel_iommu_init_qi(iommu);
3026 
3027 		ret = iommu_init_domains(iommu);
3028 		if (ret)
3029 			goto free_iommu;
3030 
3031 		init_translation_status(iommu);
3032 
3033 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3034 			iommu_disable_translation(iommu);
3035 			clear_translation_pre_enabled(iommu);
3036 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3037 				iommu->name);
3038 		}
3039 
3040 		/*
3041 		 * TBD:
3042 		 * we could share the same root & context tables
3043 		 * among all IOMMUs. This needs to be split out later.
3044 		 */
3045 		ret = iommu_alloc_root_entry(iommu);
3046 		if (ret)
3047 			goto free_iommu;
3048 
3049 		if (translation_pre_enabled(iommu)) {
3050 			pr_info("Translation already enabled - trying to copy translation structures\n");
3051 
3052 			ret = copy_translation_tables(iommu);
3053 			if (ret) {
3054 				/*
3055 				 * We found the IOMMU with translation
3056 				 * enabled - but failed to copy over the
3057 				 * old root-entry table. Try to proceed
3058 				 * by disabling translation now and
3059 				 * allocating a clean root-entry table.
3060 				 * This might cause DMAR faults, but
3061 				 * probably the dump will still succeed.
3062 				 */
3063 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3064 				       iommu->name);
3065 				iommu_disable_translation(iommu);
3066 				clear_translation_pre_enabled(iommu);
3067 			} else {
3068 				pr_info("Copied translation tables from previous kernel for %s\n",
3069 					iommu->name);
3070 			}
3071 		}
3072 
3073 		if (!ecap_pass_through(iommu->ecap))
3074 			hw_pass_through = 0;
3075 		intel_svm_check(iommu);
3076 	}
3077 
3078 	/*
3079 	 * Now that qi is enabled on all iommus, set the root entry and flush
3080 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3081 	 * flush_context function will loop forever and the boot hangs.
3082 	 */
3083 	for_each_active_iommu(iommu, drhd) {
3084 		iommu_flush_write_buffer(iommu);
3085 #ifdef CONFIG_INTEL_IOMMU_SVM
3086 		register_pasid_allocator(iommu);
3087 #endif
3088 		iommu_set_root_entry(iommu);
3089 	}
3090 
3091 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3092 	dmar_map_gfx = 0;
3093 #endif
3094 
3095 	if (!dmar_map_gfx)
3096 		iommu_identity_mapping |= IDENTMAP_GFX;
3097 
3098 	check_tylersburg_isoch();
3099 
3100 	ret = si_domain_init(hw_pass_through);
3101 	if (ret)
3102 		goto free_iommu;
3103 
3104 	/*
3105 	 * for each drhd
3106 	 *   enable fault log
3107 	 *   global invalidate context cache
3108 	 *   global invalidate iotlb
3109 	 *   enable translation
3110 	 */
3111 	for_each_iommu(iommu, drhd) {
3112 		if (drhd->ignored) {
3113 			/*
3114 			 * we always have to disable PMRs or DMA may fail on
3115 			 * this device
3116 			 */
3117 			if (force_on)
3118 				iommu_disable_protect_mem_regions(iommu);
3119 			continue;
3120 		}
3121 
3122 		iommu_flush_write_buffer(iommu);
3123 
3124 #ifdef CONFIG_INTEL_IOMMU_SVM
3125 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3126 			/*
3127 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3128 			 * could cause a lock race condition.
3129 			 */
3130 			up_write(&dmar_global_lock);
3131 			ret = intel_svm_enable_prq(iommu);
3132 			down_write(&dmar_global_lock);
3133 			if (ret)
3134 				goto free_iommu;
3135 		}
3136 #endif
3137 		ret = dmar_set_interrupt(iommu);
3138 		if (ret)
3139 			goto free_iommu;
3140 	}
3141 
3142 	return 0;
3143 
3144 free_iommu:
3145 	for_each_active_iommu(iommu, drhd) {
3146 		disable_dmar_iommu(iommu);
3147 		free_dmar_iommu(iommu);
3148 	}
3149 
3150 	kfree(g_iommus);
3151 
3152 error:
3153 	return ret;
3154 }
3155 
3156 static void __init init_no_remapping_devices(void)
3157 {
3158 	struct dmar_drhd_unit *drhd;
3159 	struct device *dev;
3160 	int i;
3161 
3162 	for_each_drhd_unit(drhd) {
3163 		if (!drhd->include_all) {
3164 			for_each_active_dev_scope(drhd->devices,
3165 						  drhd->devices_cnt, i, dev)
3166 				break;
3167 			/* ignore DMAR unit if no devices exist */
3168 			if (i == drhd->devices_cnt)
3169 				drhd->ignored = 1;
3170 		}
3171 	}
3172 
3173 	for_each_active_drhd_unit(drhd) {
3174 		if (drhd->include_all)
3175 			continue;
3176 
3177 		for_each_active_dev_scope(drhd->devices,
3178 					  drhd->devices_cnt, i, dev)
3179 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3180 				break;
3181 		if (i < drhd->devices_cnt)
3182 			continue;
3183 
3184 		/* This IOMMU has *only* gfx devices. Either bypass it or
3185 		   set the gfx_dedicated flag, as appropriate */
3186 		drhd->gfx_dedicated = 1;
3187 		if (!dmar_map_gfx)
3188 			drhd->ignored = 1;
3189 	}
3190 }
3191 
3192 #ifdef CONFIG_SUSPEND
3193 static int init_iommu_hw(void)
3194 {
3195 	struct dmar_drhd_unit *drhd;
3196 	struct intel_iommu *iommu = NULL;
3197 
3198 	for_each_active_iommu(iommu, drhd)
3199 		if (iommu->qi)
3200 			dmar_reenable_qi(iommu);
3201 
3202 	for_each_iommu(iommu, drhd) {
3203 		if (drhd->ignored) {
3204 			/*
3205 			 * we always have to disable PMRs or DMA may fail on
3206 			 * this device
3207 			 */
3208 			if (force_on)
3209 				iommu_disable_protect_mem_regions(iommu);
3210 			continue;
3211 		}
3212 
3213 		iommu_flush_write_buffer(iommu);
3214 		iommu_set_root_entry(iommu);
3215 		iommu_enable_translation(iommu);
3216 		iommu_disable_protect_mem_regions(iommu);
3217 	}
3218 
3219 	return 0;
3220 }
3221 
3222 static void iommu_flush_all(void)
3223 {
3224 	struct dmar_drhd_unit *drhd;
3225 	struct intel_iommu *iommu;
3226 
3227 	for_each_active_iommu(iommu, drhd) {
3228 		iommu->flush.flush_context(iommu, 0, 0, 0,
3229 					   DMA_CCMD_GLOBAL_INVL);
3230 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3231 					 DMA_TLB_GLOBAL_FLUSH);
3232 	}
3233 }
3234 
3235 static int iommu_suspend(void)
3236 {
3237 	struct dmar_drhd_unit *drhd;
3238 	struct intel_iommu *iommu = NULL;
3239 	unsigned long flag;
3240 
3241 	for_each_active_iommu(iommu, drhd) {
3242 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3243 					     GFP_KERNEL);
3244 		if (!iommu->iommu_state)
3245 			goto nomem;
3246 	}
3247 
3248 	iommu_flush_all();
3249 
3250 	for_each_active_iommu(iommu, drhd) {
3251 		iommu_disable_translation(iommu);
3252 
3253 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3254 
3255 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3256 			readl(iommu->reg + DMAR_FECTL_REG);
3257 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3258 			readl(iommu->reg + DMAR_FEDATA_REG);
3259 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3260 			readl(iommu->reg + DMAR_FEADDR_REG);
3261 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3262 			readl(iommu->reg + DMAR_FEUADDR_REG);
3263 
3264 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3265 	}
3266 	return 0;
3267 
3268 nomem:
3269 	for_each_active_iommu(iommu, drhd)
3270 		kfree(iommu->iommu_state);
3271 
3272 	return -ENOMEM;
3273 }
3274 
3275 static void iommu_resume(void)
3276 {
3277 	struct dmar_drhd_unit *drhd;
3278 	struct intel_iommu *iommu = NULL;
3279 	unsigned long flag;
3280 
3281 	if (init_iommu_hw()) {
3282 		if (force_on)
3283 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3284 		else
3285 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3286 		return;
3287 	}
3288 
3289 	for_each_active_iommu(iommu, drhd) {
3290 
3291 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3292 
3293 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3294 			iommu->reg + DMAR_FECTL_REG);
3295 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3296 			iommu->reg + DMAR_FEDATA_REG);
3297 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3298 			iommu->reg + DMAR_FEADDR_REG);
3299 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3300 			iommu->reg + DMAR_FEUADDR_REG);
3301 
3302 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3303 	}
3304 
3305 	for_each_active_iommu(iommu, drhd)
3306 		kfree(iommu->iommu_state);
3307 }
3308 
3309 static struct syscore_ops iommu_syscore_ops = {
3310 	.resume		= iommu_resume,
3311 	.suspend	= iommu_suspend,
3312 };
3313 
3314 static void __init init_iommu_pm_ops(void)
3315 {
3316 	register_syscore_ops(&iommu_syscore_ops);
3317 }
3318 
3319 #else
3320 static inline void init_iommu_pm_ops(void) {}
3321 #endif	/* CONFIG_SUSPEND */
3322 
3323 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3324 {
3325 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3326 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3327 	    rmrr->end_address <= rmrr->base_address ||
3328 	    arch_rmrr_sanity_check(rmrr))
3329 		return -EINVAL;
3330 
3331 	return 0;
3332 }
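/*
 * For example, an RMRR of [0xe0000, 0xeffff] is page-aligned at both
 * ends and non-empty, so it passes (assuming arch_rmrr_sanity_check()
 * also accepts it); a range whose end is not one byte below a page
 * boundary would be rejected.
 */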
3333 
3334 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3335 {
3336 	struct acpi_dmar_reserved_memory *rmrr;
3337 	struct dmar_rmrr_unit *rmrru;
3338 
3339 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3340 	if (rmrr_sanity_check(rmrr)) {
3341 		pr_warn(FW_BUG
3342 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3343 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3344 			   rmrr->base_address, rmrr->end_address,
3345 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3346 			   dmi_get_system_info(DMI_BIOS_VERSION),
3347 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3348 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3349 	}
3350 
3351 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3352 	if (!rmrru)
3353 		goto out;
3354 
3355 	rmrru->hdr = header;
3356 
3357 	rmrru->base_address = rmrr->base_address;
3358 	rmrru->end_address = rmrr->end_address;
3359 
3360 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3361 				((void *)rmrr) + rmrr->header.length,
3362 				&rmrru->devices_cnt);
3363 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3364 		goto free_rmrru;
3365 
3366 	list_add(&rmrru->list, &dmar_rmrr_units);
3367 
3368 	return 0;
3369 free_rmrru:
3370 	kfree(rmrru);
3371 out:
3372 	return -ENOMEM;
3373 }
3374 
3375 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3376 {
3377 	struct dmar_atsr_unit *atsru;
3378 	struct acpi_dmar_atsr *tmp;
3379 
3380 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3381 				dmar_rcu_check()) {
3382 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3383 		if (atsr->segment != tmp->segment)
3384 			continue;
3385 		if (atsr->header.length != tmp->header.length)
3386 			continue;
3387 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3388 			return atsru;
3389 	}
3390 
3391 	return NULL;
3392 }
3393 
3394 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3395 {
3396 	struct acpi_dmar_atsr *atsr;
3397 	struct dmar_atsr_unit *atsru;
3398 
3399 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3400 		return 0;
3401 
3402 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3403 	atsru = dmar_find_atsr(atsr);
3404 	if (atsru)
3405 		return 0;
3406 
3407 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3408 	if (!atsru)
3409 		return -ENOMEM;
3410 
3411 	/*
3412 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3413 	 * copy the memory content because the memory buffer will be freed
3414 	 * on return.
3415 	 */
3416 	atsru->hdr = (void *)(atsru + 1);
3417 	memcpy(atsru->hdr, hdr, hdr->length);
3418 	atsru->include_all = atsr->flags & 0x1;
3419 	if (!atsru->include_all) {
3420 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3421 				(void *)atsr + atsr->header.length,
3422 				&atsru->devices_cnt);
3423 		if (atsru->devices_cnt && atsru->devices == NULL) {
3424 			kfree(atsru);
3425 			return -ENOMEM;
3426 		}
3427 	}
3428 
3429 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3430 
3431 	return 0;
3432 }
3433 
3434 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3435 {
3436 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3437 	kfree(atsru);
3438 }
3439 
3440 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3441 {
3442 	struct acpi_dmar_atsr *atsr;
3443 	struct dmar_atsr_unit *atsru;
3444 
3445 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3446 	atsru = dmar_find_atsr(atsr);
3447 	if (atsru) {
3448 		list_del_rcu(&atsru->list);
3449 		synchronize_rcu();
3450 		intel_iommu_free_atsr(atsru);
3451 	}
3452 
3453 	return 0;
3454 }
3455 
3456 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3457 {
3458 	int i;
3459 	struct device *dev;
3460 	struct acpi_dmar_atsr *atsr;
3461 	struct dmar_atsr_unit *atsru;
3462 
3463 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3464 	atsru = dmar_find_atsr(atsr);
3465 	if (!atsru)
3466 		return 0;
3467 
3468 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3469 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3470 					  i, dev)
3471 			return -EBUSY;
3472 	}
3473 
3474 	return 0;
3475 }
3476 
3477 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3478 {
3479 	struct dmar_satc_unit *satcu;
3480 	struct acpi_dmar_satc *tmp;
3481 
3482 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3483 				dmar_rcu_check()) {
3484 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3485 		if (satc->segment != tmp->segment)
3486 			continue;
3487 		if (satc->header.length != tmp->header.length)
3488 			continue;
3489 		if (memcmp(satc, tmp, satc->header.length) == 0)
3490 			return satcu;
3491 	}
3492 
3493 	return NULL;
3494 }
3495 
3496 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3497 {
3498 	struct acpi_dmar_satc *satc;
3499 	struct dmar_satc_unit *satcu;
3500 
3501 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3502 		return 0;
3503 
3504 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3505 	satcu = dmar_find_satc(satc);
3506 	if (satcu)
3507 		return 0;
3508 
3509 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3510 	if (!satcu)
3511 		return -ENOMEM;
3512 
3513 	satcu->hdr = (void *)(satcu + 1);
3514 	memcpy(satcu->hdr, hdr, hdr->length);
3515 	satcu->atc_required = satc->flags & 0x1;
3516 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3517 					      (void *)satc + satc->header.length,
3518 					      &satcu->devices_cnt);
3519 	if (satcu->devices_cnt && !satcu->devices) {
3520 		kfree(satcu);
3521 		return -ENOMEM;
3522 	}
3523 	list_add_rcu(&satcu->list, &dmar_satc_units);
3524 
3525 	return 0;
3526 }
3527 
3528 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3529 {
3530 	int sp, ret;
3531 	struct intel_iommu *iommu = dmaru->iommu;
3532 
3533 	if (g_iommus[iommu->seq_id])
3534 		return 0;
3535 
3536 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3537 	if (ret)
3538 		goto out;
3539 
3540 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3541 		pr_warn("%s: Doesn't support hardware pass through.\n",
3542 			iommu->name);
3543 		return -ENXIO;
3544 	}
3545 
3546 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3547 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3548 		pr_warn("%s: Doesn't support large page.\n",
3549 			iommu->name);
3550 		return -ENXIO;
3551 	}
3552 
3553 	/*
3554 	 * Disable translation if already enabled prior to OS handover.
3555 	 */
3556 	if (iommu->gcmd & DMA_GCMD_TE)
3557 		iommu_disable_translation(iommu);
3558 
3559 	g_iommus[iommu->seq_id] = iommu;
3560 	ret = iommu_init_domains(iommu);
3561 	if (ret == 0)
3562 		ret = iommu_alloc_root_entry(iommu);
3563 	if (ret)
3564 		goto out;
3565 
3566 	intel_svm_check(iommu);
3567 
3568 	if (dmaru->ignored) {
3569 		/*
3570 		 * we always have to disable PMRs or DMA may fail on this device
3571 		 */
3572 		if (force_on)
3573 			iommu_disable_protect_mem_regions(iommu);
3574 		return 0;
3575 	}
3576 
3577 	intel_iommu_init_qi(iommu);
3578 	iommu_flush_write_buffer(iommu);
3579 
3580 #ifdef CONFIG_INTEL_IOMMU_SVM
3581 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3582 		ret = intel_svm_enable_prq(iommu);
3583 		if (ret)
3584 			goto disable_iommu;
3585 	}
3586 #endif
3587 	ret = dmar_set_interrupt(iommu);
3588 	if (ret)
3589 		goto disable_iommu;
3590 
3591 	iommu_set_root_entry(iommu);
3592 	iommu_enable_translation(iommu);
3593 
3594 	iommu_disable_protect_mem_regions(iommu);
3595 	return 0;
3596 
3597 disable_iommu:
3598 	disable_dmar_iommu(iommu);
3599 out:
3600 	free_dmar_iommu(iommu);
3601 	return ret;
3602 }
3603 
3604 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3605 {
3606 	int ret = 0;
3607 	struct intel_iommu *iommu = dmaru->iommu;
3608 
3609 	if (!intel_iommu_enabled)
3610 		return 0;
3611 	if (iommu == NULL)
3612 		return -EINVAL;
3613 
3614 	if (insert) {
3615 		ret = intel_iommu_add(dmaru);
3616 	} else {
3617 		disable_dmar_iommu(iommu);
3618 		free_dmar_iommu(iommu);
3619 	}
3620 
3621 	return ret;
3622 }
3623 
3624 static void intel_iommu_free_dmars(void)
3625 {
3626 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3627 	struct dmar_atsr_unit *atsru, *atsr_n;
3628 	struct dmar_satc_unit *satcu, *satc_n;
3629 
3630 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3631 		list_del(&rmrru->list);
3632 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3633 		kfree(rmrru);
3634 	}
3635 
3636 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3637 		list_del(&atsru->list);
3638 		intel_iommu_free_atsr(atsru);
3639 	}
3640 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3641 		list_del(&satcu->list);
3642 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3643 		kfree(satcu);
3644 	}
3645 }
3646 
3647 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3648 {
3649 	struct dmar_satc_unit *satcu;
3650 	struct acpi_dmar_satc *satc;
3651 	struct device *tmp;
3652 	int i;
3653 
3654 	dev = pci_physfn(dev);
3655 	rcu_read_lock();
3656 
3657 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3658 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3659 		if (satc->segment != pci_domain_nr(dev->bus))
3660 			continue;
3661 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3662 			if (to_pci_dev(tmp) == dev)
3663 				goto out;
3664 	}
3665 	satcu = NULL;
3666 out:
3667 	rcu_read_unlock();
3668 	return satcu;
3669 }
3670 
3671 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3672 {
3673 	int i, ret = 1;
3674 	struct pci_bus *bus;
3675 	struct pci_dev *bridge = NULL;
3676 	struct device *tmp;
3677 	struct acpi_dmar_atsr *atsr;
3678 	struct dmar_atsr_unit *atsru;
3679 	struct dmar_satc_unit *satcu;
3680 
3681 	dev = pci_physfn(dev);
3682 	satcu = dmar_find_matched_satc_unit(dev);
3683 	if (satcu)
3684 		/*
3685 		 * This device supports ATS as it is in the SATC table.
3686 		 * When the IOMMU is in legacy mode, enabling ATS is done
3687 		 * automatically by HW for any device that requires ATS,
3688 		 * hence the OS should not enable ATS on this device, to
3689 		 * avoid duplicated TLB invalidation.
3690 		 */
3691 		return !(satcu->atc_required && !sm_supported(iommu));
3692 
3693 	for (bus = dev->bus; bus; bus = bus->parent) {
3694 		bridge = bus->self;
3695 		/* If it's an integrated device, allow ATS */
3696 		if (!bridge)
3697 			return 1;
3698 		/* Connected via non-PCIe: no ATS */
3699 		if (!pci_is_pcie(bridge) ||
3700 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3701 			return 0;
3702 		/* If we found the root port, look it up in the ATSR */
3703 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3704 			break;
3705 	}
3706 
3707 	rcu_read_lock();
3708 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3709 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3710 		if (atsr->segment != pci_domain_nr(dev->bus))
3711 			continue;
3712 
3713 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3714 			if (tmp == &bridge->dev)
3715 				goto out;
3716 
3717 		if (atsru->include_all)
3718 			goto out;
3719 	}
3720 	ret = 0;
3721 out:
3722 	rcu_read_unlock();
3723 
3724 	return ret;
3725 }
3726 
3727 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3728 {
3729 	int ret;
3730 	struct dmar_rmrr_unit *rmrru;
3731 	struct dmar_atsr_unit *atsru;
3732 	struct dmar_satc_unit *satcu;
3733 	struct acpi_dmar_atsr *atsr;
3734 	struct acpi_dmar_reserved_memory *rmrr;
3735 	struct acpi_dmar_satc *satc;
3736 
3737 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3738 		return 0;
3739 
3740 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3741 		rmrr = container_of(rmrru->hdr,
3742 				    struct acpi_dmar_reserved_memory, header);
3743 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3744 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3745 				((void *)rmrr) + rmrr->header.length,
3746 				rmrr->segment, rmrru->devices,
3747 				rmrru->devices_cnt);
3748 			if (ret < 0)
3749 				return ret;
3750 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3751 			dmar_remove_dev_scope(info, rmrr->segment,
3752 				rmrru->devices, rmrru->devices_cnt);
3753 		}
3754 	}
3755 
3756 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3757 		if (atsru->include_all)
3758 			continue;
3759 
3760 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3761 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3762 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3763 					(void *)atsr + atsr->header.length,
3764 					atsr->segment, atsru->devices,
3765 					atsru->devices_cnt);
3766 			if (ret > 0)
3767 				break;
3768 			else if (ret < 0)
3769 				return ret;
3770 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3771 			if (dmar_remove_dev_scope(info, atsr->segment,
3772 					atsru->devices, atsru->devices_cnt))
3773 				break;
3774 		}
3775 	}
3776 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3777 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3778 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3779 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3780 					(void *)satc + satc->header.length,
3781 					satc->segment, satcu->devices,
3782 					satcu->devices_cnt);
3783 			if (ret > 0)
3784 				break;
3785 			else if (ret < 0)
3786 				return ret;
3787 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3788 			if (dmar_remove_dev_scope(info, satc->segment,
3789 					satcu->devices, satcu->devices_cnt))
3790 				break;
3791 		}
3792 	}
3793 
3794 	return 0;
3795 }
3796 
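/*
 * Memory hotplug notifier: extend the static identity mapping
 * (si_domain) when a memory block goes online, and unmap it again,
 * flushing the IOTLB on every active IOMMU, when the block is offlined
 * or the online operation is cancelled.
 */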
3797 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3798 				       unsigned long val, void *v)
3799 {
3800 	struct memory_notify *mhp = v;
3801 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3802 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3803 			mhp->nr_pages - 1);
3804 
3805 	switch (val) {
3806 	case MEM_GOING_ONLINE:
3807 		if (iommu_domain_identity_map(si_domain,
3808 					      start_vpfn, last_vpfn)) {
3809 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3810 				start_vpfn, last_vpfn);
3811 			return NOTIFY_BAD;
3812 		}
3813 		break;
3814 
3815 	case MEM_OFFLINE:
3816 	case MEM_CANCEL_ONLINE:
3817 		{
3818 			struct dmar_drhd_unit *drhd;
3819 			struct intel_iommu *iommu;
3820 			LIST_HEAD(freelist);
3821 
3822 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3823 
3824 			rcu_read_lock();
3825 			for_each_active_iommu(iommu, drhd)
3826 				iommu_flush_iotlb_psi(iommu, si_domain,
3827 					start_vpfn, mhp->nr_pages,
3828 					list_empty(&freelist), 0);
3829 			rcu_read_unlock();
3830 			put_pages_list(&freelist);
3831 		}
3832 		break;
3833 	}
3834 
3835 	return NOTIFY_OK;
3836 }
3837 
3838 static struct notifier_block intel_iommu_memory_nb = {
3839 	.notifier_call = intel_iommu_memory_notifier,
3840 	.priority = 0
3841 };
3842 
3843 static void intel_disable_iommus(void)
3844 {
3845 	struct intel_iommu *iommu = NULL;
3846 	struct dmar_drhd_unit *drhd;
3847 
3848 	for_each_iommu(iommu, drhd)
3849 		iommu_disable_translation(iommu);
3850 }
3851 
3852 void intel_iommu_shutdown(void)
3853 {
3854 	struct dmar_drhd_unit *drhd;
3855 	struct intel_iommu *iommu = NULL;
3856 
3857 	if (no_iommu || dmar_disabled)
3858 		return;
3859 
3860 	down_write(&dmar_global_lock);
3861 
3862 	/* Disable PMRs explicitly here. */
3863 	for_each_iommu(iommu, drhd)
3864 		iommu_disable_protect_mem_regions(iommu);
3865 
3866 	/* Make sure the IOMMUs are switched off */
3867 	intel_disable_iommus();
3868 
3869 	up_write(&dmar_global_lock);
3870 }
3871 
3872 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3873 {
3874 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3875 
3876 	return container_of(iommu_dev, struct intel_iommu, iommu);
3877 }
3878 
3879 static ssize_t version_show(struct device *dev,
3880 			    struct device_attribute *attr, char *buf)
3881 {
3882 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3883 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3884 	return sprintf(buf, "%d:%d\n",
3885 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3886 }
3887 static DEVICE_ATTR_RO(version);
3888 
3889 static ssize_t address_show(struct device *dev,
3890 			    struct device_attribute *attr, char *buf)
3891 {
3892 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3893 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3894 }
3895 static DEVICE_ATTR_RO(address);
3896 
3897 static ssize_t cap_show(struct device *dev,
3898 			struct device_attribute *attr, char *buf)
3899 {
3900 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3901 	return sprintf(buf, "%llx\n", iommu->cap);
3902 }
3903 static DEVICE_ATTR_RO(cap);
3904 
3905 static ssize_t ecap_show(struct device *dev,
3906 			 struct device_attribute *attr, char *buf)
3907 {
3908 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3909 	return sprintf(buf, "%llx\n", iommu->ecap);
3910 }
3911 static DEVICE_ATTR_RO(ecap);
3912 
3913 static ssize_t domains_supported_show(struct device *dev,
3914 				      struct device_attribute *attr, char *buf)
3915 {
3916 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3917 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3918 }
3919 static DEVICE_ATTR_RO(domains_supported);
3920 
3921 static ssize_t domains_used_show(struct device *dev,
3922 				 struct device_attribute *attr, char *buf)
3923 {
3924 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3925 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3926 						  cap_ndoms(iommu->cap)));
3927 }
3928 static DEVICE_ATTR_RO(domains_used);
3929 
3930 static struct attribute *intel_iommu_attrs[] = {
3931 	&dev_attr_version.attr,
3932 	&dev_attr_address.attr,
3933 	&dev_attr_cap.attr,
3934 	&dev_attr_ecap.attr,
3935 	&dev_attr_domains_supported.attr,
3936 	&dev_attr_domains_used.attr,
3937 	NULL,
3938 };
3939 
3940 static struct attribute_group intel_iommu_group = {
3941 	.name = "intel-iommu",
3942 	.attrs = intel_iommu_attrs,
3943 };
3944 
3945 const struct attribute_group *intel_iommu_groups[] = {
3946 	&intel_iommu_group,
3947 	NULL,
3948 };
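
/*
 * The attributes above are registered for each IOMMU instance via
 * iommu_device_sysfs_add() in intel_iommu_init(), and typically show up
 * as read-only files under /sys/class/iommu/dmar<N>/intel-iommu/
 * (version, address, cap, ecap, domains_supported, domains_used).
 */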
3949 
3950 static inline bool has_external_pci(void)
3951 {
3952 	struct pci_dev *pdev = NULL;
3953 
3954 	for_each_pci_dev(pdev)
3955 		if (pdev->external_facing)
3956 			return true;
3957 
3958 	return false;
3959 }
3960 
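/*
 * Honour the DMAR platform opt-in flag: when the firmware requests DMA
 * protection and an external-facing PCI device is present, force the
 * IOMMU on unless the user explicitly opted out (no_platform_optin).
 * If the IOMMU was disabled by default, the default domain type is set
 * to passthrough (identity map) for all but untrusted devices.
 * Returns 1 when the IOMMU is force-enabled, 0 otherwise.
 */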
3961 static int __init platform_optin_force_iommu(void)
3962 {
3963 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3964 		return 0;
3965 
3966 	if (no_iommu || dmar_disabled)
3967 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3968 
3969 	/*
3970 	 * If Intel-IOMMU is disabled by default, we will apply an identity
3971 	 * map to all devices except those marked as untrusted.
3972 	 */
3973 	if (dmar_disabled)
3974 		iommu_set_default_passthrough(false);
3975 
3976 	dmar_disabled = 0;
3977 	no_iommu = 0;
3978 
3979 	return 1;
3980 }
3981 
3982 static int __init probe_acpi_namespace_devices(void)
3983 {
3984 	struct dmar_drhd_unit *drhd;
3985 	/* To avoid a -Wunused-but-set-variable warning. */
3986 	struct intel_iommu *iommu __maybe_unused;
3987 	struct device *dev;
3988 	int i, ret = 0;
3989 
3990 	for_each_active_iommu(iommu, drhd) {
3991 		for_each_active_dev_scope(drhd->devices,
3992 					  drhd->devices_cnt, i, dev) {
3993 			struct acpi_device_physical_node *pn;
3994 			struct iommu_group *group;
3995 			struct acpi_device *adev;
3996 
3997 			if (dev->bus != &acpi_bus_type)
3998 				continue;
3999 
4000 			adev = to_acpi_device(dev);
4001 			mutex_lock(&adev->physical_node_lock);
4002 			list_for_each_entry(pn,
4003 					    &adev->physical_node_list, node) {
4004 				group = iommu_group_get(pn->dev);
4005 				if (group) {
4006 					iommu_group_put(group);
4007 					continue;
4008 				}
4009 
4010 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4011 				ret = iommu_probe_device(pn->dev);
4012 				if (ret)
4013 					break;
4014 			}
4015 			mutex_unlock(&adev->physical_node_lock);
4016 
4017 			if (ret)
4018 				return ret;
4019 		}
4020 	}
4021 
4022 	return 0;
4023 }
4024 
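/*
 * Main VT-d initialization: parse the DMAR table and device scopes,
 * bring up the remapping structures via init_dmars(), register each
 * IOMMU with sysfs and the IOMMU core, and finally enable DMA
 * translation on every unit that is not already enabled or ignored.
 */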
4025 int __init intel_iommu_init(void)
4026 {
4027 	int ret = -ENODEV;
4028 	struct dmar_drhd_unit *drhd;
4029 	struct intel_iommu *iommu;
4030 
4031 	/*
4032 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4033 	 * opt in, so enforce that.
4034 	 */
4035 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4036 		    platform_optin_force_iommu();
4037 
4038 	down_write(&dmar_global_lock);
4039 	if (dmar_table_init()) {
4040 		if (force_on)
4041 			panic("tboot: Failed to initialize DMAR table\n");
4042 		goto out_free_dmar;
4043 	}
4044 
4045 	if (dmar_dev_scope_init() < 0) {
4046 		if (force_on)
4047 			panic("tboot: Failed to initialize DMAR device scope\n");
4048 		goto out_free_dmar;
4049 	}
4050 
4051 	up_write(&dmar_global_lock);
4052 
4053 	/*
4054 	 * The bus notifier takes the dmar_global_lock, so lockdep will
4055 	 * complain later when we register it under the lock.
4056 	 */
4057 	dmar_register_bus_notifier();
4058 
4059 	down_write(&dmar_global_lock);
4060 
4061 	if (!no_iommu)
4062 		intel_iommu_debugfs_init();
4063 
4064 	if (no_iommu || dmar_disabled) {
4065 		/*
4066 		 * We exit the function here so that the IOMMU's remapping and
4067 		 * mempool are not set up, which means that the IOMMU's PMRs
4068 		 * won't be disabled via the call to init_dmars(). So disable
4069 		 * them explicitly here. The PMRs were set up by tboot prior to
4070 		 * calling SENTER, but the kernel is expected to reset/tear
4071 		 * down the PMRs.
4072 		 */
4073 		if (intel_iommu_tboot_noforce) {
4074 			for_each_iommu(iommu, drhd)
4075 				iommu_disable_protect_mem_regions(iommu);
4076 		}
4077 
4078 		/*
4079 		 * Make sure the IOMMUs are switched off, even when we
4080 		 * boot into a kexec kernel and the previous kernel left
4081 		 * them enabled
4082 		 */
4083 		intel_disable_iommus();
4084 		goto out_free_dmar;
4085 	}
4086 
4087 	if (list_empty(&dmar_rmrr_units))
4088 		pr_info("No RMRR found\n");
4089 
4090 	if (list_empty(&dmar_atsr_units))
4091 		pr_info("No ATSR found\n");
4092 
4093 	if (list_empty(&dmar_satc_units))
4094 		pr_info("No SATC found\n");
4095 
4096 	if (dmar_map_gfx)
4097 		intel_iommu_gfx_mapped = 1;
4098 
4099 	init_no_remapping_devices();
4100 
4101 	ret = init_dmars();
4102 	if (ret) {
4103 		if (force_on)
4104 			panic("tboot: Failed to initialize DMARs\n");
4105 		pr_err("Initialization failed\n");
4106 		goto out_free_dmar;
4107 	}
4108 	up_write(&dmar_global_lock);
4109 
4110 	init_iommu_pm_ops();
4111 
4112 	down_read(&dmar_global_lock);
4113 	for_each_active_iommu(iommu, drhd) {
4114 		/*
4115 		 * The flush queue implementation does not perform
4116 		 * page-selective invalidations that are required for efficient
4117 		 * TLB flushes in virtual environments.  The benefit of batching
4118 		 * is likely to be much lower than the overhead of synchronizing
4119 		 * the virtual and physical IOMMU page-tables.
4120 		 */
4121 		if (cap_caching_mode(iommu->cap)) {
4122 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4123 			iommu_set_dma_strict();
4124 		}
4125 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4126 				       intel_iommu_groups,
4127 				       "%s", iommu->name);
4128 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4129 	}
4130 	up_read(&dmar_global_lock);
4131 
4132 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4133 	if (si_domain && !hw_pass_through)
4134 		register_memory_notifier(&intel_iommu_memory_nb);
4135 
4136 	down_read(&dmar_global_lock);
4137 	if (probe_acpi_namespace_devices())
4138 		pr_warn("ACPI namespace devices didn't probe correctly\n");
4139 
4140 	/* Finally, we enable the DMA remapping hardware. */
4141 	for_each_iommu(iommu, drhd) {
4142 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4143 			iommu_enable_translation(iommu);
4144 
4145 		iommu_disable_protect_mem_regions(iommu);
4146 	}
4147 	up_read(&dmar_global_lock);
4148 
4149 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4150 
4151 	intel_iommu_enabled = 1;
4152 
4153 	return 0;
4154 
4155 out_free_dmar:
4156 	intel_iommu_free_dmars();
4157 	up_write(&dmar_global_lock);
4158 	return ret;
4159 }
4160 
4161 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4162 {
4163 	struct device_domain_info *info = opaque;
4164 
4165 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4166 	return 0;
4167 }
4168 
4169 /*
4170  * NB - intel-iommu lacks any sort of reference counting for the users of
4171  * dependent devices.  If multiple endpoints have intersecting dependent
4172  * devices, unbinding the driver from any one of them will possibly leave
4173  * the others unable to operate.
4174  */
4175 static void domain_context_clear(struct device_domain_info *info)
4176 {
4177 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4178 		return;
4179 
4180 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4181 			       &domain_context_clear_one_cb, info);
4182 }
4183 
4184 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4185 {
4186 	struct dmar_domain *domain;
4187 	struct intel_iommu *iommu;
4188 	unsigned long flags;
4189 
4190 	assert_spin_locked(&device_domain_lock);
4191 
4192 	if (WARN_ON(!info))
4193 		return;
4194 
4195 	iommu = info->iommu;
4196 	domain = info->domain;
4197 
4198 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4199 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4200 			intel_pasid_tear_down_entry(iommu, info->dev,
4201 					PASID_RID2PASID, false);
4202 
4203 		iommu_disable_dev_iotlb(info);
4204 		domain_context_clear(info);
4205 		intel_pasid_free_table(info->dev);
4206 	}
4207 
4208 	list_del(&info->link);
4209 
4210 	spin_lock_irqsave(&iommu->lock, flags);
4211 	domain_detach_iommu(domain, iommu);
4212 	spin_unlock_irqrestore(&iommu->lock, flags);
4213 }
4214 
4215 static void dmar_remove_one_dev_info(struct device *dev)
4216 {
4217 	struct device_domain_info *info;
4218 	unsigned long flags;
4219 
4220 	spin_lock_irqsave(&device_domain_lock, flags);
4221 	info = dev_iommu_priv_get(dev);
4222 	if (info)
4223 		__dmar_remove_one_dev_info(info);
4224 	spin_unlock_irqrestore(&device_domain_lock, flags);
4225 }
4226 
4227 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4228 {
4229 	int adjust_width;
4230 
4231 	/* calculate AGAW */
4232 	domain->gaw = guest_width;
4233 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4234 	domain->agaw = width_to_agaw(adjust_width);
4235 
4236 	domain->iommu_coherency = false;
4237 	domain->iommu_superpage = 0;
4238 	domain->max_addr = 0;
4239 
4240 	/* always allocate the top pgd */
4241 	domain->pgd = alloc_pgtable_page(domain->nid);
4242 	if (!domain->pgd)
4243 		return -ENOMEM;
4244 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4245 	return 0;
4246 }
4247 
4248 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4249 {
4250 	struct dmar_domain *dmar_domain;
4251 	struct iommu_domain *domain;
4252 
4253 	switch (type) {
4254 	case IOMMU_DOMAIN_DMA:
4255 	case IOMMU_DOMAIN_DMA_FQ:
4256 	case IOMMU_DOMAIN_UNMANAGED:
4257 		dmar_domain = alloc_domain(type);
4258 		if (!dmar_domain) {
4259 			pr_err("Can't allocate dmar_domain\n");
4260 			return NULL;
4261 		}
4262 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4263 			pr_err("Domain initialization failed\n");
4264 			domain_exit(dmar_domain);
4265 			return NULL;
4266 		}
4267 
4268 		domain = &dmar_domain->domain;
4269 		domain->geometry.aperture_start = 0;
4270 		domain->geometry.aperture_end   =
4271 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4272 		domain->geometry.force_aperture = true;
4273 
4274 		return domain;
4275 	case IOMMU_DOMAIN_IDENTITY:
4276 		return &si_domain->domain;
4277 	default:
4278 		return NULL;
4279 	}
4280 
4281 	return NULL;
4282 }
4283 
4284 static void intel_iommu_domain_free(struct iommu_domain *domain)
4285 {
4286 	if (domain != &si_domain->domain)
4287 		domain_exit(to_dmar_domain(domain));
4288 }
4289 
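/*
 * Validate that @dev's IOMMU can back @domain before attaching: reject
 * the attach if the domain requires force-snooping that the IOMMU
 * cannot provide, or if the IOMMU address width is too small for what
 * the domain already maps; then trim the domain's page-table depth to
 * match the IOMMU's AGAW.
 */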
4290 static int prepare_domain_attach_device(struct iommu_domain *domain,
4291 					struct device *dev)
4292 {
4293 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4294 	struct intel_iommu *iommu;
4295 	int addr_width;
4296 
4297 	iommu = device_to_iommu(dev, NULL, NULL);
4298 	if (!iommu)
4299 		return -ENODEV;
4300 
4301 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4302 		return -EOPNOTSUPP;
4303 
4304 	/* check if this iommu agaw is sufficient for max mapped address */
4305 	addr_width = agaw_to_width(iommu->agaw);
4306 	if (addr_width > cap_mgaw(iommu->cap))
4307 		addr_width = cap_mgaw(iommu->cap);
4308 
4309 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4310 		dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4311 			__func__, addr_width, dmar_domain->max_addr);
4313 		return -EFAULT;
4314 	}
4315 	dmar_domain->gaw = addr_width;
4316 
4317 	/*
4318 	 * Knock out extra levels of page tables if necessary
4319 	 */
4320 	while (iommu->agaw < dmar_domain->agaw) {
4321 		struct dma_pte *pte;
4322 
4323 		pte = dmar_domain->pgd;
4324 		if (dma_pte_present(pte)) {
4325 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4326 			free_pgtable_page(pte);
4327 		}
4328 		dmar_domain->agaw--;
4329 	}
4330 
4331 	return 0;
4332 }
4333 
4334 static int intel_iommu_attach_device(struct iommu_domain *domain,
4335 				     struct device *dev)
4336 {
4337 	int ret;
4338 
4339 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4340 	    device_is_rmrr_locked(dev)) {
4341 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4342 		return -EPERM;
4343 	}
4344 
4345 	/* normally dev is not mapped */
4346 	if (unlikely(domain_context_mapped(dev))) {
4347 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4348 
4349 		if (info->domain)
4350 			dmar_remove_one_dev_info(dev);
4351 	}
4352 
4353 	ret = prepare_domain_attach_device(domain, dev);
4354 	if (ret)
4355 		return ret;
4356 
4357 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4358 }
4359 
4360 static void intel_iommu_detach_device(struct iommu_domain *domain,
4361 				      struct device *dev)
4362 {
4363 	dmar_remove_one_dev_info(dev);
4364 }
4365 
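/*
 * Core mapping helper used by intel_iommu_map_pages(): translate the
 * IOMMU_* prot flags into PTE bits, check that the end of the mapping
 * still fits within the domain's guest address width (updating
 * max_addr), and install the mapping via __domain_mapping().
 */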
4366 static int intel_iommu_map(struct iommu_domain *domain,
4367 			   unsigned long iova, phys_addr_t hpa,
4368 			   size_t size, int iommu_prot, gfp_t gfp)
4369 {
4370 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4371 	u64 max_addr;
4372 	int prot = 0;
4373 
4374 	if (iommu_prot & IOMMU_READ)
4375 		prot |= DMA_PTE_READ;
4376 	if (iommu_prot & IOMMU_WRITE)
4377 		prot |= DMA_PTE_WRITE;
4378 	if (dmar_domain->set_pte_snp)
4379 		prot |= DMA_PTE_SNP;
4380 
4381 	max_addr = iova + size;
4382 	if (dmar_domain->max_addr < max_addr) {
4383 		u64 end;
4384 
4385 		/* check if minimum agaw is sufficient for mapped address */
4386 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4387 		if (end < max_addr) {
4388 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4389 			       __func__, dmar_domain->gaw, max_addr);
4391 			return -EFAULT;
4392 		}
4393 		dmar_domain->max_addr = max_addr;
4394 	}
4395 	/* Round up size to next multiple of PAGE_SIZE, if it and
4396 	   the low bits of hpa would take us onto the next page */
4397 	size = aligned_nrpages(hpa, size);
4398 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4399 				hpa >> VTD_PAGE_SHIFT, size, prot);
4400 }
4401 
4402 static int intel_iommu_map_pages(struct iommu_domain *domain,
4403 				 unsigned long iova, phys_addr_t paddr,
4404 				 size_t pgsize, size_t pgcount,
4405 				 int prot, gfp_t gfp, size_t *mapped)
4406 {
4407 	unsigned long pgshift = __ffs(pgsize);
4408 	size_t size = pgcount << pgshift;
4409 	int ret;
4410 
4411 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4412 		return -EINVAL;
4413 
4414 	if (!IS_ALIGNED(iova | paddr, pgsize))
4415 		return -EINVAL;
4416 
4417 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4418 	if (!ret && mapped)
4419 		*mapped = size;
4420 
4421 	return ret;
4422 }
4423 
4424 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4425 				unsigned long iova, size_t size,
4426 				struct iommu_iotlb_gather *gather)
4427 {
4428 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4429 	unsigned long start_pfn, last_pfn;
4430 	int level = 0;
4431 
4432 	/* Cope with horrid API which requires us to unmap more than the
4433 	   size argument if it happens to be a large-page mapping. */
4434 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4435 
4436 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4437 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4438 
4439 	start_pfn = iova >> VTD_PAGE_SHIFT;
4440 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4441 
4442 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4443 
4444 	if (dmar_domain->max_addr == iova + size)
4445 		dmar_domain->max_addr = iova;
4446 
4447 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4448 
4449 	return size;
4450 }
4451 
4452 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4453 				      unsigned long iova,
4454 				      size_t pgsize, size_t pgcount,
4455 				      struct iommu_iotlb_gather *gather)
4456 {
4457 	unsigned long pgshift = __ffs(pgsize);
4458 	size_t size = pgcount << pgshift;
4459 
4460 	return intel_iommu_unmap(domain, iova, size, gather);
4461 }
4462 
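/*
 * Flush the IOVA range accumulated in @gather on every IOMMU serving
 * this domain, then free the page-table pages queued on the gather
 * freelist by earlier unmaps.
 */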
4463 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4464 				 struct iommu_iotlb_gather *gather)
4465 {
4466 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4467 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4468 	size_t size = gather->end - gather->start;
4469 	unsigned long start_pfn;
4470 	unsigned long nrpages;
4471 	int iommu_id;
4472 
4473 	nrpages = aligned_nrpages(gather->start, size);
4474 	start_pfn = mm_to_dma_pfn(iova_pfn);
4475 
4476 	for_each_domain_iommu(iommu_id, dmar_domain)
4477 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4478 				      start_pfn, nrpages,
4479 				      list_empty(&gather->freelist), 0);
4480 
4481 	put_pages_list(&gather->freelist);
4482 }
4483 
4484 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4485 					    dma_addr_t iova)
4486 {
4487 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4488 	struct dma_pte *pte;
4489 	int level = 0;
4490 	u64 phys = 0;
4491 
4492 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4493 	if (pte && dma_pte_present(pte))
4494 		phys = dma_pte_addr(pte) +
4495 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4496 						VTD_PAGE_SHIFT) - 1));
4497 
4498 	return phys;
4499 }
4500 
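/*
 * Force-snooping (enforced cache coherency) can only be turned on for a
 * domain if every IOMMU with devices attached to it implements snoop
 * control (ECAP.SC).  The helpers below check and apply that.
 */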
4501 static bool domain_support_force_snooping(struct dmar_domain *domain)
4502 {
4503 	struct device_domain_info *info;
4504 	bool support = true;
4505 
4506 	assert_spin_locked(&device_domain_lock);
4507 	list_for_each_entry(info, &domain->devices, link) {
4508 		if (!ecap_sc_support(info->iommu->ecap)) {
4509 			support = false;
4510 			break;
4511 		}
4512 	}
4513 
4514 	return support;
4515 }
4516 
4517 static void domain_set_force_snooping(struct dmar_domain *domain)
4518 {
4519 	struct device_domain_info *info;
4520 
4521 	assert_spin_locked(&device_domain_lock);
4522 
4523 	/*
4524 	 * The second-level page table supports per-PTE snoop control. The
4525 	 * iommu_map() interface will handle this by setting the SNP bit.
4526 	 */
4527 	if (!domain_use_first_level(domain)) {
4528 		domain->set_pte_snp = true;
4529 		return;
4530 	}
4531 
4532 	list_for_each_entry(info, &domain->devices, link)
4533 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4534 						     PASID_RID2PASID);
4535 }
4536 
4537 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4538 {
4539 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4540 	unsigned long flags;
4541 
4542 	if (dmar_domain->force_snooping)
4543 		return true;
4544 
4545 	spin_lock_irqsave(&device_domain_lock, flags);
4546 	if (!domain_support_force_snooping(dmar_domain)) {
4547 		spin_unlock_irqrestore(&device_domain_lock, flags);
4548 		return false;
4549 	}
4550 
4551 	domain_set_force_snooping(dmar_domain);
4552 	dmar_domain->force_snooping = true;
4553 	spin_unlock_irqrestore(&device_domain_lock, flags);
4554 
4555 	return true;
4556 }
4557 
4558 static bool intel_iommu_capable(enum iommu_cap cap)
4559 {
4560 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4561 		return true;
4562 	if (cap == IOMMU_CAP_INTR_REMAP)
4563 		return irq_remapping_enabled == 1;
4564 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4565 		return dmar_platform_optin();
4566 
4567 	return false;
4568 }
4569 
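/*
 * Called by the IOMMU core for each device on a bus we handle: allocate
 * the per-device device_domain_info, record its bus/devfn/segment and
 * ATS/PASID/PRI capabilities, link it into device_domain_list and
 * return the covering iommu_device.
 */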
4570 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4571 {
4572 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4573 	struct device_domain_info *info;
4574 	struct intel_iommu *iommu;
4575 	unsigned long flags;
4576 	u8 bus, devfn;
4577 
4578 	iommu = device_to_iommu(dev, &bus, &devfn);
4579 	if (!iommu)
4580 		return ERR_PTR(-ENODEV);
4581 
4582 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4583 	if (!info)
4584 		return ERR_PTR(-ENOMEM);
4585 
4586 	if (dev_is_real_dma_subdevice(dev)) {
4587 		info->bus = pdev->bus->number;
4588 		info->devfn = pdev->devfn;
4589 		info->segment = pci_domain_nr(pdev->bus);
4590 	} else {
4591 		info->bus = bus;
4592 		info->devfn = devfn;
4593 		info->segment = iommu->segment;
4594 	}
4595 
4596 	info->dev = dev;
4597 	info->iommu = iommu;
4598 	if (dev_is_pci(dev)) {
4599 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4600 		    pci_ats_supported(pdev) &&
4601 		    dmar_ats_supported(pdev, iommu))
4602 			info->ats_supported = 1;
4603 
4604 		if (sm_supported(iommu)) {
4605 			if (pasid_supported(iommu)) {
4606 				int features = pci_pasid_features(pdev);
4607 
4608 				if (features >= 0)
4609 					info->pasid_supported = features | 1;
4610 			}
4611 
4612 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4613 			    pci_pri_supported(pdev))
4614 				info->pri_supported = 1;
4615 		}
4616 	}
4617 
4618 	spin_lock_irqsave(&device_domain_lock, flags);
4619 	list_add(&info->global, &device_domain_list);
4620 	dev_iommu_priv_set(dev, info);
4621 	spin_unlock_irqrestore(&device_domain_lock, flags);
4622 
4623 	return &iommu->iommu;
4624 }
4625 
4626 static void intel_iommu_release_device(struct device *dev)
4627 {
4628 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4629 	unsigned long flags;
4630 
4631 	dmar_remove_one_dev_info(dev);
4632 
4633 	spin_lock_irqsave(&device_domain_lock, flags);
4634 	dev_iommu_priv_set(dev, NULL);
4635 	list_del(&info->global);
4636 	spin_unlock_irqrestore(&device_domain_lock, flags);
4637 
4638 	kfree(info);
4639 	set_dma_ops(dev, NULL);
4640 }
4641 
4642 static void intel_iommu_probe_finalize(struct device *dev)
4643 {
4644 	set_dma_ops(dev, NULL);
4645 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4646 }
4647 
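/*
 * Report the reserved IOVA regions for @device: any RMRR that targets
 * the device (or a bridge above it), the legacy 0-16MB direct map used
 * as a floppy-DMA workaround for ISA bridges when enabled, and the
 * IOAPIC range as an MSI region.
 */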
4648 static void intel_iommu_get_resv_regions(struct device *device,
4649 					 struct list_head *head)
4650 {
4651 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4652 	struct iommu_resv_region *reg;
4653 	struct dmar_rmrr_unit *rmrr;
4654 	struct device *i_dev;
4655 	int i;
4656 
4657 	down_read(&dmar_global_lock);
4658 	for_each_rmrr_units(rmrr) {
4659 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4660 					  i, i_dev) {
4661 			struct iommu_resv_region *resv;
4662 			enum iommu_resv_type type;
4663 			size_t length;
4664 
4665 			if (i_dev != device &&
4666 			    !is_downstream_to_pci_bridge(device, i_dev))
4667 				continue;
4668 
4669 			length = rmrr->end_address - rmrr->base_address + 1;
4670 
4671 			type = device_rmrr_is_relaxable(device) ?
4672 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4673 
4674 			resv = iommu_alloc_resv_region(rmrr->base_address,
4675 						       length, prot, type);
4676 			if (!resv)
4677 				break;
4678 
4679 			list_add_tail(&resv->list, head);
4680 		}
4681 	}
4682 	up_read(&dmar_global_lock);
4683 
4684 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4685 	if (dev_is_pci(device)) {
4686 		struct pci_dev *pdev = to_pci_dev(device);
4687 
4688 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4689 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4690 						   IOMMU_RESV_DIRECT_RELAXABLE);
4691 			if (reg)
4692 				list_add_tail(&reg->list, head);
4693 		}
4694 	}
4695 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4696 
4697 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4698 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4699 				      0, IOMMU_RESV_MSI);
4700 	if (!reg)
4701 		return;
4702 	list_add_tail(&reg->list, head);
4703 }
4704 
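/*
 * Enable PASID support for @dev: set the PASID-enable bit in its
 * context entry (flushing the context cache if it changed) and enable
 * the device's ATS/PASID/PRI capabilities via iommu_enable_dev_iotlb()
 * if they are not already on.  Returns 0 on success or -EINVAL if the
 * device has no domain or does not support PASID.
 */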
4705 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4706 {
4707 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4708 	struct context_entry *context;
4709 	struct dmar_domain *domain;
4710 	unsigned long flags;
4711 	u64 ctx_lo;
4712 	int ret;
4713 
4714 	domain = info->domain;
4715 	if (!domain)
4716 		return -EINVAL;
4717 
4718 	spin_lock_irqsave(&device_domain_lock, flags);
4719 	spin_lock(&iommu->lock);
4720 
4721 	ret = -EINVAL;
4722 	if (!info->pasid_supported)
4723 		goto out;
4724 
4725 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4726 	if (WARN_ON(!context))
4727 		goto out;
4728 
4729 	ctx_lo = context[0].lo;
4730 
4731 	if (!(ctx_lo & CONTEXT_PASIDE)) {
4732 		ctx_lo |= CONTEXT_PASIDE;
4733 		context[0].lo = ctx_lo;
4734 		wmb();
4735 		iommu->flush.flush_context(iommu,
4736 					   domain->iommu_did[iommu->seq_id],
4737 					   PCI_DEVID(info->bus, info->devfn),
4738 					   DMA_CCMD_MASK_NOBIT,
4739 					   DMA_CCMD_DEVICE_INVL);
4740 	}
4741 
4742 	/* Enable PASID support in the device, if it wasn't already */
4743 	if (!info->pasid_enabled)
4744 		iommu_enable_dev_iotlb(info);
4745 
4746 	ret = 0;
4747 
4748  out:
4749 	spin_unlock(&iommu->lock);
4750 	spin_unlock_irqrestore(&device_domain_lock, flags);
4751 
4752 	return ret;
4753 }
4754 
4755 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4756 {
4757 	if (dev_is_pci(dev))
4758 		return pci_device_group(dev);
4759 	return generic_device_group(dev);
4760 }
4761 
4762 static int intel_iommu_enable_sva(struct device *dev)
4763 {
4764 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4765 	struct intel_iommu *iommu;
4766 	int ret;
4767 
4768 	if (!info || dmar_disabled)
4769 		return -EINVAL;
4770 
4771 	iommu = info->iommu;
4772 	if (!iommu)
4773 		return -EINVAL;
4774 
4775 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4776 		return -ENODEV;
4777 
4778 	if (intel_iommu_enable_pasid(iommu, dev))
4779 		return -ENODEV;
4780 
4781 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4782 		return -EINVAL;
4783 
4784 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4785 	if (!ret)
4786 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4787 
4788 	return ret;
4789 }
4790 
4791 static int intel_iommu_disable_sva(struct device *dev)
4792 {
4793 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4794 	struct intel_iommu *iommu = info->iommu;
4795 	int ret;
4796 
4797 	ret = iommu_unregister_device_fault_handler(dev);
4798 	if (!ret)
4799 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4800 
4801 	return ret;
4802 }
4803 
4804 static int intel_iommu_enable_iopf(struct device *dev)
4805 {
4806 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4807 
4808 	if (info && info->pri_supported)
4809 		return 0;
4810 
4811 	return -ENODEV;
4812 }
4813 
4814 static int
4815 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4816 {
4817 	switch (feat) {
4818 	case IOMMU_DEV_FEAT_IOPF:
4819 		return intel_iommu_enable_iopf(dev);
4820 
4821 	case IOMMU_DEV_FEAT_SVA:
4822 		return intel_iommu_enable_sva(dev);
4823 
4824 	default:
4825 		return -ENODEV;
4826 	}
4827 }
4828 
4829 static int
4830 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4831 {
4832 	switch (feat) {
4833 	case IOMMU_DEV_FEAT_IOPF:
4834 		return 0;
4835 
4836 	case IOMMU_DEV_FEAT_SVA:
4837 		return intel_iommu_disable_sva(dev);
4838 
4839 	default:
4840 		return -ENODEV;
4841 	}
4842 }
4843 
4844 static bool intel_iommu_is_attach_deferred(struct device *dev)
4845 {
4846 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4847 
4848 	return translation_pre_enabled(info->iommu) && !info->domain;
4849 }
4850 
4851 /*
4852  * Check that the device does not live on an external-facing PCI port that is
4853  * marked as untrusted. Such devices are not allowed to apply quirks and thus
4854  * cannot bypass IOMMU restrictions.
4855  */
4856 static bool risky_device(struct pci_dev *pdev)
4857 {
4858 	if (pdev->untrusted) {
4859 		pci_info(pdev,
4860 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4861 			 pdev->vendor, pdev->device);
4862 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4863 		return true;
4864 	}
4865 	return false;
4866 }
4867 
4868 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4869 				       unsigned long iova, size_t size)
4870 {
4871 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4872 	unsigned long pages = aligned_nrpages(iova, size);
4873 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4874 	struct intel_iommu *iommu;
4875 	int iommu_id;
4876 
4877 	for_each_domain_iommu(iommu_id, dmar_domain) {
4878 		iommu = g_iommus[iommu_id];
4879 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
4880 	}
4881 }
4882 
4883 const struct iommu_ops intel_iommu_ops = {
4884 	.capable		= intel_iommu_capable,
4885 	.domain_alloc		= intel_iommu_domain_alloc,
4886 	.probe_device		= intel_iommu_probe_device,
4887 	.probe_finalize		= intel_iommu_probe_finalize,
4888 	.release_device		= intel_iommu_release_device,
4889 	.get_resv_regions	= intel_iommu_get_resv_regions,
4890 	.put_resv_regions	= generic_iommu_put_resv_regions,
4891 	.device_group		= intel_iommu_device_group,
4892 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4893 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4894 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4895 	.def_domain_type	= device_def_domain_type,
4896 	.pgsize_bitmap		= SZ_4K,
4897 #ifdef CONFIG_INTEL_IOMMU_SVM
4898 	.sva_bind		= intel_svm_bind,
4899 	.sva_unbind		= intel_svm_unbind,
4900 	.sva_get_pasid		= intel_svm_get_pasid,
4901 	.page_response		= intel_svm_page_response,
4902 #endif
4903 	.default_domain_ops = &(const struct iommu_domain_ops) {
4904 		.attach_dev		= intel_iommu_attach_device,
4905 		.detach_dev		= intel_iommu_detach_device,
4906 		.map_pages		= intel_iommu_map_pages,
4907 		.unmap_pages		= intel_iommu_unmap_pages,
4908 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4909 		.flush_iotlb_all        = intel_flush_iotlb_all,
4910 		.iotlb_sync		= intel_iommu_tlb_sync,
4911 		.iova_to_phys		= intel_iommu_iova_to_phys,
4912 		.free			= intel_iommu_domain_free,
4913 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4914 	}
4915 };
4916 
4917 static void quirk_iommu_igfx(struct pci_dev *dev)
4918 {
4919 	if (risky_device(dev))
4920 		return;
4921 
4922 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4923 	dmar_map_gfx = 0;
4924 }
4925 
4926 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4934 
4935 /* Broadwell igfx malfunctions with dmar */
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4948 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4960 
4961 static void quirk_iommu_rwbf(struct pci_dev *dev)
4962 {
4963 	if (risky_device(dev))
4964 		return;
4965 
4966 	/*
4967 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4968 	 * but needs it. Same seems to hold for the desktop versions.
4969 	 */
4970 	pci_info(dev, "Forcing write-buffer flush capability\n");
4971 	rwbf_quirk = 1;
4972 }
4973 
4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4981 
4982 #define GGC 0x52
4983 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4984 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4985 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4986 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4987 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4988 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4989 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4990 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4991 
4992 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4993 {
4994 	unsigned short ggc;
4995 
4996 	if (risky_device(dev))
4997 		return;
4998 
4999 	if (pci_read_config_word(dev, GGC, &ggc))
5000 		return;
5001 
5002 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5003 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5004 		dmar_map_gfx = 0;
5005 	} else if (dmar_map_gfx) {
5006 		/* we have to ensure the gfx device is idle before we flush */
5007 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5008 		iommu_set_dma_strict();
5009 	}
5010 }
5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5015 
5016 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5017 {
5018 	unsigned short ver;
5019 
5020 	if (!IS_GFX_DEVICE(dev))
5021 		return;
5022 
5023 	ver = (dev->device >> 8) & 0xff;
5024 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5025 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5026 	    ver != 0x9a && ver != 0xa7)
5027 		return;
5028 
5029 	if (risky_device(dev))
5030 		return;
5031 
5032 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5033 	iommu_skip_te_disable = 1;
5034 }
5035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5036 
5037 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5038    ISOCH DMAR unit for the Azalia sound device, but not give it any
5039    TLB entries, which causes it to deadlock. Check for that.  We do
5040    this in a function called from init_dmars(), instead of in a PCI
5041    quirk, because we don't want to print the obnoxious "BIOS broken"
5042    message if VT-d is actually disabled.
5043 */
5044 static void __init check_tylersburg_isoch(void)
5045 {
5046 	struct pci_dev *pdev;
5047 	uint32_t vtisochctrl;
5048 
5049 	/* If there's no Azalia in the system anyway, forget it. */
5050 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5051 	if (!pdev)
5052 		return;
5053 
5054 	if (risky_device(pdev)) {
5055 		pci_dev_put(pdev);
5056 		return;
5057 	}
5058 
5059 	pci_dev_put(pdev);
5060 
5061 	/* System Management Registers. Might be hidden, in which case
5062 	   we can't do the sanity check. But that's OK, because the
5063 	   known-broken BIOSes _don't_ actually hide it, so far. */
5064 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5065 	if (!pdev)
5066 		return;
5067 
5068 	if (risky_device(pdev)) {
5069 		pci_dev_put(pdev);
5070 		return;
5071 	}
5072 
5073 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5074 		pci_dev_put(pdev);
5075 		return;
5076 	}
5077 
5078 	pci_dev_put(pdev);
5079 
5080 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5081 	if (vtisochctrl & 1)
5082 		return;
5083 
5084 	/* Drop all bits other than the number of TLB entries */
5085 	vtisochctrl &= 0x1c;
5086 
5087 	/* If we have the recommended number of TLB entries (16), fine. */
5088 	if (vtisochctrl == 0x10)
5089 		return;
5090 
5091 	/* Zero TLB entries? You get to ride the short bus to school. */
5092 	if (!vtisochctrl) {
5093 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5094 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5095 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5096 		     dmi_get_system_info(DMI_BIOS_VERSION),
5097 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5098 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5099 		return;
5100 	}
5101 
5102 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5103 	       vtisochctrl);
5104 }
5105