xref: /openbmc/linux/drivers/iommu/intel/iommu.c (revision 72661ff7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28 
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
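
/*
 * Illustrative example (not part of the driver logic): with a guest
 * address width of 48 and VTD_PAGE_SHIFT == 12,
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 *	DOMAIN_MAX_ADDR(48)   == ((1ULL << 36) - 1) << 12
 * On a 64-bit kernel DOMAIN_MAX_PFN(48) equals __DOMAIN_MAX_PFN(48);
 * on a 32-bit kernel it is clamped to ULONG_MAX.
 */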
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
69 static inline int agaw_to_level(int agaw)
70 {
71 	return agaw + 2;
72 }
73 
74 static inline int agaw_to_width(int agaw)
75 {
76 	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78 
79 static inline int width_to_agaw(int width)
80 {
81 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
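
/*
 * Worked example for the helpers above (illustrative only): a 48-bit
 * address width gives width_to_agaw(48) == 2, agaw_to_level(2) == 4
 * (a 4-level page table) and agaw_to_width(2) == 48; a 57-bit width
 * gives agaw 3, i.e. a 5-level page table.
 */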
83 
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86 	return (level - 1) * LEVEL_STRIDE;
87 }
88 
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93 
94 static inline u64 level_mask(int level)
95 {
96 	return -1ULL << level_to_offset_bits(level);
97 }
98 
99 static inline u64 level_size(int level)
100 {
101 	return 1ULL << level_to_offset_bits(level);
102 }
103 
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106 	return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108 
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111 	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
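
/*
 * Example (illustrative only): at level 2, level_to_offset_bits() is 9,
 * so pfn_level_offset(pfn, 2) extracts bits 17:9 of the pfn,
 * level_size(2) is 512 pfns, and a single level-2 superpage (2MiB with
 * 4KiB VT-d pages) covers lvl_to_nr_pages(2) == 512 pages.
 */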
113 
114 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122 	return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126 	return page_to_dma_pfn(virt_to_page(p));
127 }
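
/*
 * Illustrative note: on x86 with 4KiB MM pages, PAGE_SHIFT and
 * VTD_PAGE_SHIFT are both 12, so the shift above is zero and MM and DMA
 * pfns coincide; the conversion only matters if the MM page size is
 * larger than the 4KiB VT-d page size.
 */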
128 
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131 
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136 
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144 
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146 
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153 	if (!(re->lo & 1))
154 		return 0;
155 
156 	return re->lo & VTD_PAGE_MASK;
157 }
158 
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165 	if (!(re->hi & 1))
166 		return 0;
167 
168 	return re->hi & VTD_PAGE_MASK;
169 }
170 
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173 	context->lo &= ~(1ULL << 11);
174 }
175 
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178 	return !!(context->lo & (1ULL << 11));
179 }
180 
181 static inline void context_set_copied(struct context_entry *context)
182 {
183 	context->hi |= (1ull << 3);
184 }
185 
186 static inline bool context_copied(struct context_entry *context)
187 {
188 	return !!(context->hi & (1ULL << 3));
189 }
190 
191 static inline bool __context_present(struct context_entry *context)
192 {
193 	return (context->lo & 1);
194 }
195 
196 bool context_present(struct context_entry *context)
197 {
198 	return context_pasid_enabled(context) ?
199 	     __context_present(context) :
200 	     __context_present(context) && !context_copied(context);
201 }
202 
203 static inline void context_set_present(struct context_entry *context)
204 {
205 	context->lo |= 1;
206 }
207 
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210 	context->lo &= (((u64)-1) << 2) | 1;
211 }
212 
213 static inline void context_set_translation_type(struct context_entry *context,
214 						unsigned long value)
215 {
216 	context->lo &= (((u64)-1) << 4) | 3;
217 	context->lo |= (value & 3) << 2;
218 }
219 
220 static inline void context_set_address_root(struct context_entry *context,
221 					    unsigned long value)
222 {
223 	context->lo &= ~VTD_PAGE_MASK;
224 	context->lo |= value & VTD_PAGE_MASK;
225 }
226 
227 static inline void context_set_address_width(struct context_entry *context,
228 					     unsigned long value)
229 {
230 	context->hi |= value & 7;
231 }
232 
233 static inline void context_set_domain_id(struct context_entry *context,
234 					 unsigned long value)
235 {
236 	context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238 
239 static inline int context_domain_id(struct context_entry *c)
240 {
241 	return((c->hi >> 8) & 0xffff);
242 }
243 
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246 	context->lo = 0;
247 	context->hi = 0;
248 }
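
/*
 * Summary of the legacy context-entry layout as manipulated by the
 * helpers above: the low 64 bits hold the present bit (bit 0), the
 * fault-processing disable bit (bit 1, cleared by
 * context_set_fault_enable()), the translation type (bits 3:2) and the
 * page-table root address (bits 63:12); the high 64 bits hold the
 * address width (bits 2:0) and the domain id (bits 23:8). Bit 11 of the
 * low word and bit 3 of the high word are used here to track PASID
 * enablement and copied (kdump) entries, respectively.
 */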
249 
250 /*
251  * This domain is a statically identity mapping domain.
252  *	1. This domain creates a static 1:1 mapping to all usable memory.
253  *	2. It maps to each iommu if successful.
254  *	3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258 
259 #define for_each_domain_iommu(idx, domain)			\
260 	for (idx = 0; idx < g_num_of_iommus; idx++)		\
261 		if (domain->iommu_refcnt[idx])
262 
263 struct dmar_rmrr_unit {
264 	struct list_head list;		/* list of rmrr units	*/
265 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
266 	u64	base_address;		/* reserved base address*/
267 	u64	end_address;		/* reserved end address */
268 	struct dmar_dev_scope *devices;	/* target devices */
269 	int	devices_cnt;		/* target device count */
270 };
271 
272 struct dmar_atsr_unit {
273 	struct list_head list;		/* list of ATSR units */
274 	struct acpi_dmar_header *hdr;	/* ACPI header */
275 	struct dmar_dev_scope *devices;	/* target devices */
276 	int devices_cnt;		/* target device count */
277 	u8 include_all:1;		/* include all ports */
278 };
279 
280 struct dmar_satc_unit {
281 	struct list_head list;		/* list of SATC units */
282 	struct acpi_dmar_header *hdr;	/* ACPI header */
283 	struct dmar_dev_scope *devices;	/* target devices */
284 	struct intel_iommu *iommu;	/* the corresponding iommu */
285 	int devices_cnt;		/* target device count */
286 	u8 atc_required:1;		/* ATS is required */
287 };
288 
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292 
293 #define for_each_rmrr_units(rmrr) \
294 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295 
296 /* number of IOMMUs; used to size and index g_iommus */
297 static int g_num_of_iommus;
298 
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302 
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305 
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308 
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313 
314 #define IDENTMAP_GFX		2
315 #define IDENTMAP_AZALIA		4
316 
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319 
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322 
323 /*
324  * Iterate over elements in device_domain_list and call the specified
325  * callback @fn against each element.
326  */
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328 				     void *data), void *data)
329 {
330 	int ret = 0;
331 	unsigned long flags;
332 	struct device_domain_info *info;
333 
334 	spin_lock_irqsave(&device_domain_lock, flags);
335 	list_for_each_entry(info, &device_domain_list, global) {
336 		ret = fn(info, data);
337 		if (ret) {
338 			spin_unlock_irqrestore(&device_domain_lock, flags);
339 			return ret;
340 		}
341 	}
342 	spin_unlock_irqrestore(&device_domain_lock, flags);
343 
344 	return 0;
345 }
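
/*
 * Minimal usage sketch for for_each_device_domain(); the callback name
 * below is hypothetical and only illustrates the calling convention:
 *
 *	static int count_info(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(count_info, &count);
 *
 * Returning 0 keeps the walk going; the walk stops and returns the
 * callback's value on the first non-zero return, otherwise it returns 0
 * after visiting every entry.
 */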
346 
347 const struct iommu_ops intel_iommu_ops;
348 
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
350 {
351 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
352 }
353 
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
355 {
356 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
357 }
358 
359 static void init_translation_status(struct intel_iommu *iommu)
360 {
361 	u32 gsts;
362 
363 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
364 	if (gsts & DMA_GSTS_TES)
365 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
366 }
367 
368 static int __init intel_iommu_setup(char *str)
369 {
370 	if (!str)
371 		return -EINVAL;
372 
373 	while (*str) {
374 		if (!strncmp(str, "on", 2)) {
375 			dmar_disabled = 0;
376 			pr_info("IOMMU enabled\n");
377 		} else if (!strncmp(str, "off", 3)) {
378 			dmar_disabled = 1;
379 			no_platform_optin = 1;
380 			pr_info("IOMMU disabled\n");
381 		} else if (!strncmp(str, "igfx_off", 8)) {
382 			dmar_map_gfx = 0;
383 			pr_info("Disable GFX device mapping\n");
384 		} else if (!strncmp(str, "forcedac", 8)) {
385 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386 			iommu_dma_forcedac = true;
387 		} else if (!strncmp(str, "strict", 6)) {
388 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389 			iommu_set_dma_strict();
390 		} else if (!strncmp(str, "sp_off", 6)) {
391 			pr_info("Disable supported super page\n");
392 			intel_iommu_superpage = 0;
393 		} else if (!strncmp(str, "sm_on", 5)) {
394 			pr_info("Enable scalable mode if hardware supports\n");
395 			intel_iommu_sm = 1;
396 		} else if (!strncmp(str, "sm_off", 6)) {
397 			pr_info("Scalable mode is disallowed\n");
398 			intel_iommu_sm = 0;
399 		} else if (!strncmp(str, "tboot_noforce", 13)) {
400 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401 			intel_iommu_tboot_noforce = 1;
402 		} else {
403 			pr_notice("Unknown option - '%s'\n", str);
404 		}
405 
406 		str += strcspn(str, ",");
407 		while (*str == ',')
408 			str++;
409 	}
410 
411 	return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
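
/*
 * Example kernel command-line usages handled by the parser above
 * (multiple options may be combined with commas):
 *
 *	intel_iommu=on
 *	intel_iommu=on,sm_on
 *	intel_iommu=igfx_off,tboot_noforce
 *	intel_iommu=off
 */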
414 
415 void *alloc_pgtable_page(int node)
416 {
417 	struct page *page;
418 	void *vaddr = NULL;
419 
420 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421 	if (page)
422 		vaddr = page_address(page);
423 	return vaddr;
424 }
425 
426 void free_pgtable_page(void *vaddr)
427 {
428 	free_page((unsigned long)vaddr);
429 }
430 
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435 
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438 	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440 
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442 				       unsigned long pfn)
443 {
444 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445 
446 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448 
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451 	unsigned long sagaw;
452 	int agaw;
453 
454 	sagaw = cap_sagaw(iommu->cap);
455 	for (agaw = width_to_agaw(max_gaw);
456 	     agaw >= 0; agaw--) {
457 		if (test_bit(agaw, &sagaw))
458 			break;
459 	}
460 
461 	return agaw;
462 }
463 
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471 
472 /*
473  * Calculate agaw for each iommu.
474  * "SAGAW" may differ across iommus; use a default agaw, and fall back
475  * to a smaller supported agaw for iommus that don't support the default.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
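
/*
 * Example (illustrative only): with DEFAULT_DOMAIN_ADDRESS_WIDTH of 57,
 * the search starts at agaw 3 (5-level, 57-bit). If the unit's SAGAW
 * field only advertises agaw 2, iommu_calculate_agaw() returns 2 and a
 * 4-level, 48-bit page table is used instead.
 */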
481 
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485 	int iommu_id;
486 
487 	/* si_domain and vm domain should not get here. */
488 	if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489 		return NULL;
490 
491 	for_each_domain_iommu(iommu_id, domain)
492 		break;
493 
494 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495 		return NULL;
496 
497 	return g_iommus[iommu_id];
498 }
499 
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502 	return sm_supported(iommu) ?
503 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
504 }
505 
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
507 {
508 	struct dmar_drhd_unit *drhd;
509 	struct intel_iommu *iommu;
510 	bool found = false;
511 	int i;
512 
513 	domain->iommu_coherency = true;
514 
515 	for_each_domain_iommu(i, domain) {
516 		found = true;
517 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
518 			domain->iommu_coherency = false;
519 			break;
520 		}
521 	}
522 	if (found)
523 		return;
524 
525 	/* No hardware attached; use lowest common denominator */
526 	rcu_read_lock();
527 	for_each_active_iommu(iommu, drhd) {
528 		if (!iommu_paging_structure_coherency(iommu)) {
529 			domain->iommu_coherency = false;
530 			break;
531 		}
532 	}
533 	rcu_read_unlock();
534 }
535 
536 static bool domain_update_iommu_snooping(struct intel_iommu *skip)
537 {
538 	struct dmar_drhd_unit *drhd;
539 	struct intel_iommu *iommu;
540 	bool ret = true;
541 
542 	rcu_read_lock();
543 	for_each_active_iommu(iommu, drhd) {
544 		if (iommu != skip) {
545 			/*
546 			 * If the hardware is operating in scalable mode,
547 			 * snooping control is always supported since we
548 			 * always set the PASID-table-entry.PGSNP bit if the
549 			 * domain is managed externally (UNMANAGED).
550 			 */
551 			if (!sm_supported(iommu) &&
552 			    !ecap_sc_support(iommu->ecap)) {
553 				ret = false;
554 				break;
555 			}
556 		}
557 	}
558 	rcu_read_unlock();
559 
560 	return ret;
561 }
562 
563 static int domain_update_iommu_superpage(struct dmar_domain *domain,
564 					 struct intel_iommu *skip)
565 {
566 	struct dmar_drhd_unit *drhd;
567 	struct intel_iommu *iommu;
568 	int mask = 0x3;
569 
570 	if (!intel_iommu_superpage)
571 		return 0;
572 
573 	/* set iommu_superpage to the smallest common denominator */
574 	rcu_read_lock();
575 	for_each_active_iommu(iommu, drhd) {
576 		if (iommu != skip) {
577 			if (domain && domain_use_first_level(domain)) {
578 				if (!cap_fl1gp_support(iommu->cap))
579 					mask = 0x1;
580 			} else {
581 				mask &= cap_super_page_val(iommu->cap);
582 			}
583 
584 			if (!mask)
585 				break;
586 		}
587 	}
588 	rcu_read_unlock();
589 
590 	return fls(mask);
591 }
592 
593 static int domain_update_device_node(struct dmar_domain *domain)
594 {
595 	struct device_domain_info *info;
596 	int nid = NUMA_NO_NODE;
597 
598 	assert_spin_locked(&device_domain_lock);
599 
600 	if (list_empty(&domain->devices))
601 		return NUMA_NO_NODE;
602 
603 	list_for_each_entry(info, &domain->devices, link) {
604 		if (!info->dev)
605 			continue;
606 
607 		/*
608 		 * There could be multiple device NUMA nodes, as devices
609 		 * within the same domain may sit behind different IOMMUs.
610 		 * There is no perfect answer in such a situation, so we use
611 		 * a first-come, first-served policy.
612 		 */
613 		nid = dev_to_node(info->dev);
614 		if (nid != NUMA_NO_NODE)
615 			break;
616 	}
617 
618 	return nid;
619 }
620 
621 static void domain_update_iotlb(struct dmar_domain *domain);
622 
623 /* Return the super pagesize bitmap if supported. */
624 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
625 {
626 	unsigned long bitmap = 0;
627 
628 	/*
629 	 * A 1-level super page supports a page size of 2MiB; a 2-level super
630 	 * page supports page sizes of both 2MiB and 1GiB.
631 	 */
632 	if (domain->iommu_superpage == 1)
633 		bitmap |= SZ_2M;
634 	else if (domain->iommu_superpage == 2)
635 		bitmap |= SZ_2M | SZ_1G;
636 
637 	return bitmap;
638 }
639 
640 /* Some capabilities may be different across iommus */
641 static void domain_update_iommu_cap(struct dmar_domain *domain)
642 {
643 	domain_update_iommu_coherency(domain);
644 	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
645 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
646 
647 	/*
648 	 * If RHSA is missing, we should default to the device NUMA domain
649 	 * as a fallback.
650 	 */
651 	if (domain->nid == NUMA_NO_NODE)
652 		domain->nid = domain_update_device_node(domain);
653 
654 	/*
655 	 * First-level translation restricts the input-address to a
656 	 * canonical address (i.e., address bits 63:N have the same
657 	 * value as address bit [N-1], where N is 48-bits with 4-level
658 	 * paging and 57-bits with 5-level paging). Hence, skip bit
659 	 * [N-1].
660 	 */
661 	if (domain_use_first_level(domain))
662 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
663 	else
664 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
665 
666 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
667 	domain_update_iotlb(domain);
668 }
669 
670 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
671 					 u8 devfn, int alloc)
672 {
673 	struct root_entry *root = &iommu->root_entry[bus];
674 	struct context_entry *context;
675 	u64 *entry;
676 
677 	entry = &root->lo;
678 	if (sm_supported(iommu)) {
679 		if (devfn >= 0x80) {
680 			devfn -= 0x80;
681 			entry = &root->hi;
682 		}
683 		devfn *= 2;
684 	}
685 	if (*entry & 1)
686 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
687 	else {
688 		unsigned long phy_addr;
689 		if (!alloc)
690 			return NULL;
691 
692 		context = alloc_pgtable_page(iommu->node);
693 		if (!context)
694 			return NULL;
695 
696 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
697 		phy_addr = virt_to_phys((void *)context);
698 		*entry = phy_addr | 1;
699 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
700 	}
701 	return &context[devfn];
702 }
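
/*
 * Layout note for the lookup above: in legacy mode, root->lo points at a
 * single 4KiB context table indexed directly by devfn (256 entries of
 * 16 bytes). In scalable mode each context entry occupies two slots
 * (256 bits), so devfn is doubled and the root entry is split: root->lo
 * covers devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff.
 */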
703 
704 /**
705  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
706  *				 sub-hierarchy of a candidate PCI-PCI bridge
707  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
708  * @bridge: the candidate PCI-PCI bridge
709  *
710  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
711  */
712 static bool
713 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
714 {
715 	struct pci_dev *pdev, *pbridge;
716 
717 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
718 		return false;
719 
720 	pdev = to_pci_dev(dev);
721 	pbridge = to_pci_dev(bridge);
722 
723 	if (pbridge->subordinate &&
724 	    pbridge->subordinate->number <= pdev->bus->number &&
725 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
726 		return true;
727 
728 	return false;
729 }
730 
731 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
732 {
733 	struct dmar_drhd_unit *drhd;
734 	u32 vtbar;
735 	int rc;
736 
737 	/* We know that this device on this chipset has its own IOMMU.
738 	 * If we find it under a different IOMMU, then the BIOS is lying
739 	 * to us. Hope that the IOMMU for this device is actually
740 	 * disabled, and it needs no translation...
741 	 */
742 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
743 	if (rc) {
744 		/* "can't" happen */
745 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
746 		return false;
747 	}
748 	vtbar &= 0xffff0000;
749 
750 	/* We know that this IOMMU should be at offset 0xa000 from vtbar */
751 	drhd = dmar_find_matched_drhd_unit(pdev);
752 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
753 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
754 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
755 		return true;
756 	}
757 
758 	return false;
759 }
760 
761 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
762 {
763 	if (!iommu || iommu->drhd->ignored)
764 		return true;
765 
766 	if (dev_is_pci(dev)) {
767 		struct pci_dev *pdev = to_pci_dev(dev);
768 
769 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
770 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
771 		    quirk_ioat_snb_local_iommu(pdev))
772 			return true;
773 	}
774 
775 	return false;
776 }
777 
778 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
779 {
780 	struct dmar_drhd_unit *drhd = NULL;
781 	struct pci_dev *pdev = NULL;
782 	struct intel_iommu *iommu;
783 	struct device *tmp;
784 	u16 segment = 0;
785 	int i;
786 
787 	if (!dev)
788 		return NULL;
789 
790 	if (dev_is_pci(dev)) {
791 		struct pci_dev *pf_pdev;
792 
793 		pdev = pci_real_dma_dev(to_pci_dev(dev));
794 
795 		/* VFs aren't listed in scope tables; we need to look up
796 		 * the PF instead to find the IOMMU. */
797 		pf_pdev = pci_physfn(pdev);
798 		dev = &pf_pdev->dev;
799 		segment = pci_domain_nr(pdev->bus);
800 	} else if (has_acpi_companion(dev))
801 		dev = &ACPI_COMPANION(dev)->dev;
802 
803 	rcu_read_lock();
804 	for_each_iommu(iommu, drhd) {
805 		if (pdev && segment != drhd->segment)
806 			continue;
807 
808 		for_each_active_dev_scope(drhd->devices,
809 					  drhd->devices_cnt, i, tmp) {
810 			if (tmp == dev) {
811 				/* For a VF use its original BDF# not that of the PF
812 				 * which we used for the IOMMU lookup. Strictly speaking
813 				 * we could do this for all PCI devices; we only need to
814 				 * get the BDF# from the scope table for ACPI matches. */
815 				if (pdev && pdev->is_virtfn)
816 					goto got_pdev;
817 
818 				if (bus && devfn) {
819 					*bus = drhd->devices[i].bus;
820 					*devfn = drhd->devices[i].devfn;
821 				}
822 				goto out;
823 			}
824 
825 			if (is_downstream_to_pci_bridge(dev, tmp))
826 				goto got_pdev;
827 		}
828 
829 		if (pdev && drhd->include_all) {
830 got_pdev:
831 			if (bus && devfn) {
832 				*bus = pdev->bus->number;
833 				*devfn = pdev->devfn;
834 			}
835 			goto out;
836 		}
837 	}
838 	iommu = NULL;
839 out:
840 	if (iommu_is_dummy(iommu, dev))
841 		iommu = NULL;
842 
843 	rcu_read_unlock();
844 
845 	return iommu;
846 }
847 
848 static void domain_flush_cache(struct dmar_domain *domain,
849 			       void *addr, int size)
850 {
851 	if (!domain->iommu_coherency)
852 		clflush_cache_range(addr, size);
853 }
854 
855 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
856 {
857 	struct context_entry *context;
858 	int ret = 0;
859 	unsigned long flags;
860 
861 	spin_lock_irqsave(&iommu->lock, flags);
862 	context = iommu_context_addr(iommu, bus, devfn, 0);
863 	if (context)
864 		ret = context_present(context);
865 	spin_unlock_irqrestore(&iommu->lock, flags);
866 	return ret;
867 }
868 
869 static void free_context_table(struct intel_iommu *iommu)
870 {
871 	int i;
872 	unsigned long flags;
873 	struct context_entry *context;
874 
875 	spin_lock_irqsave(&iommu->lock, flags);
876 	if (!iommu->root_entry)
877 		goto out;
879 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
880 		context = iommu_context_addr(iommu, i, 0, 0);
881 		if (context)
882 			free_pgtable_page(context);
883 
884 		if (!sm_supported(iommu))
885 			continue;
886 
887 		context = iommu_context_addr(iommu, i, 0x80, 0);
888 		if (context)
889 			free_pgtable_page(context);
890 
891 	}
892 	free_pgtable_page(iommu->root_entry);
893 	iommu->root_entry = NULL;
894 out:
895 	spin_unlock_irqrestore(&iommu->lock, flags);
896 }
897 
898 #ifdef CONFIG_DMAR_DEBUG
899 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
900 {
901 	struct device_domain_info *info;
902 	struct dma_pte *parent, *pte;
903 	struct dmar_domain *domain;
904 	int offset, level;
905 
906 	info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
907 	if (!info || !info->domain) {
908 		pr_info("device [%02x:%02x.%d] not probed\n",
909 			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
910 		return;
911 	}
912 
913 	domain = info->domain;
914 	level = agaw_to_level(domain->agaw);
915 	parent = domain->pgd;
916 	if (!parent) {
917 		pr_info("no page table setup\n");
918 		return;
919 	}
920 
921 	while (1) {
922 		offset = pfn_level_offset(pfn, level);
923 		pte = &parent[offset];
924 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
925 			pr_info("PTE not present at level %d\n", level);
926 			break;
927 		}
928 
929 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
930 
931 		if (level == 1)
932 			break;
933 
934 		parent = phys_to_virt(dma_pte_addr(pte));
935 		level--;
936 	}
937 }
938 
939 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
940 			  unsigned long long addr, u32 pasid)
941 {
942 	struct pasid_dir_entry *dir, *pde;
943 	struct pasid_entry *entries, *pte;
944 	struct context_entry *ctx_entry;
945 	struct root_entry *rt_entry;
946 	u8 devfn = source_id & 0xff;
947 	u8 bus = source_id >> 8;
948 	int i, dir_index, index;
949 
950 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
951 
952 	/* root entry dump */
953 	rt_entry = &iommu->root_entry[bus];
954 	if (!rt_entry) {
955 		pr_info("root table entry is not present\n");
956 		return;
957 	}
958 
959 	if (sm_supported(iommu))
960 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
961 			rt_entry->hi, rt_entry->lo);
962 	else
963 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
964 
965 	/* context entry dump */
966 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
967 	if (!ctx_entry) {
968 		pr_info("context table entry is not present\n");
969 		return;
970 	}
971 
972 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
973 		ctx_entry->hi, ctx_entry->lo);
974 
975 	/* legacy mode does not require PASID entries */
976 	if (!sm_supported(iommu))
977 		goto pgtable_walk;
978 
979 	/* get the pointer to pasid directory entry */
980 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
981 	if (!dir) {
982 		pr_info("pasid directory entry is not present\n");
983 		return;
984 	}
985 	/* For request-without-pasid, get the pasid from context entry */
986 	if (intel_iommu_sm && pasid == INVALID_IOASID)
987 		pasid = PASID_RID2PASID;
988 
989 	dir_index = pasid >> PASID_PDE_SHIFT;
990 	pde = &dir[dir_index];
991 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
992 
993 	/* get the pointer to the pasid table entry */
994 	entries = get_pasid_table_from_pde(pde);
995 	if (!entries) {
996 		pr_info("pasid table entry is not present\n");
997 		return;
998 	}
999 	index = pasid & PASID_PTE_MASK;
1000 	pte = &entries[index];
1001 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1002 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1003 
1004 pgtable_walk:
1005 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1006 }
1007 #endif
1008 
1009 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1010 				      unsigned long pfn, int *target_level)
1011 {
1012 	struct dma_pte *parent, *pte;
1013 	int level = agaw_to_level(domain->agaw);
1014 	int offset;
1015 
1016 	BUG_ON(!domain->pgd);
1017 
1018 	if (!domain_pfn_supported(domain, pfn))
1019 		/* Address beyond IOMMU's addressing capabilities. */
1020 		return NULL;
1021 
1022 	parent = domain->pgd;
1023 
1024 	while (1) {
1025 		void *tmp_page;
1026 
1027 		offset = pfn_level_offset(pfn, level);
1028 		pte = &parent[offset];
1029 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1030 			break;
1031 		if (level == *target_level)
1032 			break;
1033 
1034 		if (!dma_pte_present(pte)) {
1035 			uint64_t pteval;
1036 
1037 			tmp_page = alloc_pgtable_page(domain->nid);
1038 
1039 			if (!tmp_page)
1040 				return NULL;
1041 
1042 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1043 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1044 			if (domain_use_first_level(domain)) {
1045 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1046 				if (iommu_is_dma_domain(&domain->domain))
1047 					pteval |= DMA_FL_PTE_ACCESS;
1048 			}
1049 			if (cmpxchg64(&pte->val, 0ULL, pteval))
1050 				/* Someone else set it while we were thinking; use theirs. */
1051 				free_pgtable_page(tmp_page);
1052 			else
1053 				domain_flush_cache(domain, pte, sizeof(*pte));
1054 		}
1055 		if (level == 1)
1056 			break;
1057 
1058 		parent = phys_to_virt(dma_pte_addr(pte));
1059 		level--;
1060 	}
1061 
1062 	if (!*target_level)
1063 		*target_level = level;
1064 
1065 	return pte;
1066 }
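
/*
 * Usage sketch (illustrative only): callers that want a 4KiB leaf pass
 * a target_level of 1 and missing intermediate tables are allocated on
 * the way down; callers that only want to inspect the existing mapping
 * pass a target_level of 0, in which case the walk stops at the first
 * superpage or non-present entry and *target_level is updated to the
 * level actually reached.
 */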
1067 
1068 /* Return the address's pte at a specific level */
1069 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1070 					 unsigned long pfn,
1071 					 int level, int *large_page)
1072 {
1073 	struct dma_pte *parent, *pte;
1074 	int total = agaw_to_level(domain->agaw);
1075 	int offset;
1076 
1077 	parent = domain->pgd;
1078 	while (level <= total) {
1079 		offset = pfn_level_offset(pfn, total);
1080 		pte = &parent[offset];
1081 		if (level == total)
1082 			return pte;
1083 
1084 		if (!dma_pte_present(pte)) {
1085 			*large_page = total;
1086 			break;
1087 		}
1088 
1089 		if (dma_pte_superpage(pte)) {
1090 			*large_page = total;
1091 			return pte;
1092 		}
1093 
1094 		parent = phys_to_virt(dma_pte_addr(pte));
1095 		total--;
1096 	}
1097 	return NULL;
1098 }
1099 
1100 /* Clear last-level ptes; a TLB flush should follow */
1101 static void dma_pte_clear_range(struct dmar_domain *domain,
1102 				unsigned long start_pfn,
1103 				unsigned long last_pfn)
1104 {
1105 	unsigned int large_page;
1106 	struct dma_pte *first_pte, *pte;
1107 
1108 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1109 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1110 	BUG_ON(start_pfn > last_pfn);
1111 
1112 	/* we don't need lock here; nobody else touches the iova range */
1113 	do {
1114 		large_page = 1;
1115 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1116 		if (!pte) {
1117 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1118 			continue;
1119 		}
1120 		do {
1121 			dma_clear_pte(pte);
1122 			start_pfn += lvl_to_nr_pages(large_page);
1123 			pte++;
1124 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1125 
1126 		domain_flush_cache(domain, first_pte,
1127 				   (void *)pte - (void *)first_pte);
1128 
1129 	} while (start_pfn && start_pfn <= last_pfn);
1130 }
1131 
1132 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1133 			       int retain_level, struct dma_pte *pte,
1134 			       unsigned long pfn, unsigned long start_pfn,
1135 			       unsigned long last_pfn)
1136 {
1137 	pfn = max(start_pfn, pfn);
1138 	pte = &pte[pfn_level_offset(pfn, level)];
1139 
1140 	do {
1141 		unsigned long level_pfn;
1142 		struct dma_pte *level_pte;
1143 
1144 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1145 			goto next;
1146 
1147 		level_pfn = pfn & level_mask(level);
1148 		level_pte = phys_to_virt(dma_pte_addr(pte));
1149 
1150 		if (level > 2) {
1151 			dma_pte_free_level(domain, level - 1, retain_level,
1152 					   level_pte, level_pfn, start_pfn,
1153 					   last_pfn);
1154 		}
1155 
1156 		/*
1157 		 * Free the page table if we're below the level we want to
1158 		 * retain and the range covers the entire table.
1159 		 */
1160 		if (level < retain_level && !(start_pfn > level_pfn ||
1161 		      last_pfn < level_pfn + level_size(level) - 1)) {
1162 			dma_clear_pte(pte);
1163 			domain_flush_cache(domain, pte, sizeof(*pte));
1164 			free_pgtable_page(level_pte);
1165 		}
1166 next:
1167 		pfn += level_size(level);
1168 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1169 }
1170 
1171 /*
1172  * clear last level (leaf) ptes and free page table pages below the
1173  * level we wish to keep intact.
1174  */
1175 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1176 				   unsigned long start_pfn,
1177 				   unsigned long last_pfn,
1178 				   int retain_level)
1179 {
1180 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1181 
1182 	/* We don't need lock here; nobody else touches the iova range */
1183 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1184 			   domain->pgd, 0, start_pfn, last_pfn);
1185 
1186 	/* free pgd */
1187 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188 		free_pgtable_page(domain->pgd);
1189 		domain->pgd = NULL;
1190 	}
1191 }
1192 
1193 /* When a page at a given level is being unlinked from its parent, we don't
1194    need to *modify* it at all. All we need to do is make a list of all the
1195    pages which can be freed just as soon as we've flushed the IOTLB and we
1196    know the hardware page-walk will no longer touch them.
1197    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1198    be freed. */
1199 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1200 				    int level, struct dma_pte *pte,
1201 				    struct list_head *freelist)
1202 {
1203 	struct page *pg;
1204 
1205 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1206 	list_add_tail(&pg->lru, freelist);
1207 
1208 	if (level == 1)
1209 		return;
1210 
1211 	pte = page_address(pg);
1212 	do {
1213 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1214 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1215 		pte++;
1216 	} while (!first_pte_in_page(pte));
1217 }
1218 
1219 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 				struct dma_pte *pte, unsigned long pfn,
1221 				unsigned long start_pfn, unsigned long last_pfn,
1222 				struct list_head *freelist)
1223 {
1224 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1225 
1226 	pfn = max(start_pfn, pfn);
1227 	pte = &pte[pfn_level_offset(pfn, level)];
1228 
1229 	do {
1230 		unsigned long level_pfn = pfn & level_mask(level);
1231 
1232 		if (!dma_pte_present(pte))
1233 			goto next;
1234 
1235 		/* If range covers entire pagetable, free it */
1236 		if (start_pfn <= level_pfn &&
1237 		    last_pfn >= level_pfn + level_size(level) - 1) {
1238 			/* These subordinate page tables are going away entirely. Don't
1239 			   bother to clear them; we're just going to *free* them. */
1240 			if (level > 1 && !dma_pte_superpage(pte))
1241 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1242 
1243 			dma_clear_pte(pte);
1244 			if (!first_pte)
1245 				first_pte = pte;
1246 			last_pte = pte;
1247 		} else if (level > 1) {
1248 			/* Recurse down into a level that isn't *entirely* obsolete */
1249 			dma_pte_clear_level(domain, level - 1,
1250 					    phys_to_virt(dma_pte_addr(pte)),
1251 					    level_pfn, start_pfn, last_pfn,
1252 					    freelist);
1253 		}
1254 next:
1255 		pfn = level_pfn + level_size(level);
1256 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1257 
1258 	if (first_pte)
1259 		domain_flush_cache(domain, first_pte,
1260 				   (void *)++last_pte - (void *)first_pte);
1261 }
1262 
1263 /* We can't just free the pages because the IOMMU may still be walking
1264    the page tables, and may have cached the intermediate levels. The
1265    pages can only be freed after the IOTLB flush has been done. */
1266 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1267 			 unsigned long last_pfn, struct list_head *freelist)
1268 {
1269 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1270 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1271 	BUG_ON(start_pfn > last_pfn);
1272 
1273 	/* we don't need lock here; nobody else touches the iova range */
1274 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1275 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1276 
1277 	/* free pgd */
1278 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1279 		struct page *pgd_page = virt_to_page(domain->pgd);
1280 		list_add_tail(&pgd_page->lru, freelist);
1281 		domain->pgd = NULL;
1282 	}
1283 }
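
/*
 * Sketch of the intended calling pattern (illustrative only): the caller
 * passes a local freelist to domain_unmap(), performs the required IOTLB
 * flush (or otherwise guarantees the hardware no longer references the
 * tables), and only then releases the pages, e.g. with
 * put_pages_list(&freelist).
 */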
1284 
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1287 {
1288 	struct root_entry *root;
1289 	unsigned long flags;
1290 
1291 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1292 	if (!root) {
1293 		pr_err("Allocating root entry for %s failed\n",
1294 			iommu->name);
1295 		return -ENOMEM;
1296 	}
1297 
1298 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1299 
1300 	spin_lock_irqsave(&iommu->lock, flags);
1301 	iommu->root_entry = root;
1302 	spin_unlock_irqrestore(&iommu->lock, flags);
1303 
1304 	return 0;
1305 }
1306 
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1308 {
1309 	u64 addr;
1310 	u32 sts;
1311 	unsigned long flag;
1312 
1313 	addr = virt_to_phys(iommu->root_entry);
1314 	if (sm_supported(iommu))
1315 		addr |= DMA_RTADDR_SMT;
1316 
1317 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1319 
1320 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1321 
1322 	/* Make sure hardware complete it */
1323 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324 		      readl, (sts & DMA_GSTS_RTPS), sts);
1325 
1326 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 
1328 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1329 	if (sm_supported(iommu))
1330 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1331 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1332 }
1333 
1334 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1335 {
1336 	u32 val;
1337 	unsigned long flag;
1338 
1339 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1340 		return;
1341 
1342 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1343 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1344 
1345 	/* Make sure hardware complete it */
1346 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1347 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1348 
1349 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1350 }
1351 
1352 /* The return value determines whether we need a write buffer flush */
1353 static void __iommu_flush_context(struct intel_iommu *iommu,
1354 				  u16 did, u16 source_id, u8 function_mask,
1355 				  u64 type)
1356 {
1357 	u64 val = 0;
1358 	unsigned long flag;
1359 
1360 	switch (type) {
1361 	case DMA_CCMD_GLOBAL_INVL:
1362 		val = DMA_CCMD_GLOBAL_INVL;
1363 		break;
1364 	case DMA_CCMD_DOMAIN_INVL:
1365 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1366 		break;
1367 	case DMA_CCMD_DEVICE_INVL:
1368 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1369 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1370 		break;
1371 	default:
1372 		BUG();
1373 	}
1374 	val |= DMA_CCMD_ICC;
1375 
1376 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1377 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1378 
1379 	/* Make sure hardware complete it */
1380 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1381 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1382 
1383 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1384 }
1385 
1386 /* The return value determines whether we need a write buffer flush */
1387 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1388 				u64 addr, unsigned int size_order, u64 type)
1389 {
1390 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1391 	u64 val = 0, val_iva = 0;
1392 	unsigned long flag;
1393 
1394 	switch (type) {
1395 	case DMA_TLB_GLOBAL_FLUSH:
1396 		/* global flush doesn't need set IVA_REG */
1397 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1398 		break;
1399 	case DMA_TLB_DSI_FLUSH:
1400 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1401 		break;
1402 	case DMA_TLB_PSI_FLUSH:
1403 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1404 		/* IH bit is passed in as part of address */
1405 		val_iva = size_order | addr;
1406 		break;
1407 	default:
1408 		BUG();
1409 	}
1410 	/* Note: set drain read/write */
1411 #if 0
1412 	/*
1413 	 * This is probably meant to be extra safe. It looks like we can
1414 	 * ignore it without any impact.
1415 	 */
1416 	if (cap_read_drain(iommu->cap))
1417 		val |= DMA_TLB_READ_DRAIN;
1418 #endif
1419 	if (cap_write_drain(iommu->cap))
1420 		val |= DMA_TLB_WRITE_DRAIN;
1421 
1422 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 	/* Note: Only uses first TLB reg currently */
1424 	if (val_iva)
1425 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1426 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1427 
1428 	/* Make sure hardware complete it */
1429 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1430 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1431 
1432 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1433 
1434 	/* check IOTLB invalidation granularity */
1435 	if (DMA_TLB_IAIG(val) == 0)
1436 		pr_err("Flush IOTLB failed\n");
1437 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1438 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1439 			(unsigned long long)DMA_TLB_IIRG(type),
1440 			(unsigned long long)DMA_TLB_IAIG(val));
1441 }
1442 
1443 static struct device_domain_info *
1444 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1445 			 u8 bus, u8 devfn)
1446 {
1447 	struct device_domain_info *info;
1448 
1449 	assert_spin_locked(&device_domain_lock);
1450 
1451 	if (!iommu->qi)
1452 		return NULL;
1453 
1454 	list_for_each_entry(info, &domain->devices, link)
1455 		if (info->iommu == iommu && info->bus == bus &&
1456 		    info->devfn == devfn) {
1457 			if (info->ats_supported && info->dev)
1458 				return info;
1459 			break;
1460 		}
1461 
1462 	return NULL;
1463 }
1464 
1465 static void domain_update_iotlb(struct dmar_domain *domain)
1466 {
1467 	struct device_domain_info *info;
1468 	bool has_iotlb_device = false;
1469 
1470 	assert_spin_locked(&device_domain_lock);
1471 
1472 	list_for_each_entry(info, &domain->devices, link)
1473 		if (info->ats_enabled) {
1474 			has_iotlb_device = true;
1475 			break;
1476 		}
1477 
1478 	domain->has_iotlb_device = has_iotlb_device;
1479 }
1480 
1481 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1482 {
1483 	struct pci_dev *pdev;
1484 
1485 	assert_spin_locked(&device_domain_lock);
1486 
1487 	if (!info || !dev_is_pci(info->dev))
1488 		return;
1489 
1490 	pdev = to_pci_dev(info->dev);
1491 	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1492 	 * the PFSID to the invalidation descriptor of a VF so that IOMMU HW
1493 	 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1494 	 * treated as reserved, which should be set to 0.
1495 	 */
1496 	if (!ecap_dit(info->iommu->ecap))
1497 		info->pfsid = 0;
1498 	else {
1499 		struct pci_dev *pf_pdev;
1500 
1501 		/* pdev will be returned if the device is not a VF */
1502 		pf_pdev = pci_physfn(pdev);
1503 		info->pfsid = pci_dev_id(pf_pdev);
1504 	}
1505 
1506 #ifdef CONFIG_INTEL_IOMMU_SVM
1507 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1508 	   the device if you enable PASID support after ATS support is
1509 	   undefined. So always enable PASID support on devices which
1510 	   have it, even if we can't yet know if we're ever going to
1511 	   use it. */
1512 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513 		info->pasid_enabled = 1;
1514 
1515 	if (info->pri_supported &&
1516 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1517 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1518 		info->pri_enabled = 1;
1519 #endif
1520 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1521 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1522 		info->ats_enabled = 1;
1523 		domain_update_iotlb(info->domain);
1524 		info->ats_qdep = pci_ats_queue_depth(pdev);
1525 	}
1526 }
1527 
1528 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1529 {
1530 	struct pci_dev *pdev;
1531 
1532 	assert_spin_locked(&device_domain_lock);
1533 
1534 	if (!dev_is_pci(info->dev))
1535 		return;
1536 
1537 	pdev = to_pci_dev(info->dev);
1538 
1539 	if (info->ats_enabled) {
1540 		pci_disable_ats(pdev);
1541 		info->ats_enabled = 0;
1542 		domain_update_iotlb(info->domain);
1543 	}
1544 #ifdef CONFIG_INTEL_IOMMU_SVM
1545 	if (info->pri_enabled) {
1546 		pci_disable_pri(pdev);
1547 		info->pri_enabled = 0;
1548 	}
1549 	if (info->pasid_enabled) {
1550 		pci_disable_pasid(pdev);
1551 		info->pasid_enabled = 0;
1552 	}
1553 #endif
1554 }
1555 
1556 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1557 				    u64 addr, unsigned int mask)
1558 {
1559 	u16 sid, qdep;
1560 
1561 	if (!info || !info->ats_enabled)
1562 		return;
1563 
1564 	sid = info->bus << 8 | info->devfn;
1565 	qdep = info->ats_qdep;
1566 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1567 			   qdep, addr, mask);
1568 }
1569 
1570 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1571 				  u64 addr, unsigned mask)
1572 {
1573 	unsigned long flags;
1574 	struct device_domain_info *info;
1575 
1576 	if (!domain->has_iotlb_device)
1577 		return;
1578 
1579 	spin_lock_irqsave(&device_domain_lock, flags);
1580 	list_for_each_entry(info, &domain->devices, link)
1581 		__iommu_flush_dev_iotlb(info, addr, mask);
1582 
1583 	spin_unlock_irqrestore(&device_domain_lock, flags);
1584 }
1585 
1586 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1587 				  struct dmar_domain *domain,
1588 				  unsigned long pfn, unsigned int pages,
1589 				  int ih, int map)
1590 {
1591 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1592 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1593 	u16 did = domain->iommu_did[iommu->seq_id];
1594 
1595 	BUG_ON(pages == 0);
1596 
1597 	if (ih)
1598 		ih = 1 << 6;
1599 
1600 	if (domain_use_first_level(domain)) {
1601 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1602 	} else {
1603 		/*
1604 		 * Fallback to domain selective flush if no PSI support or
1605 		 * the size is too big. PSI requires page size to be 2 ^ x,
1606 		 * and the base address is naturally aligned to the size.
1607 		 */
1608 		if (!cap_pgsel_inv(iommu->cap) ||
1609 		    mask > cap_max_amask_val(iommu->cap))
1610 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1611 							DMA_TLB_DSI_FLUSH);
1612 		else
1613 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1614 							DMA_TLB_PSI_FLUSH);
1615 	}
1616 
1617 	/*
1618 	 * In caching mode, non-present to present changes require a flush.
1619 	 * However, the device IOTLB doesn't need to be flushed in this case.
1620 	 */
1621 	if (!cap_caching_mode(iommu->cap) || !map)
1622 		iommu_flush_dev_iotlb(domain, addr, mask);
1623 }
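
/*
 * Example (illustrative only): a request to flush 9 pages rounds up to
 * the next power of two, so mask == 4 and a single 16-page (64KiB with
 * 4KiB pages) PSI invalidation is issued, provided the hardware supports
 * page-selective invalidation and mask does not exceed
 * cap_max_amask_val(); otherwise a domain-selective flush is used.
 */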
1624 
1625 /* Notification for newly created mappings */
1626 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1627 					struct dmar_domain *domain,
1628 					unsigned long pfn, unsigned int pages)
1629 {
1630 	/*
1631 	 * It's a non-present to present mapping. Only flush if caching mode
1632 	 * and second level.
1633 	 */
1634 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1635 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1636 	else
1637 		iommu_flush_write_buffer(iommu);
1638 }
1639 
1640 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1641 {
1642 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1643 	int idx;
1644 
1645 	for_each_domain_iommu(idx, dmar_domain) {
1646 		struct intel_iommu *iommu = g_iommus[idx];
1647 		u16 did = dmar_domain->iommu_did[iommu->seq_id];
1648 
1649 		if (domain_use_first_level(dmar_domain))
1650 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1651 		else
1652 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1653 						 DMA_TLB_DSI_FLUSH);
1654 
1655 		if (!cap_caching_mode(iommu->cap))
1656 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1657 	}
1658 }
1659 
1660 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1661 {
1662 	u32 pmen;
1663 	unsigned long flags;
1664 
1665 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1666 		return;
1667 
1668 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1670 	pmen &= ~DMA_PMEN_EPM;
1671 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1672 
1673 	/* wait for the protected region status bit to clear */
1674 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1675 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1676 
1677 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1678 }
1679 
1680 static void iommu_enable_translation(struct intel_iommu *iommu)
1681 {
1682 	u32 sts;
1683 	unsigned long flags;
1684 
1685 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1686 	iommu->gcmd |= DMA_GCMD_TE;
1687 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1688 
1689 	/* Make sure hardware complete it */
1690 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1691 		      readl, (sts & DMA_GSTS_TES), sts);
1692 
1693 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1694 }
1695 
1696 static void iommu_disable_translation(struct intel_iommu *iommu)
1697 {
1698 	u32 sts;
1699 	unsigned long flag;
1700 
1701 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1702 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1703 		return;
1704 
1705 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1706 	iommu->gcmd &= ~DMA_GCMD_TE;
1707 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1708 
1709 	/* Make sure hardware complete it */
1710 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1711 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1712 
1713 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1714 }
1715 
1716 static int iommu_init_domains(struct intel_iommu *iommu)
1717 {
1718 	u32 ndomains;
1719 
1720 	ndomains = cap_ndoms(iommu->cap);
1721 	pr_debug("%s: Number of Domains supported <%d>\n",
1722 		 iommu->name, ndomains);
1723 
1724 	spin_lock_init(&iommu->lock);
1725 
1726 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1727 	if (!iommu->domain_ids)
1728 		return -ENOMEM;
1729 
1730 	/*
1731 	 * If Caching mode is set, then invalid translations are tagged
1732 	 * with domain-id 0, hence we need to pre-allocate it. We also
1733 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1734 	 * make sure it is not used for a real domain.
1735 	 */
1736 	set_bit(0, iommu->domain_ids);
1737 
1738 	/*
1739 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1740 	 * entry for first-level or pass-through translation modes should
1741 	 * be programmed with a domain id different from those used for
1742 	 * second-level or nested translation. We reserve a domain id for
1743 	 * this purpose.
1744 	 */
1745 	if (sm_supported(iommu))
1746 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1747 
1748 	return 0;
1749 }
1750 
1751 static void disable_dmar_iommu(struct intel_iommu *iommu)
1752 {
1753 	struct device_domain_info *info, *tmp;
1754 	unsigned long flags;
1755 
1756 	if (!iommu->domain_ids)
1757 		return;
1758 
1759 	spin_lock_irqsave(&device_domain_lock, flags);
1760 	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1761 		if (info->iommu != iommu)
1762 			continue;
1763 
1764 		if (!info->dev || !info->domain)
1765 			continue;
1766 
1767 		__dmar_remove_one_dev_info(info);
1768 	}
1769 	spin_unlock_irqrestore(&device_domain_lock, flags);
1770 
1771 	if (iommu->gcmd & DMA_GCMD_TE)
1772 		iommu_disable_translation(iommu);
1773 }
1774 
1775 static void free_dmar_iommu(struct intel_iommu *iommu)
1776 {
1777 	if (iommu->domain_ids) {
1778 		bitmap_free(iommu->domain_ids);
1779 		iommu->domain_ids = NULL;
1780 	}
1781 
1782 	g_iommus[iommu->seq_id] = NULL;
1783 
1784 	/* free context mapping */
1785 	free_context_table(iommu);
1786 
1787 #ifdef CONFIG_INTEL_IOMMU_SVM
1788 	if (pasid_supported(iommu)) {
1789 		if (ecap_prs(iommu->ecap))
1790 			intel_svm_finish_prq(iommu);
1791 	}
1792 	if (vccap_pasid(iommu->vccap))
1793 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1794 
1795 #endif
1796 }
1797 
1798 /*
1799  * Check and return whether first level is used by default for
1800  * DMA translation.
1801  */
1802 static bool first_level_by_default(unsigned int type)
1803 {
1804 	/* Only SL is available in legacy mode */
1805 	if (!scalable_mode_support())
1806 		return false;
1807 
1808 	/* Only one level (either FL or SL) is available; just use it */
1809 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1810 		return intel_cap_flts_sanity();
1811 
1812 	/* Both levels are available, decide it based on domain type */
1813 	return type != IOMMU_DOMAIN_UNMANAGED;
1814 }
1815 
1816 static struct dmar_domain *alloc_domain(unsigned int type)
1817 {
1818 	struct dmar_domain *domain;
1819 
1820 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1821 	if (!domain)
1822 		return NULL;
1823 
1824 	domain->nid = NUMA_NO_NODE;
1825 	if (first_level_by_default(type))
1826 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1827 	domain->has_iotlb_device = false;
1828 	INIT_LIST_HEAD(&domain->devices);
1829 
1830 	return domain;
1831 }
1832 
1833 /* Must be called with iommu->lock */
1834 static int domain_attach_iommu(struct dmar_domain *domain,
1835 			       struct intel_iommu *iommu)
1836 {
1837 	unsigned long ndomains;
1838 	int num;
1839 
1840 	assert_spin_locked(&device_domain_lock);
1841 	assert_spin_locked(&iommu->lock);
1842 
1843 	domain->iommu_refcnt[iommu->seq_id] += 1;
1844 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845 		ndomains = cap_ndoms(iommu->cap);
1846 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1847 
1848 		if (num >= ndomains) {
1849 			pr_err("%s: No free domain ids\n", iommu->name);
1850 			domain->iommu_refcnt[iommu->seq_id] -= 1;
1851 			return -ENOSPC;
1852 		}
1853 
1854 		set_bit(num, iommu->domain_ids);
1855 		domain->iommu_did[iommu->seq_id] = num;
1856 		domain->nid			 = iommu->node;
1857 		domain_update_iommu_cap(domain);
1858 	}
1859 
1860 	return 0;
1861 }
1862 
1863 static void domain_detach_iommu(struct dmar_domain *domain,
1864 				struct intel_iommu *iommu)
1865 {
1866 	int num;
1867 
1868 	assert_spin_locked(&device_domain_lock);
1869 	assert_spin_locked(&iommu->lock);
1870 
1871 	domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1873 		num = domain->iommu_did[iommu->seq_id];
1874 		clear_bit(num, iommu->domain_ids);
1875 		domain_update_iommu_cap(domain);
1876 		domain->iommu_did[iommu->seq_id] = 0;
1877 	}
1878 }
1879 
1880 static inline int guestwidth_to_adjustwidth(int gaw)
1881 {
1882 	int agaw;
1883 	int r = (gaw - 12) % 9;
1884 
1885 	if (r == 0)
1886 		agaw = gaw;
1887 	else
1888 		agaw = gaw + 9 - r;
1889 	if (agaw > 64)
1890 		agaw = 64;
1891 	return agaw;
1892 }
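
/*
 * Worked example for guestwidth_to_adjustwidth() above: a guest width of
 * 48 bits is already 12 + 4 * 9, so it is returned unchanged; a guest
 * width of 40 bits gives r = (40 - 12) % 9 = 1 and is rounded up to
 * 40 + 9 - 1 = 48. Results are capped at 64 bits.
 */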
1893 
1894 static void domain_exit(struct dmar_domain *domain)
1895 {
1896 
1897 	/* Remove associated devices and clear attached or cached domains */
1898 	domain_remove_dev_info(domain);
1899 
1900 	if (domain->pgd) {
1901 		LIST_HEAD(freelist);
1902 
1903 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1904 		put_pages_list(&freelist);
1905 	}
1906 
1907 	kfree(domain);
1908 }
1909 
1910 /*
1911  * Get the PASID directory size for scalable mode context entry.
1912  * A value of X in the PDTS field of a scalable mode context entry
1913  * indicates a PASID directory with 2^(X + 7) entries.
1914  */
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1916 {
1917 	unsigned long pds, max_pde;
1918 
1919 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1921 	if (pds < 7)
1922 		return 0;
1923 
1924 	return pds - 7;
1925 }
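
/*
 * Worked example, assuming PASID_PDE_SHIFT is 6 (64 PASID-table entries
 * per directory entry): with table->max_pasid == 1 << 20, max_pde is
 * 1 << 14, find_first_bit() sets pds to 14, and the function returns
 * 14 - 7 = 7, i.e. a PDTS value encoding a directory of
 * 2^(7 + 7) = 16384 entries.
 */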
1926 
1927 /*
1928  * Set the RID_PASID field of a scalable mode context entry. The
1929  * IOMMU hardware will use the PASID value set in this field for
1930  * DMA translations of DMA requests without PASID.
1931  */
1932 static inline void
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1934 {
1935 	context->hi |= pasid & ((1 << 20) - 1);
1936 }
1937 
1938 /*
1939  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1940  * entry.
1941  */
1942 static inline void context_set_sm_dte(struct context_entry *context)
1943 {
1944 	context->lo |= (1 << 2);
1945 }
1946 
1947 /*
1948  * Set the PRE(Page Request Enable) field of a scalable mode context
1949  * entry.
1950  */
1951 static inline void context_set_sm_pre(struct context_entry *context)
1952 {
1953 	context->lo |= (1 << 4);
1954 }
1955 
1956 /* Convert value to context PASID directory size field coding. */
1957 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1958 
1959 static int domain_context_mapping_one(struct dmar_domain *domain,
1960 				      struct intel_iommu *iommu,
1961 				      struct pasid_table *table,
1962 				      u8 bus, u8 devfn)
1963 {
1964 	u16 did = domain->iommu_did[iommu->seq_id];
1965 	int translation = CONTEXT_TT_MULTI_LEVEL;
1966 	struct device_domain_info *info = NULL;
1967 	struct context_entry *context;
1968 	unsigned long flags;
1969 	int ret;
1970 
1971 	WARN_ON(did == 0);
1972 
1973 	if (hw_pass_through && domain_type_is_si(domain))
1974 		translation = CONTEXT_TT_PASS_THROUGH;
1975 
1976 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1977 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1978 
1979 	BUG_ON(!domain->pgd);
1980 
1981 	spin_lock_irqsave(&device_domain_lock, flags);
1982 	spin_lock(&iommu->lock);
1983 
1984 	ret = -ENOMEM;
1985 	context = iommu_context_addr(iommu, bus, devfn, 1);
1986 	if (!context)
1987 		goto out_unlock;
1988 
1989 	ret = 0;
1990 	if (context_present(context))
1991 		goto out_unlock;
1992 
1993 	/*
1994 	 * For kdump cases, old valid entries may be cached due to the
1995 	 * in-flight DMA and copied pgtable, but there is no unmapping
1996 	 * behaviour for them, thus we need an explicit cache flush for
1997 	 * the newly-mapped device. For kdump, at this point, the device
1998 	 * is supposed to have finished reset at its driver probe stage, so
1999 	 * no in-flight DMA will exist, and we don't need to worry about it
2000 	 * hereafter.
2001 	 */
2002 	if (context_copied(context)) {
2003 		u16 did_old = context_domain_id(context);
2004 
2005 		if (did_old < cap_ndoms(iommu->cap)) {
2006 			iommu->flush.flush_context(iommu, did_old,
2007 						   (((u16)bus) << 8) | devfn,
2008 						   DMA_CCMD_MASK_NOBIT,
2009 						   DMA_CCMD_DEVICE_INVL);
2010 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2011 						 DMA_TLB_DSI_FLUSH);
2012 		}
2013 	}
2014 
2015 	context_clear_entry(context);
2016 
2017 	if (sm_supported(iommu)) {
2018 		unsigned long pds;
2019 
2020 		WARN_ON(!table);
2021 
2022 		/* Setup the PASID DIR pointer: */
2023 		pds = context_get_sm_pds(table);
2024 		context->lo = (u64)virt_to_phys(table->table) |
2025 				context_pdts(pds);
2026 
2027 		/* Setup the RID_PASID field: */
2028 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2029 
2030 		/*
2031 		 * Setup the Device-TLB enable bit and Page request
2032 		 * Enable bit:
2033 		 */
2034 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2035 		if (info && info->ats_supported)
2036 			context_set_sm_dte(context);
2037 		if (info && info->pri_supported)
2038 			context_set_sm_pre(context);
2039 	} else {
2040 		struct dma_pte *pgd = domain->pgd;
2041 		int agaw;
2042 
2043 		context_set_domain_id(context, did);
2044 
2045 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2046 			/*
2047 			 * Skip top levels of page tables for an iommu which has
2048 			 * a smaller agaw than the default. Unnecessary for PT mode.
2049 			 */
2050 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2051 				ret = -ENOMEM;
2052 				pgd = phys_to_virt(dma_pte_addr(pgd));
2053 				if (!dma_pte_present(pgd))
2054 					goto out_unlock;
2055 			}
2056 
2057 			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2058 			if (info && info->ats_supported)
2059 				translation = CONTEXT_TT_DEV_IOTLB;
2060 			else
2061 				translation = CONTEXT_TT_MULTI_LEVEL;
2062 
2063 			context_set_address_root(context, virt_to_phys(pgd));
2064 			context_set_address_width(context, agaw);
2065 		} else {
2066 			/*
2067 			 * In pass through mode, AW must be programmed to
2068 			 * indicate the largest AGAW value supported by
2069 			 * hardware. And ASR is ignored by hardware.
2070 			 */
2071 			context_set_address_width(context, iommu->msagaw);
2072 		}
2073 
2074 		context_set_translation_type(context, translation);
2075 	}
2076 
2077 	context_set_fault_enable(context);
2078 	context_set_present(context);
2079 	if (!ecap_coherent(iommu->ecap))
2080 		clflush_cache_range(context, sizeof(*context));
2081 
2082 	/*
2083 	 * It's a non-present to present mapping. If hardware doesn't cache
2084 	 * non-present entries we only need to flush the write-buffer. If it
2085 	 * _does_ cache non-present entries, then it does so in the special
2086 	 * domain #0, which we have to flush:
2087 	 */
2088 	if (cap_caching_mode(iommu->cap)) {
2089 		iommu->flush.flush_context(iommu, 0,
2090 					   (((u16)bus) << 8) | devfn,
2091 					   DMA_CCMD_MASK_NOBIT,
2092 					   DMA_CCMD_DEVICE_INVL);
2093 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2094 	} else {
2095 		iommu_flush_write_buffer(iommu);
2096 	}
2097 	iommu_enable_dev_iotlb(info);
2098 
2099 	ret = 0;
2100 
2101 out_unlock:
2102 	spin_unlock(&iommu->lock);
2103 	spin_unlock_irqrestore(&device_domain_lock, flags);
2104 
2105 	return ret;
2106 }
2107 
2108 struct domain_context_mapping_data {
2109 	struct dmar_domain *domain;
2110 	struct intel_iommu *iommu;
2111 	struct pasid_table *table;
2112 };
2113 
2114 static int domain_context_mapping_cb(struct pci_dev *pdev,
2115 				     u16 alias, void *opaque)
2116 {
2117 	struct domain_context_mapping_data *data = opaque;
2118 
2119 	return domain_context_mapping_one(data->domain, data->iommu,
2120 					  data->table, PCI_BUS_NUM(alias),
2121 					  alias & 0xff);
2122 }
2123 
2124 static int
2125 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2126 {
2127 	struct domain_context_mapping_data data;
2128 	struct pasid_table *table;
2129 	struct intel_iommu *iommu;
2130 	u8 bus, devfn;
2131 
2132 	iommu = device_to_iommu(dev, &bus, &devfn);
2133 	if (!iommu)
2134 		return -ENODEV;
2135 
2136 	table = intel_pasid_get_table(dev);
2137 
2138 	if (!dev_is_pci(dev))
2139 		return domain_context_mapping_one(domain, iommu, table,
2140 						  bus, devfn);
2141 
2142 	data.domain = domain;
2143 	data.iommu = iommu;
2144 	data.table = table;
2145 
2146 	return pci_for_each_dma_alias(to_pci_dev(dev),
2147 				      &domain_context_mapping_cb, &data);
2148 }
2149 
2150 static int domain_context_mapped_cb(struct pci_dev *pdev,
2151 				    u16 alias, void *opaque)
2152 {
2153 	struct intel_iommu *iommu = opaque;
2154 
2155 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2156 }
2157 
2158 static int domain_context_mapped(struct device *dev)
2159 {
2160 	struct intel_iommu *iommu;
2161 	u8 bus, devfn;
2162 
2163 	iommu = device_to_iommu(dev, &bus, &devfn);
2164 	if (!iommu)
2165 		return -ENODEV;
2166 
2167 	if (!dev_is_pci(dev))
2168 		return device_context_mapped(iommu, bus, devfn);
2169 
2170 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2171 				       domain_context_mapped_cb, iommu);
2172 }
2173 
2174 /* Returns a number of VTD pages, but aligned to MM page size */
2175 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2176 					    size_t size)
2177 {
2178 	host_addr &= ~PAGE_MASK;
2179 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2180 }
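
/*
 * For example, assuming 4KiB MM pages and VTD_PAGE_SHIFT == 12: a buffer
 * starting at host_addr 0x1234 with size 0x2000 has a page offset of
 * 0x234, so PAGE_ALIGN(0x234 + 0x2000) = 0x3000 bytes of page-aligned
 * space, i.e. 3 VT-d pages.
 */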
2181 
2182 /* Return largest possible superpage level for a given mapping */
2183 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2184 					  unsigned long iov_pfn,
2185 					  unsigned long phy_pfn,
2186 					  unsigned long pages)
2187 {
2188 	int support, level = 1;
2189 	unsigned long pfnmerge;
2190 
2191 	support = domain->iommu_superpage;
2192 
2193 	/* To use a large page, the virtual *and* physical addresses
2194 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2195 	   of them will mean we have to use smaller pages. So just
2196 	   merge them and check both at once. */
2197 	pfnmerge = iov_pfn | phy_pfn;
2198 
2199 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2200 		pages >>= VTD_STRIDE_SHIFT;
2201 		if (!pages)
2202 			break;
2203 		pfnmerge >>= VTD_STRIDE_SHIFT;
2204 		level++;
2205 		support--;
2206 	}
2207 	return level;
2208 }
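
/*
 * For example, assuming VTD_STRIDE_SHIFT == 9: with both iov_pfn and
 * phy_pfn 2MiB aligned (low 9 pfn bits clear), at least 512 pages to map
 * and domain->iommu_superpage >= 1, the loop above returns level 2, so
 * the caller can use a single 2MiB superpage instead of 512 4KiB PTEs.
 */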
2209 
2210 /*
2211  * Ensure that old small page tables are removed to make room for superpage(s).
2212  * We're going to add new large pages, so make sure we don't remove their parent
2213  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2214  */
2215 static void switch_to_super_page(struct dmar_domain *domain,
2216 				 unsigned long start_pfn,
2217 				 unsigned long end_pfn, int level)
2218 {
2219 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2220 	struct dma_pte *pte = NULL;
2221 	int i;
2222 
2223 	while (start_pfn <= end_pfn) {
2224 		if (!pte)
2225 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2226 
2227 		if (dma_pte_present(pte)) {
2228 			dma_pte_free_pagetable(domain, start_pfn,
2229 					       start_pfn + lvl_pages - 1,
2230 					       level + 1);
2231 
2232 			for_each_domain_iommu(i, domain)
2233 				iommu_flush_iotlb_psi(g_iommus[i], domain,
2234 						      start_pfn, lvl_pages,
2235 						      0, 0);
2236 		}
2237 
2238 		pte++;
2239 		start_pfn += lvl_pages;
2240 		if (first_pte_in_page(pte))
2241 			pte = NULL;
2242 	}
2243 }
2244 
2245 static int
2246 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2247 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2248 {
2249 	struct dma_pte *first_pte = NULL, *pte = NULL;
2250 	unsigned int largepage_lvl = 0;
2251 	unsigned long lvl_pages = 0;
2252 	phys_addr_t pteval;
2253 	u64 attr;
2254 
2255 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2256 
2257 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2258 		return -EINVAL;
2259 
2260 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2261 	attr |= DMA_FL_PTE_PRESENT;
2262 	if (domain_use_first_level(domain)) {
2263 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2264 		if (prot & DMA_PTE_WRITE)
2265 			attr |= DMA_FL_PTE_DIRTY;
2266 	}
2267 
2268 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2269 
2270 	while (nr_pages > 0) {
2271 		uint64_t tmp;
2272 
2273 		if (!pte) {
2274 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2275 					phys_pfn, nr_pages);
2276 
2277 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2278 			if (!pte)
2279 				return -ENOMEM;
2280 			first_pte = pte;
2281 
2282 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2283 
2284 			/* It is a large page */
2285 			if (largepage_lvl > 1) {
2286 				unsigned long end_pfn;
2287 				unsigned long pages_to_remove;
2288 
2289 				pteval |= DMA_PTE_LARGE_PAGE;
2290 				pages_to_remove = min_t(unsigned long, nr_pages,
2291 							nr_pte_to_next_page(pte) * lvl_pages);
2292 				end_pfn = iov_pfn + pages_to_remove - 1;
2293 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2294 			} else {
2295 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2296 			}
2297 
2298 		}
2299 		/* We don't need a lock here; nobody else
2300 		 * touches the iova range.
2301 		 */
2302 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2303 		if (tmp) {
2304 			static int dumps = 5;
2305 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2306 				iov_pfn, tmp, (unsigned long long)pteval);
2307 			if (dumps) {
2308 				dumps--;
2309 				debug_dma_dump_mappings(NULL);
2310 			}
2311 			WARN_ON(1);
2312 		}
2313 
2314 		nr_pages -= lvl_pages;
2315 		iov_pfn += lvl_pages;
2316 		phys_pfn += lvl_pages;
2317 		pteval += lvl_pages * VTD_PAGE_SIZE;
2318 
2319 		/* If the next PTE would be the first in a new page, then we
2320 		 * need to flush the cache on the entries we've just written.
2321 		 * And then we'll need to recalculate 'pte', so clear it and
2322 		 * let it get set again in the if (!pte) block above.
2323 		 *
2324 		 * If we're done (!nr_pages) we need to flush the cache too.
2325 		 *
2326 		 * Also if we've been setting superpages, we may need to
2327 		 * recalculate 'pte' and switch back to smaller pages for the
2328 		 * end of the mapping, if the trailing size is not enough to
2329 		 * use another superpage (i.e. nr_pages < lvl_pages).
2330 		 */
2331 		pte++;
2332 		if (!nr_pages || first_pte_in_page(pte) ||
2333 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2334 			domain_flush_cache(domain, first_pte,
2335 					   (void *)pte - (void *)first_pte);
2336 			pte = NULL;
2337 		}
2338 	}
2339 
2340 	return 0;
2341 }
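
/*
 * For example, mapping a 4MiB range whose IOVA and physical PFNs are both
 * 2MiB aligned on a domain with superpage support: the loop above installs
 * two level-2 (2MiB) PTEs, calling switch_to_super_page() to tear down any
 * stale 4KiB page tables under the range first, and domain_flush_cache()
 * writes back the dirtied PTEs where the hardware is not coherent.
 */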
2342 
2343 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2344 {
2345 	struct intel_iommu *iommu = info->iommu;
2346 	struct context_entry *context;
2347 	unsigned long flags;
2348 	u16 did_old;
2349 
2350 	if (!iommu)
2351 		return;
2352 
2353 	spin_lock_irqsave(&iommu->lock, flags);
2354 	context = iommu_context_addr(iommu, bus, devfn, 0);
2355 	if (!context) {
2356 		spin_unlock_irqrestore(&iommu->lock, flags);
2357 		return;
2358 	}
2359 
2360 	if (sm_supported(iommu)) {
2361 		if (hw_pass_through && domain_type_is_si(info->domain))
2362 			did_old = FLPT_DEFAULT_DID;
2363 		else
2364 			did_old = info->domain->iommu_did[iommu->seq_id];
2365 	} else {
2366 		did_old = context_domain_id(context);
2367 	}
2368 
2369 	context_clear_entry(context);
2370 	__iommu_flush_cache(iommu, context, sizeof(*context));
2371 	spin_unlock_irqrestore(&iommu->lock, flags);
2372 	iommu->flush.flush_context(iommu,
2373 				   did_old,
2374 				   (((u16)bus) << 8) | devfn,
2375 				   DMA_CCMD_MASK_NOBIT,
2376 				   DMA_CCMD_DEVICE_INVL);
2377 
2378 	if (sm_supported(iommu))
2379 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2380 
2381 	iommu->flush.flush_iotlb(iommu,
2382 				 did_old,
2383 				 0,
2384 				 0,
2385 				 DMA_TLB_DSI_FLUSH);
2386 
2387 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2388 }
2389 
2390 static void domain_remove_dev_info(struct dmar_domain *domain)
2391 {
2392 	struct device_domain_info *info, *tmp;
2393 	unsigned long flags;
2394 
2395 	spin_lock_irqsave(&device_domain_lock, flags);
2396 	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2397 		__dmar_remove_one_dev_info(info);
2398 	spin_unlock_irqrestore(&device_domain_lock, flags);
2399 }
2400 
2401 static inline struct device_domain_info *
2402 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2403 {
2404 	struct device_domain_info *info;
2405 
2406 	list_for_each_entry(info, &device_domain_list, global)
2407 		if (info->segment == segment && info->bus == bus &&
2408 		    info->devfn == devfn)
2409 			return info;
2410 
2411 	return NULL;
2412 }
2413 
2414 static int domain_setup_first_level(struct intel_iommu *iommu,
2415 				    struct dmar_domain *domain,
2416 				    struct device *dev,
2417 				    u32 pasid)
2418 {
2419 	struct dma_pte *pgd = domain->pgd;
2420 	int agaw, level;
2421 	int flags = 0;
2422 
2423 	/*
2424 	 * Skip top levels of page tables for an iommu which has
2425 	 * a smaller agaw than the default. Unnecessary for PT mode.
2426 	 */
2427 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2428 		pgd = phys_to_virt(dma_pte_addr(pgd));
2429 		if (!dma_pte_present(pgd))
2430 			return -ENOMEM;
2431 	}
2432 
2433 	level = agaw_to_level(agaw);
2434 	if (level != 4 && level != 5)
2435 		return -EINVAL;
2436 
2437 	if (pasid != PASID_RID2PASID)
2438 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2439 	if (level == 5)
2440 		flags |= PASID_FLAG_FL5LP;
2441 
2442 	if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2443 		flags |= PASID_FLAG_PAGE_SNOOP;
2444 
2445 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2446 					     domain->iommu_did[iommu->seq_id],
2447 					     flags);
2448 }
2449 
2450 static bool dev_is_real_dma_subdevice(struct device *dev)
2451 {
2452 	return dev && dev_is_pci(dev) &&
2453 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2454 }
2455 
2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2457 						    int bus, int devfn,
2458 						    struct device *dev,
2459 						    struct dmar_domain *domain)
2460 {
2461 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2462 	unsigned long flags;
2463 	int ret;
2464 
2465 	spin_lock_irqsave(&device_domain_lock, flags);
2466 	info->domain = domain;
2467 	spin_lock(&iommu->lock);
2468 	ret = domain_attach_iommu(domain, iommu);
2469 	spin_unlock(&iommu->lock);
2470 	if (ret) {
2471 		spin_unlock_irqrestore(&device_domain_lock, flags);
2472 		return NULL;
2473 	}
2474 	list_add(&info->link, &domain->devices);
2475 	spin_unlock_irqrestore(&device_domain_lock, flags);
2476 
2477 	/* PASID table is mandatory for a PCI device in scalable mode. */
2478 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2479 		ret = intel_pasid_alloc_table(dev);
2480 		if (ret) {
2481 			dev_err(dev, "PASID table allocation failed\n");
2482 			dmar_remove_one_dev_info(dev);
2483 			return NULL;
2484 		}
2485 
2486 		/* Setup the PASID entry for requests without PASID: */
2487 		spin_lock_irqsave(&iommu->lock, flags);
2488 		if (hw_pass_through && domain_type_is_si(domain))
2489 			ret = intel_pasid_setup_pass_through(iommu, domain,
2490 					dev, PASID_RID2PASID);
2491 		else if (domain_use_first_level(domain))
2492 			ret = domain_setup_first_level(iommu, domain, dev,
2493 					PASID_RID2PASID);
2494 		else
2495 			ret = intel_pasid_setup_second_level(iommu, domain,
2496 					dev, PASID_RID2PASID);
2497 		spin_unlock_irqrestore(&iommu->lock, flags);
2498 		if (ret) {
2499 			dev_err(dev, "Setup RID2PASID failed\n");
2500 			dmar_remove_one_dev_info(dev);
2501 			return NULL;
2502 		}
2503 	}
2504 
2505 	if (dev && domain_context_mapping(domain, dev)) {
2506 		dev_err(dev, "Domain context map failed\n");
2507 		dmar_remove_one_dev_info(dev);
2508 		return NULL;
2509 	}
2510 
2511 	return domain;
2512 }
2513 
2514 static int iommu_domain_identity_map(struct dmar_domain *domain,
2515 				     unsigned long first_vpfn,
2516 				     unsigned long last_vpfn)
2517 {
2518 	/*
2519 	 * The RMRR range might overlap with the physical memory range,
2520 	 * so clear it first.
2521 	 */
2522 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2523 
2524 	return __domain_mapping(domain, first_vpfn,
2525 				first_vpfn, last_vpfn - first_vpfn + 1,
2526 				DMA_PTE_READ|DMA_PTE_WRITE);
2527 }
2528 
2529 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2530 
2531 static int __init si_domain_init(int hw)
2532 {
2533 	struct dmar_rmrr_unit *rmrr;
2534 	struct device *dev;
2535 	int i, nid, ret;
2536 
2537 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2538 	if (!si_domain)
2539 		return -EFAULT;
2540 
2541 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2542 		domain_exit(si_domain);
2543 		return -EFAULT;
2544 	}
2545 
2546 	if (hw)
2547 		return 0;
2548 
2549 	for_each_online_node(nid) {
2550 		unsigned long start_pfn, end_pfn;
2551 		int i;
2552 
2553 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2554 			ret = iommu_domain_identity_map(si_domain,
2555 					mm_to_dma_pfn(start_pfn),
2556 					mm_to_dma_pfn(end_pfn));
2557 			if (ret)
2558 				return ret;
2559 		}
2560 	}
2561 
2562 	/*
2563 	 * Identity map the RMRRs so that devices with RMRRs could also use
2564 	 * the si_domain.
2565 	 */
2566 	for_each_rmrr_units(rmrr) {
2567 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2568 					  i, dev) {
2569 			unsigned long long start = rmrr->base_address;
2570 			unsigned long long end = rmrr->end_address;
2571 
2572 			if (WARN_ON(end < start ||
2573 				    end >> agaw_to_width(si_domain->agaw)))
2574 				continue;
2575 
2576 			ret = iommu_domain_identity_map(si_domain,
2577 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2578 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2579 			if (ret)
2580 				return ret;
2581 		}
2582 	}
2583 
2584 	return 0;
2585 }
2586 
2587 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2588 {
2589 	struct dmar_domain *ndomain;
2590 	struct intel_iommu *iommu;
2591 	u8 bus, devfn;
2592 
2593 	iommu = device_to_iommu(dev, &bus, &devfn);
2594 	if (!iommu)
2595 		return -ENODEV;
2596 
2597 	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2598 	if (ndomain != domain)
2599 		return -EBUSY;
2600 
2601 	return 0;
2602 }
2603 
2604 static bool device_has_rmrr(struct device *dev)
2605 {
2606 	struct dmar_rmrr_unit *rmrr;
2607 	struct device *tmp;
2608 	int i;
2609 
2610 	rcu_read_lock();
2611 	for_each_rmrr_units(rmrr) {
2612 		/*
2613 		 * Return TRUE if this RMRR contains the device that
2614 		 * is passed in.
2615 		 */
2616 		for_each_active_dev_scope(rmrr->devices,
2617 					  rmrr->devices_cnt, i, tmp)
2618 			if (tmp == dev ||
2619 			    is_downstream_to_pci_bridge(dev, tmp)) {
2620 				rcu_read_unlock();
2621 				return true;
2622 			}
2623 	}
2624 	rcu_read_unlock();
2625 	return false;
2626 }
2627 
2628 /**
2629  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2630  * is relaxable (i.e. allowed not to be enforced under some conditions)
2631  * @dev: device handle
2632  *
2633  * We assume that PCI USB devices with RMRRs have them largely
2634  * for historical reasons and that the RMRR space is not actively used post
2635  * boot.  This exclusion may change if vendors begin to abuse it.
2636  *
2637  * The same exception is made for graphics devices, with the requirement that
2638  * any use of the RMRR regions will be torn down before assigning the device
2639  * to a guest.
2640  *
2641  * Return: true if the RMRR is relaxable, false otherwise
2642  */
2643 static bool device_rmrr_is_relaxable(struct device *dev)
2644 {
2645 	struct pci_dev *pdev;
2646 
2647 	if (!dev_is_pci(dev))
2648 		return false;
2649 
2650 	pdev = to_pci_dev(dev);
2651 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2652 		return true;
2653 	else
2654 		return false;
2655 }
2656 
2657 /*
2658  * There are a couple of cases where we need to restrict the functionality of
2659  * devices associated with RMRRs.  The first is when evaluating a device for
2660  * identity mapping because problems exist when devices are moved in and out
2661  * of domains and their respective RMRR information is lost.  This means that
2662  * a device with associated RMRRs will never be in a "passthrough" domain.
2663  * The second is use of the device through the IOMMU API.  This interface
2664  * expects to have full control of the IOVA space for the device.  We cannot
2665  * satisfy both the requirement that RMRR access is maintained and have an
2666  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2667  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2668  * We therefore prevent devices associated with an RMRR from participating in
2669  * the IOMMU API, which eliminates them from device assignment.
2670  *
2671  * In both cases, devices which have relaxable RMRRs are not concerned by this
2672  * restriction. See device_rmrr_is_relaxable comment.
2673  */
2674 static bool device_is_rmrr_locked(struct device *dev)
2675 {
2676 	if (!device_has_rmrr(dev))
2677 		return false;
2678 
2679 	if (device_rmrr_is_relaxable(dev))
2680 		return false;
2681 
2682 	return true;
2683 }
2684 
2685 /*
2686  * Return the required default domain type for a specific device.
2687  *
2688  * @dev: the device in query
2689  * @startup: true if this is during early boot
2691  * Returns:
2692  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2693  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2694  *  - 0: both identity and dynamic domains work for this device
2695  */
2696 static int device_def_domain_type(struct device *dev)
2697 {
2698 	if (dev_is_pci(dev)) {
2699 		struct pci_dev *pdev = to_pci_dev(dev);
2700 
2701 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2702 			return IOMMU_DOMAIN_IDENTITY;
2703 
2704 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2705 			return IOMMU_DOMAIN_IDENTITY;
2706 	}
2707 
2708 	return 0;
2709 }
2710 
2711 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2712 {
2713 	/*
2714 	 * Start from a sane iommu hardware state.
2715 	 * If queued invalidation has already been initialized by us
2716 	 * (for example, while enabling interrupt remapping), then
2717 	 * things are already rolling from a sane state.
2718 	 */
2719 	if (!iommu->qi) {
2720 		/*
2721 		 * Clear any previous faults.
2722 		 */
2723 		dmar_fault(-1, iommu);
2724 		/*
2725 		 * Disable queued invalidation if supported and already enabled
2726 		 * before OS handover.
2727 		 */
2728 		dmar_disable_qi(iommu);
2729 	}
2730 
2731 	if (dmar_enable_qi(iommu)) {
2732 		/*
2733 		 * Queued invalidation is not enabled; use register-based invalidation.
2734 		 */
2735 		iommu->flush.flush_context = __iommu_flush_context;
2736 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2737 		pr_info("%s: Using Register based invalidation\n",
2738 			iommu->name);
2739 	} else {
2740 		iommu->flush.flush_context = qi_flush_context;
2741 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2742 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2743 	}
2744 }
2745 
2746 static int copy_context_table(struct intel_iommu *iommu,
2747 			      struct root_entry *old_re,
2748 			      struct context_entry **tbl,
2749 			      int bus, bool ext)
2750 {
2751 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2752 	struct context_entry *new_ce = NULL, ce;
2753 	struct context_entry *old_ce = NULL;
2754 	struct root_entry re;
2755 	phys_addr_t old_ce_phys;
2756 
2757 	tbl_idx = ext ? bus * 2 : bus;
2758 	memcpy(&re, old_re, sizeof(re));
2759 
2760 	for (devfn = 0; devfn < 256; devfn++) {
2761 		/* First calculate the correct index */
2762 		idx = (ext ? devfn * 2 : devfn) % 256;
2763 
2764 		if (idx == 0) {
2765 			/* First save what we may have and clean up */
2766 			if (new_ce) {
2767 				tbl[tbl_idx] = new_ce;
2768 				__iommu_flush_cache(iommu, new_ce,
2769 						    VTD_PAGE_SIZE);
2770 				pos = 1;
2771 			}
2772 
2773 			if (old_ce)
2774 				memunmap(old_ce);
2775 
2776 			ret = 0;
2777 			if (devfn < 0x80)
2778 				old_ce_phys = root_entry_lctp(&re);
2779 			else
2780 				old_ce_phys = root_entry_uctp(&re);
2781 
2782 			if (!old_ce_phys) {
2783 				if (ext && devfn == 0) {
2784 					/* No LCTP, try UCTP */
2785 					devfn = 0x7f;
2786 					continue;
2787 				} else {
2788 					goto out;
2789 				}
2790 			}
2791 
2792 			ret = -ENOMEM;
2793 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2794 					MEMREMAP_WB);
2795 			if (!old_ce)
2796 				goto out;
2797 
2798 			new_ce = alloc_pgtable_page(iommu->node);
2799 			if (!new_ce)
2800 				goto out_unmap;
2801 
2802 			ret = 0;
2803 		}
2804 
2805 		/* Now copy the context entry */
2806 		memcpy(&ce, old_ce + idx, sizeof(ce));
2807 
2808 		if (!__context_present(&ce))
2809 			continue;
2810 
2811 		did = context_domain_id(&ce);
2812 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2813 			set_bit(did, iommu->domain_ids);
2814 
2815 		/*
2816 		 * We need a marker for copied context entries. This
2817 		 * marker needs to work for the old format as well as
2818 		 * for extended context entries.
2819 		 *
2820 		 * Bit 67 of the context entry is used. In the old
2821 		 * format this bit is available to software, in the
2822 		 * extended format it is the PGE bit, but PGE is ignored
2823 		 * by HW if PASIDs are disabled (and thus still
2824 		 * available).
2825 		 *
2826 		 * So disable PASIDs first and then mark the entry
2827 		 * copied. This means that we don't copy PASID
2828 		 * translations from the old kernel, but this is fine as
2829 		 * faults there are not fatal.
2830 		 */
2831 		context_clear_pasid_enable(&ce);
2832 		context_set_copied(&ce);
2833 
2834 		new_ce[idx] = ce;
2835 	}
2836 
2837 	tbl[tbl_idx + pos] = new_ce;
2838 
2839 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2840 
2841 out_unmap:
2842 	memunmap(old_ce);
2843 
2844 out:
2845 	return ret;
2846 }
2847 
2848 static int copy_translation_tables(struct intel_iommu *iommu)
2849 {
2850 	struct context_entry **ctxt_tbls;
2851 	struct root_entry *old_rt;
2852 	phys_addr_t old_rt_phys;
2853 	int ctxt_table_entries;
2854 	unsigned long flags;
2855 	u64 rtaddr_reg;
2856 	int bus, ret;
2857 	bool new_ext, ext;
2858 
2859 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2860 	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2861 	new_ext    = !!ecap_ecs(iommu->ecap);
2862 
2863 	/*
2864 	 * The RTT bit can only be changed when translation is disabled,
2865 	 * but disabling translation means opening a window for data
2866 	 * corruption. So bail out and don't copy anything if we would
2867 	 * have to change the bit.
2868 	 */
2869 	if (new_ext != ext)
2870 		return -EINVAL;
2871 
2872 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2873 	if (!old_rt_phys)
2874 		return -EINVAL;
2875 
2876 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2877 	if (!old_rt)
2878 		return -ENOMEM;
2879 
2880 	/* This is too big for the stack - allocate it from slab */
2881 	ctxt_table_entries = ext ? 512 : 256;
2882 	ret = -ENOMEM;
2883 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2884 	if (!ctxt_tbls)
2885 		goto out_unmap;
2886 
2887 	for (bus = 0; bus < 256; bus++) {
2888 		ret = copy_context_table(iommu, &old_rt[bus],
2889 					 ctxt_tbls, bus, ext);
2890 		if (ret) {
2891 			pr_err("%s: Failed to copy context table for bus %d\n",
2892 				iommu->name, bus);
2893 			continue;
2894 		}
2895 	}
2896 
2897 	spin_lock_irqsave(&iommu->lock, flags);
2898 
2899 	/* Context tables are copied, now write them to the root_entry table */
2900 	for (bus = 0; bus < 256; bus++) {
2901 		int idx = ext ? bus * 2 : bus;
2902 		u64 val;
2903 
2904 		if (ctxt_tbls[idx]) {
2905 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2906 			iommu->root_entry[bus].lo = val;
2907 		}
2908 
2909 		if (!ext || !ctxt_tbls[idx + 1])
2910 			continue;
2911 
2912 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2913 		iommu->root_entry[bus].hi = val;
2914 	}
2915 
2916 	spin_unlock_irqrestore(&iommu->lock, flags);
2917 
2918 	kfree(ctxt_tbls);
2919 
2920 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2921 
2922 	ret = 0;
2923 
2924 out_unmap:
2925 	memunmap(old_rt);
2926 
2927 	return ret;
2928 }
2929 
2930 #ifdef CONFIG_INTEL_IOMMU_SVM
2931 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2932 {
2933 	struct intel_iommu *iommu = data;
2934 	ioasid_t ioasid;
2935 
2936 	if (!iommu)
2937 		return INVALID_IOASID;
2938 	/*
2939 	 * VT-d virtual command interface always uses the full 20 bit
2940 	 * PASID range. The host can partition the guest PASID range based on
2941 	 * policies, but this is out of the guest's control.
2942 	 */
2943 	if (min < PASID_MIN || max > intel_pasid_max_id)
2944 		return INVALID_IOASID;
2945 
2946 	if (vcmd_alloc_pasid(iommu, &ioasid))
2947 		return INVALID_IOASID;
2948 
2949 	return ioasid;
2950 }
2951 
2952 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2953 {
2954 	struct intel_iommu *iommu = data;
2955 
2956 	if (!iommu)
2957 		return;
2958 	/*
2959 	 * The sanity check of the ioasid owner is done at an upper layer, e.g. VFIO.
2960 	 * We can only free the PASID when all the devices are unbound.
2961 	 */
2962 	if (ioasid_find(NULL, ioasid, NULL)) {
2963 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2964 		return;
2965 	}
2966 	vcmd_free_pasid(iommu, ioasid);
2967 }
2968 
2969 static void register_pasid_allocator(struct intel_iommu *iommu)
2970 {
2971 	/*
2972 	 * If we are running in the host, there is no need for a custom
2973 	 * allocator because PASIDs are allocated system-wide by the host.
2974 	 */
2975 	if (!cap_caching_mode(iommu->cap))
2976 		return;
2977 
2978 	if (!sm_supported(iommu)) {
2979 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2980 		return;
2981 	}
2982 
2983 	/*
2984 	 * Register a custom PASID allocator if we are running in a guest;
2985 	 * guest PASIDs must be obtained via the virtual command interface.
2986 	 * There can be multiple vIOMMUs in each guest but only one allocator
2987 	 * is active. All vIOMMU allocators will eventually be calling the same
2988 	 * host allocator.
2989 	 */
2990 	if (!vccap_pasid(iommu->vccap))
2991 		return;
2992 
2993 	pr_info("Register custom PASID allocator\n");
2994 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2995 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2996 	iommu->pasid_allocator.pdata = (void *)iommu;
2997 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2998 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2999 		/*
3000 		 * Disable scalable mode on this IOMMU if there
3001 		 * is no custom allocator. Mixing SM-capable vIOMMUs
3002 		 * and non-SM vIOMMUs is not supported.
3003 		 */
3004 		intel_iommu_sm = 0;
3005 	}
3006 }
3007 #endif
3008 
3009 static int __init init_dmars(void)
3010 {
3011 	struct dmar_drhd_unit *drhd;
3012 	struct intel_iommu *iommu;
3013 	int ret;
3014 
3015 	/*
3016 	 * for each drhd
3017 	 *    allocate root
3018 	 *    initialize and program root entry to not present
3019 	 * endfor
3020 	 */
3021 	for_each_drhd_unit(drhd) {
3022 		/*
3023 		 * No lock is needed, as this is only incremented in the
3024 		 * single-threaded kernel __init code path; all other accesses
3025 		 * are read-only.
3026 		 */
3027 		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3028 			g_num_of_iommus++;
3029 			continue;
3030 		}
3031 		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3032 	}
3033 
3034 	/* Preallocate enough resources for IOMMU hot-addition */
3035 	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3036 		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3037 
3038 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3039 			GFP_KERNEL);
3040 	if (!g_iommus) {
3041 		ret = -ENOMEM;
3042 		goto error;
3043 	}
3044 
3045 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3046 	if (ret)
3047 		goto free_iommu;
3048 
3049 	for_each_iommu(iommu, drhd) {
3050 		if (drhd->ignored) {
3051 			iommu_disable_translation(iommu);
3052 			continue;
3053 		}
3054 
3055 		/*
3056 		 * Find the max pasid size of all IOMMUs in the system.
3057 		 * We need to ensure the system pasid table is no bigger
3058 		 * than the smallest supported.
3059 		 */
3060 		if (pasid_supported(iommu)) {
3061 			u32 temp = 2 << ecap_pss(iommu->ecap);
3062 
3063 			intel_pasid_max_id = min_t(u32, temp,
3064 						   intel_pasid_max_id);
3065 		}
3066 
3067 		g_iommus[iommu->seq_id] = iommu;
3068 
3069 		intel_iommu_init_qi(iommu);
3070 
3071 		ret = iommu_init_domains(iommu);
3072 		if (ret)
3073 			goto free_iommu;
3074 
3075 		init_translation_status(iommu);
3076 
3077 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3078 			iommu_disable_translation(iommu);
3079 			clear_translation_pre_enabled(iommu);
3080 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3081 				iommu->name);
3082 		}
3083 
3084 		/*
3085 		 * TBD:
3086 		 * we could share the same root & context tables
3087 		 * among all IOMMUs. This needs to be split out later.
3088 		 */
3089 		ret = iommu_alloc_root_entry(iommu);
3090 		if (ret)
3091 			goto free_iommu;
3092 
3093 		if (translation_pre_enabled(iommu)) {
3094 			pr_info("Translation already enabled - trying to copy translation structures\n");
3095 
3096 			ret = copy_translation_tables(iommu);
3097 			if (ret) {
3098 				/*
3099 				 * We found the IOMMU with translation
3100 				 * enabled - but failed to copy over the
3101 				 * old root-entry table. Try to proceed
3102 				 * by disabling translation now and
3103 				 * allocating a clean root-entry table.
3104 				 * This might cause DMAR faults, but
3105 				 * probably the dump will still succeed.
3106 				 */
3107 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3108 				       iommu->name);
3109 				iommu_disable_translation(iommu);
3110 				clear_translation_pre_enabled(iommu);
3111 			} else {
3112 				pr_info("Copied translation tables from previous kernel for %s\n",
3113 					iommu->name);
3114 			}
3115 		}
3116 
3117 		if (!ecap_pass_through(iommu->ecap))
3118 			hw_pass_through = 0;
3119 		intel_svm_check(iommu);
3120 	}
3121 
3122 	/*
3123 	 * Now that qi is enabled on all iommus, set the root entry and flush
3124 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3125 	 * flush_context function will loop forever and the boot hangs.
3126 	 */
3127 	for_each_active_iommu(iommu, drhd) {
3128 		iommu_flush_write_buffer(iommu);
3129 #ifdef CONFIG_INTEL_IOMMU_SVM
3130 		register_pasid_allocator(iommu);
3131 #endif
3132 		iommu_set_root_entry(iommu);
3133 	}
3134 
3135 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3136 	dmar_map_gfx = 0;
3137 #endif
3138 
3139 	if (!dmar_map_gfx)
3140 		iommu_identity_mapping |= IDENTMAP_GFX;
3141 
3142 	check_tylersburg_isoch();
3143 
3144 	ret = si_domain_init(hw_pass_through);
3145 	if (ret)
3146 		goto free_iommu;
3147 
3148 	/*
3149 	 * for each drhd
3150 	 *   enable fault log
3151 	 *   global invalidate context cache
3152 	 *   global invalidate iotlb
3153 	 *   enable translation
3154 	 */
3155 	for_each_iommu(iommu, drhd) {
3156 		if (drhd->ignored) {
3157 			/*
3158 			 * we always have to disable PMRs or DMA may fail on
3159 			 * this device
3160 			 */
3161 			if (force_on)
3162 				iommu_disable_protect_mem_regions(iommu);
3163 			continue;
3164 		}
3165 
3166 		iommu_flush_write_buffer(iommu);
3167 
3168 #ifdef CONFIG_INTEL_IOMMU_SVM
3169 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3170 			/*
3171 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3172 			 * could cause a lock race condition.
3173 			 */
3174 			up_write(&dmar_global_lock);
3175 			ret = intel_svm_enable_prq(iommu);
3176 			down_write(&dmar_global_lock);
3177 			if (ret)
3178 				goto free_iommu;
3179 		}
3180 #endif
3181 		ret = dmar_set_interrupt(iommu);
3182 		if (ret)
3183 			goto free_iommu;
3184 	}
3185 
3186 	return 0;
3187 
3188 free_iommu:
3189 	for_each_active_iommu(iommu, drhd) {
3190 		disable_dmar_iommu(iommu);
3191 		free_dmar_iommu(iommu);
3192 	}
3193 
3194 	kfree(g_iommus);
3195 
3196 error:
3197 	return ret;
3198 }
3199 
3200 static void __init init_no_remapping_devices(void)
3201 {
3202 	struct dmar_drhd_unit *drhd;
3203 	struct device *dev;
3204 	int i;
3205 
3206 	for_each_drhd_unit(drhd) {
3207 		if (!drhd->include_all) {
3208 			for_each_active_dev_scope(drhd->devices,
3209 						  drhd->devices_cnt, i, dev)
3210 				break;
3211 			/* ignore DMAR unit if no devices exist */
3212 			if (i == drhd->devices_cnt)
3213 				drhd->ignored = 1;
3214 		}
3215 	}
3216 
3217 	for_each_active_drhd_unit(drhd) {
3218 		if (drhd->include_all)
3219 			continue;
3220 
3221 		for_each_active_dev_scope(drhd->devices,
3222 					  drhd->devices_cnt, i, dev)
3223 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3224 				break;
3225 		if (i < drhd->devices_cnt)
3226 			continue;
3227 
3228 		/* This IOMMU has *only* gfx devices. Either bypass it or
3229 		   set the gfx_mapped flag, as appropriate */
3230 		drhd->gfx_dedicated = 1;
3231 		if (!dmar_map_gfx)
3232 			drhd->ignored = 1;
3233 	}
3234 }
3235 
3236 #ifdef CONFIG_SUSPEND
3237 static int init_iommu_hw(void)
3238 {
3239 	struct dmar_drhd_unit *drhd;
3240 	struct intel_iommu *iommu = NULL;
3241 
3242 	for_each_active_iommu(iommu, drhd)
3243 		if (iommu->qi)
3244 			dmar_reenable_qi(iommu);
3245 
3246 	for_each_iommu(iommu, drhd) {
3247 		if (drhd->ignored) {
3248 			/*
3249 			 * we always have to disable PMRs or DMA may fail on
3250 			 * this device
3251 			 */
3252 			if (force_on)
3253 				iommu_disable_protect_mem_regions(iommu);
3254 			continue;
3255 		}
3256 
3257 		iommu_flush_write_buffer(iommu);
3258 		iommu_set_root_entry(iommu);
3259 		iommu_enable_translation(iommu);
3260 		iommu_disable_protect_mem_regions(iommu);
3261 	}
3262 
3263 	return 0;
3264 }
3265 
3266 static void iommu_flush_all(void)
3267 {
3268 	struct dmar_drhd_unit *drhd;
3269 	struct intel_iommu *iommu;
3270 
3271 	for_each_active_iommu(iommu, drhd) {
3272 		iommu->flush.flush_context(iommu, 0, 0, 0,
3273 					   DMA_CCMD_GLOBAL_INVL);
3274 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3275 					 DMA_TLB_GLOBAL_FLUSH);
3276 	}
3277 }
3278 
3279 static int iommu_suspend(void)
3280 {
3281 	struct dmar_drhd_unit *drhd;
3282 	struct intel_iommu *iommu = NULL;
3283 	unsigned long flag;
3284 
3285 	for_each_active_iommu(iommu, drhd) {
3286 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3287 					     GFP_KERNEL);
3288 		if (!iommu->iommu_state)
3289 			goto nomem;
3290 	}
3291 
3292 	iommu_flush_all();
3293 
3294 	for_each_active_iommu(iommu, drhd) {
3295 		iommu_disable_translation(iommu);
3296 
3297 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3298 
3299 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3300 			readl(iommu->reg + DMAR_FECTL_REG);
3301 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3302 			readl(iommu->reg + DMAR_FEDATA_REG);
3303 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3304 			readl(iommu->reg + DMAR_FEADDR_REG);
3305 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3306 			readl(iommu->reg + DMAR_FEUADDR_REG);
3307 
3308 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3309 	}
3310 	return 0;
3311 
3312 nomem:
3313 	for_each_active_iommu(iommu, drhd)
3314 		kfree(iommu->iommu_state);
3315 
3316 	return -ENOMEM;
3317 }
3318 
3319 static void iommu_resume(void)
3320 {
3321 	struct dmar_drhd_unit *drhd;
3322 	struct intel_iommu *iommu = NULL;
3323 	unsigned long flag;
3324 
3325 	if (init_iommu_hw()) {
3326 		if (force_on)
3327 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3328 		else
3329 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3330 		return;
3331 	}
3332 
3333 	for_each_active_iommu(iommu, drhd) {
3334 
3335 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3336 
3337 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3338 			iommu->reg + DMAR_FECTL_REG);
3339 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3340 			iommu->reg + DMAR_FEDATA_REG);
3341 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3342 			iommu->reg + DMAR_FEADDR_REG);
3343 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3344 			iommu->reg + DMAR_FEUADDR_REG);
3345 
3346 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3347 	}
3348 
3349 	for_each_active_iommu(iommu, drhd)
3350 		kfree(iommu->iommu_state);
3351 }
3352 
3353 static struct syscore_ops iommu_syscore_ops = {
3354 	.resume		= iommu_resume,
3355 	.suspend	= iommu_suspend,
3356 };
3357 
3358 static void __init init_iommu_pm_ops(void)
3359 {
3360 	register_syscore_ops(&iommu_syscore_ops);
3361 }
3362 
3363 #else
3364 static inline void init_iommu_pm_ops(void) {}
3365 #endif	/* CONFIG_SUSPEND */
3366 
3367 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3368 {
3369 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3370 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3371 	    rmrr->end_address <= rmrr->base_address ||
3372 	    arch_rmrr_sanity_check(rmrr))
3373 		return -EINVAL;
3374 
3375 	return 0;
3376 }
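
/*
 * For example, an RMRR with base_address 0x000e0000 and end_address
 * 0x000effff passes the alignment and ordering checks above (both 0xe0000
 * and 0xf0000 are 4KiB aligned and end > base), though
 * arch_rmrr_sanity_check() may still reject it; an entry whose end_address
 * is below its base_address is rejected with -EINVAL.
 */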
3377 
3378 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3379 {
3380 	struct acpi_dmar_reserved_memory *rmrr;
3381 	struct dmar_rmrr_unit *rmrru;
3382 
3383 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3384 	if (rmrr_sanity_check(rmrr)) {
3385 		pr_warn(FW_BUG
3386 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3387 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3388 			   rmrr->base_address, rmrr->end_address,
3389 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3390 			   dmi_get_system_info(DMI_BIOS_VERSION),
3391 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3392 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3393 	}
3394 
3395 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3396 	if (!rmrru)
3397 		goto out;
3398 
3399 	rmrru->hdr = header;
3400 
3401 	rmrru->base_address = rmrr->base_address;
3402 	rmrru->end_address = rmrr->end_address;
3403 
3404 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3405 				((void *)rmrr) + rmrr->header.length,
3406 				&rmrru->devices_cnt);
3407 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3408 		goto free_rmrru;
3409 
3410 	list_add(&rmrru->list, &dmar_rmrr_units);
3411 
3412 	return 0;
3413 free_rmrru:
3414 	kfree(rmrru);
3415 out:
3416 	return -ENOMEM;
3417 }
3418 
3419 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3420 {
3421 	struct dmar_atsr_unit *atsru;
3422 	struct acpi_dmar_atsr *tmp;
3423 
3424 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3425 				dmar_rcu_check()) {
3426 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3427 		if (atsr->segment != tmp->segment)
3428 			continue;
3429 		if (atsr->header.length != tmp->header.length)
3430 			continue;
3431 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3432 			return atsru;
3433 	}
3434 
3435 	return NULL;
3436 }
3437 
3438 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3439 {
3440 	struct acpi_dmar_atsr *atsr;
3441 	struct dmar_atsr_unit *atsru;
3442 
3443 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3444 		return 0;
3445 
3446 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3447 	atsru = dmar_find_atsr(atsr);
3448 	if (atsru)
3449 		return 0;
3450 
3451 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3452 	if (!atsru)
3453 		return -ENOMEM;
3454 
3455 	/*
3456 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3457 	 * copy the memory content because the memory buffer will be freed
3458 	 * on return.
3459 	 */
3460 	atsru->hdr = (void *)(atsru + 1);
3461 	memcpy(atsru->hdr, hdr, hdr->length);
3462 	atsru->include_all = atsr->flags & 0x1;
3463 	if (!atsru->include_all) {
3464 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3465 				(void *)atsr + atsr->header.length,
3466 				&atsru->devices_cnt);
3467 		if (atsru->devices_cnt && atsru->devices == NULL) {
3468 			kfree(atsru);
3469 			return -ENOMEM;
3470 		}
3471 	}
3472 
3473 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3474 
3475 	return 0;
3476 }
3477 
3478 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3479 {
3480 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3481 	kfree(atsru);
3482 }
3483 
3484 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3485 {
3486 	struct acpi_dmar_atsr *atsr;
3487 	struct dmar_atsr_unit *atsru;
3488 
3489 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3490 	atsru = dmar_find_atsr(atsr);
3491 	if (atsru) {
3492 		list_del_rcu(&atsru->list);
3493 		synchronize_rcu();
3494 		intel_iommu_free_atsr(atsru);
3495 	}
3496 
3497 	return 0;
3498 }
3499 
3500 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3501 {
3502 	int i;
3503 	struct device *dev;
3504 	struct acpi_dmar_atsr *atsr;
3505 	struct dmar_atsr_unit *atsru;
3506 
3507 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3508 	atsru = dmar_find_atsr(atsr);
3509 	if (!atsru)
3510 		return 0;
3511 
3512 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3513 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3514 					  i, dev)
3515 			return -EBUSY;
3516 	}
3517 
3518 	return 0;
3519 }
3520 
3521 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3522 {
3523 	struct dmar_satc_unit *satcu;
3524 	struct acpi_dmar_satc *tmp;
3525 
3526 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3527 				dmar_rcu_check()) {
3528 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3529 		if (satc->segment != tmp->segment)
3530 			continue;
3531 		if (satc->header.length != tmp->header.length)
3532 			continue;
3533 		if (memcmp(satc, tmp, satc->header.length) == 0)
3534 			return satcu;
3535 	}
3536 
3537 	return NULL;
3538 }
3539 
3540 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3541 {
3542 	struct acpi_dmar_satc *satc;
3543 	struct dmar_satc_unit *satcu;
3544 
3545 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3546 		return 0;
3547 
3548 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3549 	satcu = dmar_find_satc(satc);
3550 	if (satcu)
3551 		return 0;
3552 
3553 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3554 	if (!satcu)
3555 		return -ENOMEM;
3556 
3557 	satcu->hdr = (void *)(satcu + 1);
3558 	memcpy(satcu->hdr, hdr, hdr->length);
3559 	satcu->atc_required = satc->flags & 0x1;
3560 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3561 					      (void *)satc + satc->header.length,
3562 					      &satcu->devices_cnt);
3563 	if (satcu->devices_cnt && !satcu->devices) {
3564 		kfree(satcu);
3565 		return -ENOMEM;
3566 	}
3567 	list_add_rcu(&satcu->list, &dmar_satc_units);
3568 
3569 	return 0;
3570 }
3571 
3572 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3573 {
3574 	int sp, ret;
3575 	struct intel_iommu *iommu = dmaru->iommu;
3576 
3577 	if (g_iommus[iommu->seq_id])
3578 		return 0;
3579 
3580 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3581 	if (ret)
3582 		goto out;
3583 
3584 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3585 		pr_warn("%s: Doesn't support hardware pass through.\n",
3586 			iommu->name);
3587 		return -ENXIO;
3588 	}
3589 	if (!ecap_sc_support(iommu->ecap) &&
3590 	    domain_update_iommu_snooping(iommu)) {
3591 		pr_warn("%s: Doesn't support snooping.\n",
3592 			iommu->name);
3593 		return -ENXIO;
3594 	}
3595 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3596 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3597 		pr_warn("%s: Doesn't support large page.\n",
3598 			iommu->name);
3599 		return -ENXIO;
3600 	}
3601 
3602 	/*
3603 	 * Disable translation if already enabled prior to OS handover.
3604 	 */
3605 	if (iommu->gcmd & DMA_GCMD_TE)
3606 		iommu_disable_translation(iommu);
3607 
3608 	g_iommus[iommu->seq_id] = iommu;
3609 	ret = iommu_init_domains(iommu);
3610 	if (ret == 0)
3611 		ret = iommu_alloc_root_entry(iommu);
3612 	if (ret)
3613 		goto out;
3614 
3615 	intel_svm_check(iommu);
3616 
3617 	if (dmaru->ignored) {
3618 		/*
3619 		 * we always have to disable PMRs or DMA may fail on this device
3620 		 */
3621 		if (force_on)
3622 			iommu_disable_protect_mem_regions(iommu);
3623 		return 0;
3624 	}
3625 
3626 	intel_iommu_init_qi(iommu);
3627 	iommu_flush_write_buffer(iommu);
3628 
3629 #ifdef CONFIG_INTEL_IOMMU_SVM
3630 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3631 		ret = intel_svm_enable_prq(iommu);
3632 		if (ret)
3633 			goto disable_iommu;
3634 	}
3635 #endif
3636 	ret = dmar_set_interrupt(iommu);
3637 	if (ret)
3638 		goto disable_iommu;
3639 
3640 	iommu_set_root_entry(iommu);
3641 	iommu_enable_translation(iommu);
3642 
3643 	iommu_disable_protect_mem_regions(iommu);
3644 	return 0;
3645 
3646 disable_iommu:
3647 	disable_dmar_iommu(iommu);
3648 out:
3649 	free_dmar_iommu(iommu);
3650 	return ret;
3651 }
3652 
3653 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3654 {
3655 	int ret = 0;
3656 	struct intel_iommu *iommu = dmaru->iommu;
3657 
3658 	if (!intel_iommu_enabled)
3659 		return 0;
3660 	if (iommu == NULL)
3661 		return -EINVAL;
3662 
3663 	if (insert) {
3664 		ret = intel_iommu_add(dmaru);
3665 	} else {
3666 		disable_dmar_iommu(iommu);
3667 		free_dmar_iommu(iommu);
3668 	}
3669 
3670 	return ret;
3671 }
3672 
3673 static void intel_iommu_free_dmars(void)
3674 {
3675 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3676 	struct dmar_atsr_unit *atsru, *atsr_n;
3677 	struct dmar_satc_unit *satcu, *satc_n;
3678 
3679 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3680 		list_del(&rmrru->list);
3681 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3682 		kfree(rmrru);
3683 	}
3684 
3685 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3686 		list_del(&atsru->list);
3687 		intel_iommu_free_atsr(atsru);
3688 	}
3689 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3690 		list_del(&satcu->list);
3691 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3692 		kfree(satcu);
3693 	}
3694 }
3695 
3696 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3697 {
3698 	struct dmar_satc_unit *satcu;
3699 	struct acpi_dmar_satc *satc;
3700 	struct device *tmp;
3701 	int i;
3702 
3703 	dev = pci_physfn(dev);
3704 	rcu_read_lock();
3705 
3706 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3707 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3708 		if (satc->segment != pci_domain_nr(dev->bus))
3709 			continue;
3710 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3711 			if (to_pci_dev(tmp) == dev)
3712 				goto out;
3713 	}
3714 	satcu = NULL;
3715 out:
3716 	rcu_read_unlock();
3717 	return satcu;
3718 }
3719 
3720 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3721 {
3722 	int i, ret = 1;
3723 	struct pci_bus *bus;
3724 	struct pci_dev *bridge = NULL;
3725 	struct device *tmp;
3726 	struct acpi_dmar_atsr *atsr;
3727 	struct dmar_atsr_unit *atsru;
3728 	struct dmar_satc_unit *satcu;
3729 
3730 	dev = pci_physfn(dev);
3731 	satcu = dmar_find_matched_satc_unit(dev);
3732 	if (satcu)
3733 		/*
3734 		 * This device supports ATS, as it is in the SATC table.
3735 		 * When the IOMMU is in legacy mode, enabling ATS is done
3736 		 * automatically by HW for a device that requires ATS,
3737 		 * hence the OS should not enable ATS for this device,
3738 		 * to avoid duplicated TLB invalidation.
3739 		 */
3740 		return !(satcu->atc_required && !sm_supported(iommu));
3741 
3742 	for (bus = dev->bus; bus; bus = bus->parent) {
3743 		bridge = bus->self;
3744 		/* If it's an integrated device, allow ATS */
3745 		if (!bridge)
3746 			return 1;
3747 		/* Connected via non-PCIe: no ATS */
3748 		if (!pci_is_pcie(bridge) ||
3749 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3750 			return 0;
3751 		/* If we found the root port, look it up in the ATSR */
3752 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3753 			break;
3754 	}
3755 
3756 	rcu_read_lock();
3757 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3758 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3759 		if (atsr->segment != pci_domain_nr(dev->bus))
3760 			continue;
3761 
3762 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3763 			if (tmp == &bridge->dev)
3764 				goto out;
3765 
3766 		if (atsru->include_all)
3767 			goto out;
3768 	}
3769 	ret = 0;
3770 out:
3771 	rcu_read_unlock();
3772 
3773 	return ret;
3774 }
3775 
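/*
 * PCI bus notifier helper: keep the cached device scopes of the RMRR,
 * ATSR and SATC units in sync as PCI devices are added to or removed
 * from the system.
 */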
3776 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3777 {
3778 	int ret;
3779 	struct dmar_rmrr_unit *rmrru;
3780 	struct dmar_atsr_unit *atsru;
3781 	struct dmar_satc_unit *satcu;
3782 	struct acpi_dmar_atsr *atsr;
3783 	struct acpi_dmar_reserved_memory *rmrr;
3784 	struct acpi_dmar_satc *satc;
3785 
3786 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3787 		return 0;
3788 
3789 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3790 		rmrr = container_of(rmrru->hdr,
3791 				    struct acpi_dmar_reserved_memory, header);
3792 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3793 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3794 				((void *)rmrr) + rmrr->header.length,
3795 				rmrr->segment, rmrru->devices,
3796 				rmrru->devices_cnt);
3797 			if (ret < 0)
3798 				return ret;
3799 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3800 			dmar_remove_dev_scope(info, rmrr->segment,
3801 				rmrru->devices, rmrru->devices_cnt);
3802 		}
3803 	}
3804 
3805 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3806 		if (atsru->include_all)
3807 			continue;
3808 
3809 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3810 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3811 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3812 					(void *)atsr + atsr->header.length,
3813 					atsr->segment, atsru->devices,
3814 					atsru->devices_cnt);
3815 			if (ret > 0)
3816 				break;
3817 			else if (ret < 0)
3818 				return ret;
3819 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3820 			if (dmar_remove_dev_scope(info, atsr->segment,
3821 					atsru->devices, atsru->devices_cnt))
3822 				break;
3823 		}
3824 	}
3825 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3826 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3827 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3828 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3829 					(void *)satc + satc->header.length,
3830 					satc->segment, satcu->devices,
3831 					satcu->devices_cnt);
3832 			if (ret > 0)
3833 				break;
3834 			else if (ret < 0)
3835 				return ret;
3836 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3837 			if (dmar_remove_dev_scope(info, satc->segment,
3838 					satcu->devices, satcu->devices_cnt))
3839 				break;
3840 		}
3841 	}
3842 
3843 	return 0;
3844 }
3845 
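/*
 * Memory hotplug notifier: keep the si_domain identity map in sync with
 * system RAM.  New memory is identity-mapped before it goes online;
 * offlined (or cancelled) memory is unmapped again and the IOTLBs of all
 * active IOMMUs are flushed for the affected range.
 */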
3846 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3847 				       unsigned long val, void *v)
3848 {
3849 	struct memory_notify *mhp = v;
3850 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3851 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3852 			mhp->nr_pages - 1);
3853 
3854 	switch (val) {
3855 	case MEM_GOING_ONLINE:
3856 		if (iommu_domain_identity_map(si_domain,
3857 					      start_vpfn, last_vpfn)) {
3858 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3859 				start_vpfn, last_vpfn);
3860 			return NOTIFY_BAD;
3861 		}
3862 		break;
3863 
3864 	case MEM_OFFLINE:
3865 	case MEM_CANCEL_ONLINE:
3866 		{
3867 			struct dmar_drhd_unit *drhd;
3868 			struct intel_iommu *iommu;
3869 			LIST_HEAD(freelist);
3870 
3871 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3872 
3873 			rcu_read_lock();
3874 			for_each_active_iommu(iommu, drhd)
3875 				iommu_flush_iotlb_psi(iommu, si_domain,
3876 					start_vpfn, mhp->nr_pages,
3877 					list_empty(&freelist), 0);
3878 			rcu_read_unlock();
3879 			put_pages_list(&freelist);
3880 		}
3881 		break;
3882 	}
3883 
3884 	return NOTIFY_OK;
3885 }
3886 
3887 static struct notifier_block intel_iommu_memory_nb = {
3888 	.notifier_call = intel_iommu_memory_notifier,
3889 	.priority = 0
3890 };
3891 
3892 static void intel_disable_iommus(void)
3893 {
3894 	struct intel_iommu *iommu = NULL;
3895 	struct dmar_drhd_unit *drhd;
3896 
3897 	for_each_iommu(iommu, drhd)
3898 		iommu_disable_translation(iommu);
3899 }
3900 
3901 void intel_iommu_shutdown(void)
3902 {
3903 	struct dmar_drhd_unit *drhd;
3904 	struct intel_iommu *iommu = NULL;
3905 
3906 	if (no_iommu || dmar_disabled)
3907 		return;
3908 
3909 	down_write(&dmar_global_lock);
3910 
3911 	/* Disable PMRs explicitly here. */
3912 	for_each_iommu(iommu, drhd)
3913 		iommu_disable_protect_mem_regions(iommu);
3914 
3915 	/* Make sure the IOMMUs are switched off */
3916 	intel_disable_iommus();
3917 
3918 	up_write(&dmar_global_lock);
3919 }
3920 
3921 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3922 {
3923 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3924 
3925 	return container_of(iommu_dev, struct intel_iommu, iommu);
3926 }
3927 
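/*
 * Read-only sysfs attributes exported for each remapping unit through
 * iommu_device_sysfs_add(), typically visible as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,...};
 * e.g. "cat /sys/class/iommu/dmar0/intel-iommu/cap" dumps the raw
 * capability register.
 */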
3928 static ssize_t version_show(struct device *dev,
3929 			    struct device_attribute *attr, char *buf)
3930 {
3931 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3932 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3933 	return sprintf(buf, "%d:%d\n",
3934 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3935 }
3936 static DEVICE_ATTR_RO(version);
3937 
3938 static ssize_t address_show(struct device *dev,
3939 			    struct device_attribute *attr, char *buf)
3940 {
3941 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3942 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3943 }
3944 static DEVICE_ATTR_RO(address);
3945 
3946 static ssize_t cap_show(struct device *dev,
3947 			struct device_attribute *attr, char *buf)
3948 {
3949 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3950 	return sprintf(buf, "%llx\n", iommu->cap);
3951 }
3952 static DEVICE_ATTR_RO(cap);
3953 
3954 static ssize_t ecap_show(struct device *dev,
3955 			 struct device_attribute *attr, char *buf)
3956 {
3957 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3958 	return sprintf(buf, "%llx\n", iommu->ecap);
3959 }
3960 static DEVICE_ATTR_RO(ecap);
3961 
3962 static ssize_t domains_supported_show(struct device *dev,
3963 				      struct device_attribute *attr, char *buf)
3964 {
3965 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3966 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3967 }
3968 static DEVICE_ATTR_RO(domains_supported);
3969 
3970 static ssize_t domains_used_show(struct device *dev,
3971 				 struct device_attribute *attr, char *buf)
3972 {
3973 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3974 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3975 						  cap_ndoms(iommu->cap)));
3976 }
3977 static DEVICE_ATTR_RO(domains_used);
3978 
3979 static struct attribute *intel_iommu_attrs[] = {
3980 	&dev_attr_version.attr,
3981 	&dev_attr_address.attr,
3982 	&dev_attr_cap.attr,
3983 	&dev_attr_ecap.attr,
3984 	&dev_attr_domains_supported.attr,
3985 	&dev_attr_domains_used.attr,
3986 	NULL,
3987 };
3988 
3989 static struct attribute_group intel_iommu_group = {
3990 	.name = "intel-iommu",
3991 	.attrs = intel_iommu_attrs,
3992 };
3993 
3994 const struct attribute_group *intel_iommu_groups[] = {
3995 	&intel_iommu_group,
3996 	NULL,
3997 };
3998 
3999 static inline bool has_external_pci(void)
4000 {
4001 	struct pci_dev *pdev = NULL;
4002 
4003 	for_each_pci_dev(pdev)
4004 		if (pdev->external_facing)
4005 			return true;
4006 
4007 	return false;
4008 }
4009 
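/*
 * Honour the DMAR platform opt-in flag: if the firmware requests DMA
 * protection, the platform has an external-facing PCI port and the user
 * has not explicitly opted out, force the IOMMU on even when it is
 * disabled by default.  Returns 1 if the IOMMU was force-enabled here.
 */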
4010 static int __init platform_optin_force_iommu(void)
4011 {
4012 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4013 		return 0;
4014 
4015 	if (no_iommu || dmar_disabled)
4016 		pr_info("Intel-IOMMU force enabled due to platform opt-in\n");
4017 
4018 	/*
4019 	 * If Intel-IOMMU is disabled by default, we will apply the identity
4020 	 * map to all devices except those marked as untrusted.
4021 	 */
4022 	if (dmar_disabled)
4023 		iommu_set_default_passthrough(false);
4024 
4025 	dmar_disabled = 0;
4026 	no_iommu = 0;
4027 
4028 	return 1;
4029 }
4030 
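/*
 * Probe ACPI namespace devices that appear in DRHD device scopes.  Their
 * physical nodes are attached to the Intel IOMMU ops and probed manually,
 * since they are not enumerated through the PCI bus.
 */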
4031 static int __init probe_acpi_namespace_devices(void)
4032 {
4033 	struct dmar_drhd_unit *drhd;
4034 	/* To avoid a -Wunused-but-set-variable warning. */
4035 	struct intel_iommu *iommu __maybe_unused;
4036 	struct device *dev;
4037 	int i, ret = 0;
4038 
4039 	for_each_active_iommu(iommu, drhd) {
4040 		for_each_active_dev_scope(drhd->devices,
4041 					  drhd->devices_cnt, i, dev) {
4042 			struct acpi_device_physical_node *pn;
4043 			struct iommu_group *group;
4044 			struct acpi_device *adev;
4045 
4046 			if (dev->bus != &acpi_bus_type)
4047 				continue;
4048 
4049 			adev = to_acpi_device(dev);
4050 			mutex_lock(&adev->physical_node_lock);
4051 			list_for_each_entry(pn,
4052 					    &adev->physical_node_list, node) {
4053 				group = iommu_group_get(pn->dev);
4054 				if (group) {
4055 					iommu_group_put(group);
4056 					continue;
4057 				}
4058 
4059 				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4060 				ret = iommu_probe_device(pn->dev);
4061 				if (ret)
4062 					break;
4063 			}
4064 			mutex_unlock(&adev->physical_node_lock);
4065 
4066 			if (ret)
4067 				return ret;
4068 		}
4069 	}
4070 
4071 	return 0;
4072 }
4073 
4074 int __init intel_iommu_init(void)
4075 {
4076 	int ret = -ENODEV;
4077 	struct dmar_drhd_unit *drhd;
4078 	struct intel_iommu *iommu;
4079 
4080 	/*
4081 	 * Intel IOMMU is required for a TXT/tboot launch or platform
4082 	 * opt in, so enforce that.
4083 	 */
4084 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4085 		    platform_optin_force_iommu();
4086 
4087 	down_write(&dmar_global_lock);
4088 	if (dmar_table_init()) {
4089 		if (force_on)
4090 			panic("tboot: Failed to initialize DMAR table\n");
4091 		goto out_free_dmar;
4092 	}
4093 
4094 	if (dmar_dev_scope_init() < 0) {
4095 		if (force_on)
4096 			panic("tboot: Failed to initialize DMAR device scope\n");
4097 		goto out_free_dmar;
4098 	}
4099 
4100 	up_write(&dmar_global_lock);
4101 
4102 	/*
4103 	 * The bus notifier itself takes the dmar_global_lock, so register
4104 	 * it outside the lock to avoid a lockdep complaint.
4105 	 */
4106 	dmar_register_bus_notifier();
4107 
4108 	down_write(&dmar_global_lock);
4109 
4110 	if (!no_iommu)
4111 		intel_iommu_debugfs_init();
4112 
4113 	if (no_iommu || dmar_disabled) {
4114 		/*
4115 		 * We exit the function here to ensure the IOMMU's remapping
4116 		 * and mempool aren't set up, which means the IOMMU's PMRs
4117 		 * won't be disabled via the call to init_dmars(). So disable
4118 		 * them explicitly here. The PMRs were set up by tboot prior
4119 		 * to calling SENTER, but the kernel is expected to reset and
4120 		 * tear down the PMRs.
4121 		 */
4122 		if (intel_iommu_tboot_noforce) {
4123 			for_each_iommu(iommu, drhd)
4124 				iommu_disable_protect_mem_regions(iommu);
4125 		}
4126 
4127 		/*
4128 		 * Make sure the IOMMUs are switched off, even when we
4129 		 * boot into a kexec kernel and the previous kernel left
4130 		 * them enabled.
4131 		 */
4132 		intel_disable_iommus();
4133 		goto out_free_dmar;
4134 	}
4135 
4136 	if (list_empty(&dmar_rmrr_units))
4137 		pr_info("No RMRR found\n");
4138 
4139 	if (list_empty(&dmar_atsr_units))
4140 		pr_info("No ATSR found\n");
4141 
4142 	if (list_empty(&dmar_satc_units))
4143 		pr_info("No SATC found\n");
4144 
4145 	if (dmar_map_gfx)
4146 		intel_iommu_gfx_mapped = 1;
4147 
4148 	init_no_remapping_devices();
4149 
4150 	ret = init_dmars();
4151 	if (ret) {
4152 		if (force_on)
4153 			panic("tboot: Failed to initialize DMARs\n");
4154 		pr_err("Initialization failed\n");
4155 		goto out_free_dmar;
4156 	}
4157 	up_write(&dmar_global_lock);
4158 
4159 	init_iommu_pm_ops();
4160 
4161 	down_read(&dmar_global_lock);
4162 	for_each_active_iommu(iommu, drhd) {
4163 		/*
4164 		 * The flush queue implementation does not perform
4165 		 * page-selective invalidations that are required for efficient
4166 		 * TLB flushes in virtual environments.  The benefit of batching
4167 		 * is likely to be much lower than the overhead of synchronizing
4168 		 * the virtual and physical IOMMU page-tables.
4169 		 */
4170 		if (cap_caching_mode(iommu->cap)) {
4171 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4172 			iommu_set_dma_strict();
4173 		}
4174 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4175 				       intel_iommu_groups,
4176 				       "%s", iommu->name);
4177 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4178 	}
4179 	up_read(&dmar_global_lock);
4180 
4181 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4182 	if (si_domain && !hw_pass_through)
4183 		register_memory_notifier(&intel_iommu_memory_nb);
4184 
4185 	down_read(&dmar_global_lock);
4186 	if (probe_acpi_namespace_devices())
4187 		pr_warn("ACPI namespace devices didn't probe correctly\n");
4188 
4189 	/* Finally, we enable the DMA remapping hardware. */
4190 	for_each_iommu(iommu, drhd) {
4191 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4192 			iommu_enable_translation(iommu);
4193 
4194 		iommu_disable_protect_mem_regions(iommu);
4195 	}
4196 	up_read(&dmar_global_lock);
4197 
4198 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4199 
4200 	intel_iommu_enabled = 1;
4201 
4202 	return 0;
4203 
4204 out_free_dmar:
4205 	intel_iommu_free_dmars();
4206 	up_write(&dmar_global_lock);
4207 	return ret;
4208 }
4209 
4210 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4211 {
4212 	struct device_domain_info *info = opaque;
4213 
4214 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4215 	return 0;
4216 }
4217 
4218 /*
4219  * NB - intel-iommu lacks any sort of reference counting for the users of
4220  * dependent devices.  If multiple endpoints have intersecting dependent
4221  * devices, unbinding the driver from any one of them will possibly leave
4222  * the others unable to operate.
4223  */
4224 static void domain_context_clear(struct device_domain_info *info)
4225 {
4226 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4227 		return;
4228 
4229 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4230 			       &domain_context_clear_one_cb, info);
4231 }
4232 
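/*
 * Tear down all per-device state: the RID2PASID entry in scalable mode,
 * the device IOTLB, the context table entries and the PASID table, then
 * detach the device from its domain.  Caller must hold device_domain_lock.
 */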
4233 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4234 {
4235 	struct dmar_domain *domain;
4236 	struct intel_iommu *iommu;
4237 	unsigned long flags;
4238 
4239 	assert_spin_locked(&device_domain_lock);
4240 
4241 	if (WARN_ON(!info))
4242 		return;
4243 
4244 	iommu = info->iommu;
4245 	domain = info->domain;
4246 
4247 	if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4248 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4249 			intel_pasid_tear_down_entry(iommu, info->dev,
4250 					PASID_RID2PASID, false);
4251 
4252 		iommu_disable_dev_iotlb(info);
4253 		domain_context_clear(info);
4254 		intel_pasid_free_table(info->dev);
4255 	}
4256 
4257 	list_del(&info->link);
4258 
4259 	spin_lock_irqsave(&iommu->lock, flags);
4260 	domain_detach_iommu(domain, iommu);
4261 	spin_unlock_irqrestore(&iommu->lock, flags);
4262 }
4263 
4264 static void dmar_remove_one_dev_info(struct device *dev)
4265 {
4266 	struct device_domain_info *info;
4267 	unsigned long flags;
4268 
4269 	spin_lock_irqsave(&device_domain_lock, flags);
4270 	info = dev_iommu_priv_get(dev);
4271 	if (info)
4272 		__dmar_remove_one_dev_info(info);
4273 	spin_unlock_irqrestore(&device_domain_lock, flags);
4274 }
4275 
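/*
 * Initialise a domain allocated through the IOMMU API: derive the AGAW
 * from the requested guest address width and allocate the top-level page
 * directory.  The capability-dependent flags (coherency, snooping,
 * superpage) start out at their most conservative settings.
 */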
4276 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4277 {
4278 	int adjust_width;
4279 
4280 	/* calculate AGAW */
4281 	domain->gaw = guest_width;
4282 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4283 	domain->agaw = width_to_agaw(adjust_width);
4284 
4285 	domain->iommu_coherency = false;
4286 	domain->iommu_snooping = false;
4287 	domain->iommu_superpage = 0;
4288 	domain->max_addr = 0;
4289 
4290 	/* always allocate the top pgd */
4291 	domain->pgd = alloc_pgtable_page(domain->nid);
4292 	if (!domain->pgd)
4293 		return -ENOMEM;
4294 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4295 	return 0;
4296 }
4297 
4298 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4299 {
4300 	struct dmar_domain *dmar_domain;
4301 	struct iommu_domain *domain;
4302 
4303 	switch (type) {
4304 	case IOMMU_DOMAIN_DMA:
4305 	case IOMMU_DOMAIN_DMA_FQ:
4306 	case IOMMU_DOMAIN_UNMANAGED:
4307 		dmar_domain = alloc_domain(type);
4308 		if (!dmar_domain) {
4309 			pr_err("Can't allocate dmar_domain\n");
4310 			return NULL;
4311 		}
4312 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4313 			pr_err("Domain initialization failed\n");
4314 			domain_exit(dmar_domain);
4315 			return NULL;
4316 		}
4317 
4318 		domain = &dmar_domain->domain;
4319 		domain->geometry.aperture_start = 0;
4320 		domain->geometry.aperture_end   =
4321 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4322 		domain->geometry.force_aperture = true;
4323 
4324 		return domain;
4325 	case IOMMU_DOMAIN_IDENTITY:
4326 		return &si_domain->domain;
4327 	default:
4328 		return NULL;
4329 	}
4330 
4331 	return NULL;
4332 }
4333 
4334 static void intel_iommu_domain_free(struct iommu_domain *domain)
4335 {
4336 	if (domain != &si_domain->domain)
4337 		domain_exit(to_dmar_domain(domain));
4338 }
4339 
4340 static int prepare_domain_attach_device(struct iommu_domain *domain,
4341 					struct device *dev)
4342 {
4343 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4344 	struct intel_iommu *iommu;
4345 	int addr_width;
4346 
4347 	iommu = device_to_iommu(dev, NULL, NULL);
4348 	if (!iommu)
4349 		return -ENODEV;
4350 
4351 	/* check if this iommu agaw is sufficient for max mapped address */
4352 	addr_width = agaw_to_width(iommu->agaw);
4353 	if (addr_width > cap_mgaw(iommu->cap))
4354 		addr_width = cap_mgaw(iommu->cap);
4355 
4356 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4357 		dev_err(dev,
4358 			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4359 			__func__, addr_width, dmar_domain->max_addr);
4360 		return -EFAULT;
4361 	}
4362 	dmar_domain->gaw = addr_width;
4363 
4364 	/*
4365 	 * Knock out extra page-table levels that this IOMMU cannot walk.
4366 	 */
4367 	while (iommu->agaw < dmar_domain->agaw) {
4368 		struct dma_pte *pte;
4369 
4370 		pte = dmar_domain->pgd;
4371 		if (dma_pte_present(pte)) {
4372 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4373 			free_pgtable_page(pte);
4374 		}
4375 		dmar_domain->agaw--;
4376 	}
4377 
4378 	return 0;
4379 }
4380 
4381 static int intel_iommu_attach_device(struct iommu_domain *domain,
4382 				     struct device *dev)
4383 {
4384 	int ret;
4385 
4386 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4387 	    device_is_rmrr_locked(dev)) {
4388 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4389 		return -EPERM;
4390 	}
4391 
4392 	/* Normally the device is not yet context-mapped. */
4393 	if (unlikely(domain_context_mapped(dev))) {
4394 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4395 
4396 		if (info->domain)
4397 			dmar_remove_one_dev_info(dev);
4398 	}
4399 
4400 	ret = prepare_domain_attach_device(domain, dev);
4401 	if (ret)
4402 		return ret;
4403 
4404 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4405 }
4406 
4407 static void intel_iommu_detach_device(struct iommu_domain *domain,
4408 				      struct device *dev)
4409 {
4410 	dmar_remove_one_dev_info(dev);
4411 }
4412 
4413 static int intel_iommu_map(struct iommu_domain *domain,
4414 			   unsigned long iova, phys_addr_t hpa,
4415 			   size_t size, int iommu_prot, gfp_t gfp)
4416 {
4417 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4418 	u64 max_addr;
4419 	int prot = 0;
4420 
4421 	if (iommu_prot & IOMMU_READ)
4422 		prot |= DMA_PTE_READ;
4423 	if (iommu_prot & IOMMU_WRITE)
4424 		prot |= DMA_PTE_WRITE;
4425 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4426 		prot |= DMA_PTE_SNP;
4427 
4428 	max_addr = iova + size;
4429 	if (dmar_domain->max_addr < max_addr) {
4430 		u64 end;
4431 
4432 		/* check if minimum agaw is sufficient for mapped address */
4433 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4434 		if (end < max_addr) {
4435 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4436 			       __func__, dmar_domain->gaw,
4437 			       max_addr);
4438 			return -EFAULT;
4439 		}
4440 		dmar_domain->max_addr = max_addr;
4441 	}
4442 	/* Round up size to next multiple of PAGE_SIZE, if it and
4443 	   the low bits of hpa would take us onto the next page */
4444 	size = aligned_nrpages(hpa, size);
4445 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4446 				hpa >> VTD_PAGE_SHIFT, size, prot);
4447 }
4448 
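/*
 * map_pages callback: accept only 4K, 2M and 1G page sizes with a
 * correspondingly aligned IOVA and physical address, then hand the whole
 * (pgsize * pgcount) range to intel_iommu_map() in one go.
 */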
4449 static int intel_iommu_map_pages(struct iommu_domain *domain,
4450 				 unsigned long iova, phys_addr_t paddr,
4451 				 size_t pgsize, size_t pgcount,
4452 				 int prot, gfp_t gfp, size_t *mapped)
4453 {
4454 	unsigned long pgshift = __ffs(pgsize);
4455 	size_t size = pgcount << pgshift;
4456 	int ret;
4457 
4458 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4459 		return -EINVAL;
4460 
4461 	if (!IS_ALIGNED(iova | paddr, pgsize))
4462 		return -EINVAL;
4463 
4464 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4465 	if (!ret && mapped)
4466 		*mapped = size;
4467 
4468 	return ret;
4469 }
4470 
4471 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4472 				unsigned long iova, size_t size,
4473 				struct iommu_iotlb_gather *gather)
4474 {
4475 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4476 	unsigned long start_pfn, last_pfn;
4477 	int level = 0;
4478 
4479 	/* Cope with horrid API which requires us to unmap more than the
4480 	   size argument if it happens to be a large-page mapping. */
4481 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4482 
4483 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4484 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4485 
4486 	start_pfn = iova >> VTD_PAGE_SHIFT;
4487 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4488 
4489 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4490 
4491 	if (dmar_domain->max_addr == iova + size)
4492 		dmar_domain->max_addr = iova;
4493 
4494 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4495 
4496 	return size;
4497 }
4498 
4499 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4500 				      unsigned long iova,
4501 				      size_t pgsize, size_t pgcount,
4502 				      struct iommu_iotlb_gather *gather)
4503 {
4504 	unsigned long pgshift = __ffs(pgsize);
4505 	size_t size = pgcount << pgshift;
4506 
4507 	return intel_iommu_unmap(domain, iova, size, gather);
4508 }
4509 
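/*
 * Flush the IOTLB for the range accumulated in @gather on every IOMMU the
 * domain is attached to, then release the page-table pages queued on the
 * gather freelist.
 */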
4510 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4511 				 struct iommu_iotlb_gather *gather)
4512 {
4513 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4514 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4515 	size_t size = gather->end - gather->start;
4516 	unsigned long start_pfn;
4517 	unsigned long nrpages;
4518 	int iommu_id;
4519 
4520 	nrpages = aligned_nrpages(gather->start, size);
4521 	start_pfn = mm_to_dma_pfn(iova_pfn);
4522 
4523 	for_each_domain_iommu(iommu_id, dmar_domain)
4524 		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4525 				      start_pfn, nrpages,
4526 				      list_empty(&gather->freelist), 0);
4527 
4528 	put_pages_list(&gather->freelist);
4529 }
4530 
4531 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4532 					    dma_addr_t iova)
4533 {
4534 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4535 	struct dma_pte *pte;
4536 	int level = 0;
4537 	u64 phys = 0;
4538 
4539 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4540 	if (pte && dma_pte_present(pte))
4541 		phys = dma_pte_addr(pte) +
4542 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4543 						VTD_PAGE_SHIFT) - 1));
4544 
4545 	return phys;
4546 }
4547 
4548 static bool intel_iommu_capable(enum iommu_cap cap)
4549 {
4550 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4551 		return domain_update_iommu_snooping(NULL);
4552 	if (cap == IOMMU_CAP_INTR_REMAP)
4553 		return irq_remapping_enabled == 1;
4554 
4555 	return false;
4556 }
4557 
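/*
 * Per-device probe hook: locate the IOMMU that covers @dev, allocate its
 * device_domain_info (bus/devfn/segment plus ATS, PASID and PRI
 * capabilities for PCI devices) and link it into device_domain_list.
 */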
4558 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4559 {
4560 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4561 	struct device_domain_info *info;
4562 	struct intel_iommu *iommu;
4563 	unsigned long flags;
4564 	u8 bus, devfn;
4565 
4566 	iommu = device_to_iommu(dev, &bus, &devfn);
4567 	if (!iommu)
4568 		return ERR_PTR(-ENODEV);
4569 
4570 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4571 	if (!info)
4572 		return ERR_PTR(-ENOMEM);
4573 
4574 	if (dev_is_real_dma_subdevice(dev)) {
4575 		info->bus = pdev->bus->number;
4576 		info->devfn = pdev->devfn;
4577 		info->segment = pci_domain_nr(pdev->bus);
4578 	} else {
4579 		info->bus = bus;
4580 		info->devfn = devfn;
4581 		info->segment = iommu->segment;
4582 	}
4583 
4584 	info->dev = dev;
4585 	info->iommu = iommu;
4586 	if (dev_is_pci(dev)) {
4587 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4588 		    pci_ats_supported(pdev) &&
4589 		    dmar_ats_supported(pdev, iommu))
4590 			info->ats_supported = 1;
4591 
4592 		if (sm_supported(iommu)) {
4593 			if (pasid_supported(iommu)) {
4594 				int features = pci_pasid_features(pdev);
4595 
4596 				if (features >= 0)
4597 					info->pasid_supported = features | 1;
4598 			}
4599 
4600 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4601 			    pci_pri_supported(pdev))
4602 				info->pri_supported = 1;
4603 		}
4604 	}
4605 
4606 	spin_lock_irqsave(&device_domain_lock, flags);
4607 	list_add(&info->global, &device_domain_list);
4608 	dev_iommu_priv_set(dev, info);
4609 	spin_unlock_irqrestore(&device_domain_lock, flags);
4610 
4611 	return &iommu->iommu;
4612 }
4613 
4614 static void intel_iommu_release_device(struct device *dev)
4615 {
4616 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4617 	unsigned long flags;
4618 
4619 	dmar_remove_one_dev_info(dev);
4620 
4621 	spin_lock_irqsave(&device_domain_lock, flags);
4622 	dev_iommu_priv_set(dev, NULL);
4623 	list_del(&info->global);
4624 	spin_unlock_irqrestore(&device_domain_lock, flags);
4625 
4626 	kfree(info);
4627 	set_dma_ops(dev, NULL);
4628 }
4629 
4630 static void intel_iommu_probe_finalize(struct device *dev)
4631 {
4632 	set_dma_ops(dev, NULL);
4633 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4634 }
4635 
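/*
 * Report reserved regions for @device: every RMRR that names the device
 * (or a device it sits behind), the 0-16MB window for ISA bridges when the
 * floppy workaround is built in, and the IOAPIC MSI range.
 */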
4636 static void intel_iommu_get_resv_regions(struct device *device,
4637 					 struct list_head *head)
4638 {
4639 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4640 	struct iommu_resv_region *reg;
4641 	struct dmar_rmrr_unit *rmrr;
4642 	struct device *i_dev;
4643 	int i;
4644 
4645 	down_read(&dmar_global_lock);
4646 	for_each_rmrr_units(rmrr) {
4647 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4648 					  i, i_dev) {
4649 			struct iommu_resv_region *resv;
4650 			enum iommu_resv_type type;
4651 			size_t length;
4652 
4653 			if (i_dev != device &&
4654 			    !is_downstream_to_pci_bridge(device, i_dev))
4655 				continue;
4656 
4657 			length = rmrr->end_address - rmrr->base_address + 1;
4658 
4659 			type = device_rmrr_is_relaxable(device) ?
4660 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4661 
4662 			resv = iommu_alloc_resv_region(rmrr->base_address,
4663 						       length, prot, type);
4664 			if (!resv)
4665 				break;
4666 
4667 			list_add_tail(&resv->list, head);
4668 		}
4669 	}
4670 	up_read(&dmar_global_lock);
4671 
4672 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4673 	if (dev_is_pci(device)) {
4674 		struct pci_dev *pdev = to_pci_dev(device);
4675 
4676 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4677 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4678 						   IOMMU_RESV_DIRECT_RELAXABLE);
4679 			if (reg)
4680 				list_add_tail(&reg->list, head);
4681 		}
4682 	}
4683 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4684 
4685 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4686 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4687 				      0, IOMMU_RESV_MSI);
4688 	if (!reg)
4689 		return;
4690 	list_add_tail(&reg->list, head);
4691 }
4692 
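/*
 * Enable PASID support for @dev: set the PASID-enable bit in the device's
 * context entry and invalidate the context cache, then enable the
 * device's dev-IOTLB/PASID features if they were not enabled already.
 */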
4693 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4694 {
4695 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4696 	struct context_entry *context;
4697 	struct dmar_domain *domain;
4698 	unsigned long flags;
4699 	u64 ctx_lo;
4700 	int ret;
4701 
4702 	domain = info->domain;
4703 	if (!domain)
4704 		return -EINVAL;
4705 
4706 	spin_lock_irqsave(&device_domain_lock, flags);
4707 	spin_lock(&iommu->lock);
4708 
4709 	ret = -EINVAL;
4710 	if (!info->pasid_supported)
4711 		goto out;
4712 
4713 	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4714 	if (WARN_ON(!context))
4715 		goto out;
4716 
4717 	ctx_lo = context[0].lo;
4718 
4719 	if (!(ctx_lo & CONTEXT_PASIDE)) {
4720 		ctx_lo |= CONTEXT_PASIDE;
4721 		context[0].lo = ctx_lo;
4722 		wmb();
4723 		iommu->flush.flush_context(iommu,
4724 					   domain->iommu_did[iommu->seq_id],
4725 					   PCI_DEVID(info->bus, info->devfn),
4726 					   DMA_CCMD_MASK_NOBIT,
4727 					   DMA_CCMD_DEVICE_INVL);
4728 	}
4729 
4730 	/* Enable PASID support in the device, if it wasn't already */
4731 	if (!info->pasid_enabled)
4732 		iommu_enable_dev_iotlb(info);
4733 
4734 	ret = 0;
4735 
4736  out:
4737 	spin_unlock(&iommu->lock);
4738 	spin_unlock_irqrestore(&device_domain_lock, flags);
4739 
4740 	return ret;
4741 }
4742 
4743 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4744 {
4745 	if (dev_is_pci(dev))
4746 		return pci_device_group(dev);
4747 	return generic_device_group(dev);
4748 }
4749 
4750 static int intel_iommu_enable_sva(struct device *dev)
4751 {
4752 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4753 	struct intel_iommu *iommu;
4754 	int ret;
4755 
4756 	if (!info || dmar_disabled)
4757 		return -EINVAL;
4758 
4759 	iommu = info->iommu;
4760 	if (!iommu)
4761 		return -EINVAL;
4762 
4763 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4764 		return -ENODEV;
4765 
4766 	if (intel_iommu_enable_pasid(iommu, dev))
4767 		return -ENODEV;
4768 
4769 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4770 		return -EINVAL;
4771 
4772 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4773 	if (!ret)
4774 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4775 
4776 	return ret;
4777 }
4778 
4779 static int intel_iommu_disable_sva(struct device *dev)
4780 {
4781 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4782 	struct intel_iommu *iommu = info->iommu;
4783 	int ret;
4784 
4785 	ret = iommu_unregister_device_fault_handler(dev);
4786 	if (!ret)
4787 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4788 
4789 	return ret;
4790 }
4791 
4792 static int intel_iommu_enable_iopf(struct device *dev)
4793 {
4794 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4795 
4796 	if (info && info->pri_supported)
4797 		return 0;
4798 
4799 	return -ENODEV;
4800 }
4801 
4802 static int
4803 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4804 {
4805 	switch (feat) {
4806 	case IOMMU_DEV_FEAT_IOPF:
4807 		return intel_iommu_enable_iopf(dev);
4808 
4809 	case IOMMU_DEV_FEAT_SVA:
4810 		return intel_iommu_enable_sva(dev);
4811 
4812 	default:
4813 		return -ENODEV;
4814 	}
4815 }
4816 
4817 static int
4818 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4819 {
4820 	switch (feat) {
4821 	case IOMMU_DEV_FEAT_IOPF:
4822 		return 0;
4823 
4824 	case IOMMU_DEV_FEAT_SVA:
4825 		return intel_iommu_disable_sva(dev);
4826 
4827 	default:
4828 		return -ENODEV;
4829 	}
4830 }
4831 
4832 static bool intel_iommu_is_attach_deferred(struct device *dev)
4833 {
4834 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4835 
4836 	return translation_pre_enabled(info->iommu) && !info->domain;
4837 }
4838 
4839 /*
4840  * Check that the device does not live on an external-facing PCI port that is
4841  * marked as untrusted. Such devices must not be allowed to apply quirks and
4842  * thereby bypass the IOMMU restrictions.
4843  */
4844 static bool risky_device(struct pci_dev *pdev)
4845 {
4846 	if (pdev->untrusted) {
4847 		pci_info(pdev,
4848 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4849 			 pdev->vendor, pdev->device);
4850 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4851 		return true;
4852 	}
4853 	return false;
4854 }
4855 
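/*
 * iotlb_sync_map callback: after new mappings have been installed, tell
 * every IOMMU in the domain about the affected PFN range so that any
 * required write-buffer flush or caching-mode invalidation can be issued.
 */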
4856 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4857 				       unsigned long iova, size_t size)
4858 {
4859 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4860 	unsigned long pages = aligned_nrpages(iova, size);
4861 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4862 	struct intel_iommu *iommu;
4863 	int iommu_id;
4864 
4865 	for_each_domain_iommu(iommu_id, dmar_domain) {
4866 		iommu = g_iommus[iommu_id];
4867 		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
4868 	}
4869 }
4870 
4871 const struct iommu_ops intel_iommu_ops = {
4872 	.capable		= intel_iommu_capable,
4873 	.domain_alloc		= intel_iommu_domain_alloc,
4874 	.probe_device		= intel_iommu_probe_device,
4875 	.probe_finalize		= intel_iommu_probe_finalize,
4876 	.release_device		= intel_iommu_release_device,
4877 	.get_resv_regions	= intel_iommu_get_resv_regions,
4878 	.put_resv_regions	= generic_iommu_put_resv_regions,
4879 	.device_group		= intel_iommu_device_group,
4880 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4881 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4882 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4883 	.def_domain_type	= device_def_domain_type,
4884 	.pgsize_bitmap		= SZ_4K,
4885 #ifdef CONFIG_INTEL_IOMMU_SVM
4886 	.sva_bind		= intel_svm_bind,
4887 	.sva_unbind		= intel_svm_unbind,
4888 	.sva_get_pasid		= intel_svm_get_pasid,
4889 	.page_response		= intel_svm_page_response,
4890 #endif
4891 	.default_domain_ops = &(const struct iommu_domain_ops) {
4892 		.attach_dev		= intel_iommu_attach_device,
4893 		.detach_dev		= intel_iommu_detach_device,
4894 		.map_pages		= intel_iommu_map_pages,
4895 		.unmap_pages		= intel_iommu_unmap_pages,
4896 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4897 		.flush_iotlb_all        = intel_flush_iotlb_all,
4898 		.iotlb_sync		= intel_iommu_tlb_sync,
4899 		.iova_to_phys		= intel_iommu_iova_to_phys,
4900 		.free			= intel_iommu_domain_free,
4901 	}
4902 };
4903 
4904 static void quirk_iommu_igfx(struct pci_dev *dev)
4905 {
4906 	if (risky_device(dev))
4907 		return;
4908 
4909 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4910 	dmar_map_gfx = 0;
4911 }
4912 
4913 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4915 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4916 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4917 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4918 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4919 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4921 
4922 /* Broadwell igfx malfunctions with dmar */
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4927 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4928 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4936 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4945 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4947 
4948 static void quirk_iommu_rwbf(struct pci_dev *dev)
4949 {
4950 	if (risky_device(dev))
4951 		return;
4952 
4953 	/*
4954 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4955 	 * but needs it. Same seems to hold for the desktop versions.
4956 	 */
4957 	pci_info(dev, "Forcing write-buffer flush capability\n");
4958 	rwbf_quirk = 1;
4959 }
4960 
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4968 
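/*
 * GGC is the graphics control register in these host bridges' PCI config
 * space; the field masked below reports how much GTT stolen memory the
 * BIOS set aside and whether it left room for the VT-d shadow GTT.
 */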
4969 #define GGC 0x52
4970 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4971 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4972 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4973 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4974 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4975 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4976 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4977 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4978 
4979 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4980 {
4981 	unsigned short ggc;
4982 
4983 	if (risky_device(dev))
4984 		return;
4985 
4986 	if (pci_read_config_word(dev, GGC, &ggc))
4987 		return;
4988 
4989 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4990 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4991 		dmar_map_gfx = 0;
4992 	} else if (dmar_map_gfx) {
4993 		/* we have to ensure the gfx device is idle before we flush */
4994 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4995 		iommu_set_dma_strict();
4996 	}
4997 }
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5002 
5003 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5004 {
5005 	unsigned short ver;
5006 
5007 	if (!IS_GFX_DEVICE(dev))
5008 		return;
5009 
5010 	ver = (dev->device >> 8) & 0xff;
5011 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5012 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5013 	    ver != 0x9a)
5014 		return;
5015 
5016 	if (risky_device(dev))
5017 		return;
5018 
5019 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5020 	iommu_skip_te_disable = 1;
5021 }
5022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5023 
5024 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5025    ISOCH DMAR unit for the Azalia sound device, but not give it any
5026    TLB entries, which causes it to deadlock. Check for that.  We do
5027    this in a function called from init_dmars(), instead of in a PCI
5028    quirk, because we don't want to print the obnoxious "BIOS broken"
5029    message if VT-d is actually disabled.
5030 */
5031 static void __init check_tylersburg_isoch(void)
5032 {
5033 	struct pci_dev *pdev;
5034 	uint32_t vtisochctrl;
5035 
5036 	/* If there's no Azalia in the system anyway, forget it. */
5037 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5038 	if (!pdev)
5039 		return;
5040 
5041 	if (risky_device(pdev)) {
5042 		pci_dev_put(pdev);
5043 		return;
5044 	}
5045 
5046 	pci_dev_put(pdev);
5047 
5048 	/* System Management Registers. Might be hidden, in which case
5049 	   we can't do the sanity check. But that's OK, because the
5050 	   known-broken BIOSes _don't_ actually hide it, so far. */
5051 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5052 	if (!pdev)
5053 		return;
5054 
5055 	if (risky_device(pdev)) {
5056 		pci_dev_put(pdev);
5057 		return;
5058 	}
5059 
5060 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5061 		pci_dev_put(pdev);
5062 		return;
5063 	}
5064 
5065 	pci_dev_put(pdev);
5066 
5067 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5068 	if (vtisochctrl & 1)
5069 		return;
5070 
5071 	/* Drop all bits other than the number of TLB entries */
5072 	vtisochctrl &= 0x1c;
5073 
5074 	/* If we have the recommended number of TLB entries (16), fine. */
5075 	if (vtisochctrl == 0x10)
5076 		return;
5077 
5078 	/* Zero TLB entries? You get to ride the short bus to school. */
5079 	/* Zero TLB entries? Warn loudly and force identity mapping for Azalia. */
5080 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5081 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5082 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5083 		     dmi_get_system_info(DMI_BIOS_VERSION),
5084 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5085 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5086 		return;
5087 	}
5088 
5089 	pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5090 	       vtisochctrl);
5091 }
5092